diff --git a/CHANGELOG b/CHANGELOG index 565331646..2c372ed82 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -26,6 +26,7 @@ Motions: - Bugfix: Added more distance in motion PDF for DEL-tags in new lines [#3211]. - Added warning message if an edit dialog was already opened by another client [#3212]. +- Reworked DOCX export parser and added comments to DOCX [#3258]. Users: - User without permission to see users can now see agenda item speakers, diff --git a/openslides/core/static/js/core/docx.js b/openslides/core/static/js/core/docx.js new file mode 100644 index 000000000..7ec74b3bd --- /dev/null +++ b/openslides/core/static/js/core/docx.js @@ -0,0 +1,356 @@ +(function () { + +'use strict'; + +angular.module('OpenSlidesApp.core.docx', []) + +.factory('Html2DocxConverter', [ + '$q', + 'ImageConverter', + function ($q, ImageConverter) { + var PAGEBREAK = ''; + + var createInstance = function () { + var converter = { + imageMap: {}, + documentImages: [], + relationships: [], + contentTypes: [], + }; + + var html2docx = function (html) { + var docx = ''; + var tagStack = []; + + // With this variable, we keep track, if we are currently inside or outside of a paragraph. + var inParagraph = true; + // the text may not begin with a paragraph. If so, append one because word needs it. + var skipFirstParagraphClosing = true; + + var handleTag = function (tag) { + if (tag.charAt(0) == "/") { // A closing tag + // remove from stack + tagStack.pop(); + + // Special: end paragraphs + if (tag.startsWith('/p')) { + docx += ''; + inParagraph = false; + } + } else { // now all other tags + var tagname = tag.split(' ')[0]; + handleNamedTag(tagname, tag); + } + return docx; + }; + var handleNamedTag = function (tagname, fullTag) { + var tag = { + tag: tagname, + attrs: {}, + }; + switch (tagname) { + case 'p': + if (inParagraph && !skipFirstParagraphClosing) { + // End the paragrapth, if there is one + docx += ''; + } + skipFirstParagraphClosing = false; + docx += ''; + inParagraph = true; + break; + case 'span': + var styleRegex = /(?:\"|\;\s?)([a-zA-z\-]+)\:\s?([a-zA-Z0-9\-\#]+)/g, matchSpan; + while ((matchSpan = styleRegex.exec(fullTag)) !== null) { + switch (matchSpan[1]) { + case 'color': + tag.attrs.color = matchSpan[2].slice(1); // cut off the # + break; + case 'background-color': + tag.attrs.backgroundColor = matchSpan[2].slice(1); // cut off the # + break; + case 'text-decoration': + if (matchSpan[2] === 'underline') { + tag.attrs.underline = true; + } else if (matchSpan[2] === 'line-through') { + tag.attrs.strike = true; + } + break; + } + } + break; + case 'a': + var hrefRegex = /href="([^"]+)"/g; + var href = hrefRegex.exec(fullTag)[1]; + tag.href = href; + break; + case 'img': + imageTag(tag, fullTag); + break; + } + if (tagname !== 'img' && tagname !== 'p') { + tagStack.push(tag); + } + }; + var imageTag = function (tag, fullTag) { + // images has to be placed instantly, so there is no use of 'tag'. + var image = {}; + var attributeRegex = /(\w+)=\"([^\"]*)\"/g, attributeMatch; + while ((attributeMatch = attributeRegex.exec(fullTag)) !== null) { + image[attributeMatch[1]] = attributeMatch[2]; + } + if (image.src && converter.imageMap[image.src]) { + image.width = converter.imageMap[image.src].width; + image.height = converter.imageMap[image.src].height; + + var rrId = converter.relationships.length + 1; + var imageId = converter.documentImages.length + 1; + + // set name ('pic.jpg'), title, ext ('jpg'), mime ('image/jpeg') + image.name = _.last(image.src.split('/')); + + var tmp = image.name.split('.'); + image.ext = tmp.splice(-1); + + // set name without extension as title if there isn't a title + if (!image.title) { + image.title = tmp.join('.'); + } + + image.mime = 'image/' + image.ext; + if (image.ext == 'jpe' || image.ext == 'jpg') { + image.mime = 'image/jpeg'; + } + + // x and y for the container and picture size in EMU (assuming 96dpi)! + var x = image.width * 914400 / 96; + var y = image.height * 914400 / 96; + + // the image does not belong into a paragraph in ooxml + if (inParagraph) { + docx += ''; + } + docx += '' + + '' + + '' + + '' + + '' + + '' + + ''; + + // inParagraph stays untouched, the documents paragraph state is restored here + if (inParagraph) { + docx += ''; + } + + // entries in documentImages, relationships and contentTypes + converter.documentImages.push({ + src: image.src, + zipPath: 'word/media/' + image.name + }); + converter.relationships.push({ + Id: 'rrId' + rrId, + Type: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image', + Target: 'media/' + image.name + }); + converter.contentTypes.push({ + PartName: '/word/media/' + image.name, + ContentType: image.mime + }); + } + }; + var handleText = function (text) { + // Start a new paragraph, if only loose text is there + if (!inParagraph) { + docx += ''; + inParagraph = true; + } + var docxPart = ''; + var hyperlink = false; + tagStack.forEach(function (tag) { + switch (tag.tag) { + case 'b': + case 'strong': + docxPart += ''; + break; + case 'em': + case 'i': + docxPart += ''; + break; + case 'span': + for (var key in tag.attrs) { + switch (key) { + case 'color': + docxPart += ''; + break; + case 'backgroundColor': + docxPart += ''; + break; + case 'underline': + docxPart += ''; + break; + case 'strike': + docxPart += ''; + break; + } + } + break; + case 'u': + docxPart += ''; + break; + case 'strike': + docxPart += ''; + break; + case 'a': + var id = converter.relationships.length + 1; + docxPart = '' + docxPart; + docxPart += ''; + converter.relationships.push({ + Id: 'rrId' + id, + Type: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink', + Target: tag.href, + TargetMode: 'External' + }); + hyperlink = true; + break; + } + }); + docxPart += '' + text + ''; + if (hyperlink) { + docxPart += ''; + } + + // append to docx + docx += docxPart; + return docx; + }; + + var replaceEntities = function () { + // replacing of special symbols: + docx = docx.replace(new RegExp('\ä\;', 'g'), 'ä'); + docx = docx.replace(new RegExp('\ü\;', 'g'), 'ü'); + docx = docx.replace(new RegExp('\ö\;', 'g'), 'ö'); + docx = docx.replace(new RegExp('\Ä\;', 'g'), 'Ä'); + docx = docx.replace(new RegExp('\Ü\;', 'g'), 'Ü'); + docx = docx.replace(new RegExp('\Ö\;', 'g'), 'Ö'); + docx = docx.replace(new RegExp('\ß\;', 'g'), 'ß'); + docx = docx.replace(new RegExp('\ \;', 'g'), ' '); + docx = docx.replace(new RegExp('\§\;', 'g'), '§'); + + // remove all entities except gt, lt and amp + var entityRegex = /\&(?!gt|lt|amp)\w+\;/g, matchEntry, indexes = []; + while ((matchEntry = entityRegex.exec(docx)) !== null) { + indexes.push({ + startId: matchEntry.index, + stopId: matchEntry.index + matchEntry[0].length + }); + } + for (var i = indexes.length - 1; i>=0; i--) { + docx = docx.substring(0, indexes[i].startId) + docx.substring(indexes[i].stopId, docx.length); + } + }; + + var parse = function () { + if (html.substring(0,3) != '

') { + docx += ''; + skipFirstParagraphClosing = false; + } + html = html.split(/(<|>)/g); + // remove whitespaces and > brackets. Leave < brackets in there to check, whether + // the following string is a tag or text. + html = _.filter(html, function (part) { + var skippedCharsRegex = new RegExp('^([\s\n\r]|>)*$', 'gm'); + return !skippedCharsRegex.test(part); + }); + + for (var i = 0; i < html.length; i++) { + if (html[i] === '<') { + i++; + handleTag(html[i]); + } else { + handleText(html[i]); + } + } + // for finishing close the last paragraph (if open) + if (inParagraph) { + docx += ''; + } + + replaceEntities(); + + return docx; + }; + + return parse(); + }; + + // return a wrapper function for html2docx, that fetches all the images. + converter.html2docx = function (html) { + var imageSources = _.map($(html).find('img'), function (element) { + return element.getAttribute('src'); + }); + // Don't get images multiple times; just if the converter has not seen them befor. + imageSources = _.filter(imageSources, function (src) { + return !converter.imageMap[src]; + }); + return $q(function (resolve) { + ImageConverter.toBase64(imageSources).then(function (_imageMap) { + _.forEach(_imageMap, function (value, key) { + converter.imageMap[key] = value; + }); + var docx = html2docx(html); + resolve(docx); + }); + }); + }; + + converter.updateZipFile = function (zip) { + var updateRelationships = function (oldContent) { + var content = oldContent.split('\n'); + _.forEach(converter.relationships, function (relationship) { + content[1] += ')/g); - - html.forEach(function (part) { - if (part !== '' && part != '\n' && part != '<' && part != '>') { - if (isTag) { - if (part.startsWith('p')) { /** p **/ - // Special: begin new paragraph (only if its the first): - if (hasParagraph && !skipFirstParagraphClosing) { - // End, if there is one - docx += ''; - } - skipFirstParagraphClosing = false; - docx += ''; - hasParagraph = true; - } else if (part.startsWith('/p')) { - // Special: end paragraph: - docx += ''; - hasParagraph = false; - - } else if (part.charAt(0) == "/") { - // remove from stack - stack.pop(); - } else { // now all other tags - var tag = {}; - if (_.indexOf(TAGS_NO_PARAM, part) > -1) { /** b, strong, em, i **/ - stack.push({tag: part}); - } else if (part.startsWith('span')) { /** span **/ - tag = {tag: 'span', attrs: {}}; - var rStyle = /(?:\"|\;\s?)([a-zA-z\-]+)\:\s?([a-zA-Z0-9\-\#]+)/g, matchSpan; - while ((matchSpan = rStyle.exec(part)) !== null) { - switch (matchSpan[1]) { - case 'color': - tag.attrs.color = matchSpan[2].slice(1); // cut off the # - break; - case 'background-color': - tag.attrs.backgroundColor = matchSpan[2].slice(1); // cut off the # - break; - case 'text-decoration': - if (matchSpan[2] === 'underline') { - tag.attrs.underline = true; - } else if (matchSpan[2] === 'line-through') { - tag.attrs.strike = true; - } - break; - } - } - stack.push(tag); - } else if (part.startsWith('a')) { /** a **/ - var rHref = /href="([^"]+)"/g; - var href = rHref.exec(part)[1]; - tag = {tag: 'a', href: href}; - stack.push(tag); - } else if (part.startsWith('img')) { - // images has to be placed instantly, so there is no use of 'tag'. - var img = {}, rImg = /(\w+)=\"([^\"]*)\"/g, matchImg; - while ((matchImg = rImg.exec(part)) !== null) { - img[matchImg[1]] = matchImg[2]; - } - - // With and height and source have to be given! - if (img.width && img.height && img.src) { - var rrId = relationships.length + 1; - var imgId = images.length + 1; - - // set name ('pic.jpg'), title, ext ('jpg'), mime ('image/jpeg') - img.name = img.src.split('/'); - img.name = _.last(img.name); - - var tmp = img.name.split('.'); - // set name without extension as title if there isn't a title - if (!img.title) { - img.title = tmp[0]; - } - img.ext = tmp[1]; - - img.mime = 'image/' + img.ext; - if (img.ext == 'jpe' || img.ext == 'jpg') { - img.mime = 'image/jpeg'; - } - - // x and y for the container and picture size in EMU (assuming 96dpi)! - var x = img.width * 914400 / 96; - var y = img.height * 914400 / 96; - - // Own paragraph for the image - if (hasParagraph) { - docx += ''; - } - docx += '' + - '' + - '' + - '' + - '' + - '' + - ''; - - // hasParagraph stays untouched, the documents paragraph state is restored here - if (hasParagraph) { - docx += ''; - } - - // entries in images, relationships and contentTypes - images.push({ - url: img.src, - zipPath: 'word/media/' + img.name - }); - relationships.push({ - Id: 'rrId' + rrId, - Type: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image', - Target: 'media/' + img.name - }); - contentTypes.push({ - PartName: '/word/media/' + img.name, - ContentType: img.mime - }); - } - } - } - } else { /** No tag **/ - if (!hasParagraph) { - docx += ''; - hasParagraph = true; - } - var docx_part = ''; - var hyperlink = false; - stack.forEach(function (tag) { - switch (tag.tag) { - case 'b': case 'strong': - docx_part += ''; - break; - case 'em': case 'i': - docx_part += ''; - break; - case 'span': - for (var key in tag.attrs) { - switch (key) { - case 'color': - docx_part += ''; - break; - case 'backgroundColor': - docx_part += ''; - break; - case 'underline': - docx_part += ''; - break; - case 'strike': - docx_part += ''; - break; - } - } - break; - case 'a': - var id = relationships.length + 1; - docx_part = '' + docx_part; - docx_part += ''; // necessary? - relationships.push({ - Id: 'rrId' + id, - Type: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink', - Target: tag.href, - TargetMode: 'External' - }); - hyperlink = true; - break; - } + // Data for one motions. Must include translations, ... + var motionData = { + // Translations + motion_translation: translation, + sequential_translation: sequential_translation, + submitters_translation: submitters_translation, + reason_translation: reason.length === 0 ? '' : reason_translation, + status_translation: status_translation, + comment_translation: comments.length === 0 ? '' : comment_translation, + // Actual data + id: motion.id, + identifier: motion.identifier, + title: motion.getTitle(), + submitters: _.map(motion.submitters, function (submitter) { + return submitter.get_full_name(); + }).join(', '), + status: motion.getStateName(), + // Miscellaneous stuff + preamble: gettextCatalog.getString(Config.get('motions_preamble').value), + pagebreak: PAGEBREAK, + }; + // converting html to docx is async, so text, reason and comments are inserted here. + return $q(function (resolve) { + var convertPromises = _.map(comments, function (comment) { + return converter.html2docx(comment.comment).then(function (commentAsDocx) { + comment.comment = commentAsDocx; }); - docx_part += '' + part + ''; - if (hyperlink) { - docx_part += ''; - } - - // append to docx - docx += docx_part; - } - isTag = !isTag; - } - if (part === '' || part == '\n') { - // just if two tags following eachother: --> ...,'>', '', '<',... - // or there is a line break between: \n --> ...,'>', '\n', '<',... - isTag = !isTag; - } - }); - - // for finishing close the last paragraph (if open) - if (hasParagraph) { - docx += ''; - } - - // replacing of special symbols: - docx = docx.replace(new RegExp('\ä\;', 'g'), 'ä'); - docx = docx.replace(new RegExp('\ü\;', 'g'), 'ü'); - docx = docx.replace(new RegExp('\ö\;', 'g'), 'ö'); - docx = docx.replace(new RegExp('\Ä\;', 'g'), 'Ä'); - docx = docx.replace(new RegExp('\Ü\;', 'g'), 'Ü'); - docx = docx.replace(new RegExp('\Ö\;', 'g'), 'Ö'); - docx = docx.replace(new RegExp('\ß\;', 'g'), 'ß'); - docx = docx.replace(new RegExp('\ \;', 'g'), ' '); - docx = docx.replace(new RegExp('\§\;', 'g'), '§'); - - // remove all entities except gt, lt and amp - var rEntity = /\&(?!gt|lt|amp)\w+\;/g, matchEntry, indexes = []; - while ((matchEntry = rEntity.exec(docx)) !== null) { - indexes.push({ - startId: matchEntry.index, - stopId: matchEntry.index + matchEntry[0].length + }); + convertPromises.push(converter.html2docx(text).then(function (textAsDocx) { + motionData.text = textAsDocx; + })); + convertPromises.push(converter.html2docx(reason).then(function (reasonAsDocx) { + motionData.reason = reasonAsDocx; + })); + $q.all(convertPromises).then(function () { + motionData.comments = comments; + resolve(motionData); + }); }); - } - for (var i = indexes.length - 1; i>=0; i--) { - docx = docx.substring(0, indexes[i].startId) + docx.substring(indexes[i].stopId, docx.length); - } + }); + // resolve, if all motion data is fetched. + return $q(function (resolve) { + $q.all(promises).then(function (data) { + if (data.length) { + // clear pagebreak on last element + data[data.length - 1].pagebreak = ''; + } + resolve(data); + }); + }); + }; - return docx; - }; - var updateRelationships = function (oldContent) { - var content = oldContent.split('\n'); - relationships.forEach(function (rel) { - content[1] += '