diff options
author | Thomas Deutschmann <whissi@gentoo.org> | 2021-03-30 10:59:39 +0200 |
---|---|---|
committer | Thomas Deutschmann <whissi@gentoo.org> | 2021-04-01 00:04:14 +0200 |
commit | 5ff1d6955496b3cf9a35042c9ac35db43bc336b1 (patch) | |
tree | 6d470f7eb448f59f53e8df1010aec9dad8ce1f72 /extract/src/docx.c | |
parent | Import Ghostscript 9.53.1 (diff) | |
download | ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.gz ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.bz2 ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.zip |
Import Ghostscript 9.54ghostscript-9.54
Signed-off-by: Thomas Deutschmann <whissi@gentoo.org>
Diffstat (limited to 'extract/src/docx.c')
-rw-r--r-- | extract/src/docx.c | 1097 |
1 files changed, 1097 insertions, 0 deletions
diff --git a/extract/src/docx.c b/extract/src/docx.c new file mode 100644 index 00000000..238e81d4 --- /dev/null +++ b/extract/src/docx.c @@ -0,0 +1,1097 @@ +/* These extract_docx_*() functions generate docx content and docx zip archive +data. + +Caller must call things in a sensible order to create valid content - +e.g. don't call docx_paragraph_start() twice without intervening call to +docx_paragraph_finish(). */ + +#include "../include/extract.h" + +#include "docx_template.h" + +#include "astring.h" +#include "document.h" +#include "docx.h" +#include "mem.h" +#include "memento.h" +#include "outf.h" +#include "zip.h" + +#include <assert.h> +#include <errno.h> +#include <math.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <sys/stat.h> + + +static int extract_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content) +{ + return extract_astring_cat(alloc, content, "\n\n<w:p>"); +} + +static int extract_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content) +{ + return extract_astring_cat(alloc, content, "\n</w:p>"); +} + +static int extract_docx_run_start( + extract_alloc_t* alloc, + extract_astring_t* content, + const char* font_name, + double font_size, + int bold, + int italic + ) +/* Starts a new run. Caller must ensure that extract_docx_run_finish() was +called to terminate any previous run. */ +{ + int e = 0; + if (!e) e = extract_astring_cat(alloc, content, "\n<w:r><w:rPr><w:rFonts w:ascii=\""); + if (!e) e = extract_astring_cat(alloc, content, font_name); + if (!e) e = extract_astring_cat(alloc, content, "\" w:hAnsi=\""); + if (!e) e = extract_astring_cat(alloc, content, font_name); + if (!e) e = extract_astring_cat(alloc, content, "\"/>"); + if (!e && bold) e = extract_astring_cat(alloc, content, "<w:b/>"); + if (!e && italic) e = extract_astring_cat(alloc, content, "<w:i/>"); + { + char font_size_text[32]; + if (0) font_size = 10; + + if (!e) e = extract_astring_cat(alloc, content, "<w:sz w:val=\""); + snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 2); + extract_astring_cat(alloc, content, font_size_text); + extract_astring_cat(alloc, content, "\"/>"); + + if (!e) e = extract_astring_cat(alloc, content, "<w:szCs w:val=\""); + snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 1.5); + extract_astring_cat(alloc, content, font_size_text); + extract_astring_cat(alloc, content, "\"/>"); + } + if (!e) e = extract_astring_cat(alloc, content, "</w:rPr><w:t xml:space=\"preserve\">"); + return e; + +} + +static int extract_docx_run_finish(extract_alloc_t* alloc, extract_astring_t* content) +{ + return extract_astring_cat(alloc, content, "</w:t></w:r>"); +} + +static int extract_docx_char_append_string(extract_alloc_t* alloc, extract_astring_t* content, const char* text) +{ + return extract_astring_cat(alloc, content, text); +} + +static int extract_docx_char_append_stringf(extract_alloc_t* alloc, extract_astring_t* content, const char* format, ...) +{ + char* buffer = NULL; + int e; + va_list va; + va_start(va, format); + e = extract_vasprintf(alloc, &buffer, format, va); + va_end(va); + if (e < 0) return e; + e = extract_astring_cat(alloc, content, buffer); + extract_free(alloc, &buffer); + return e; +} + +static int extract_docx_char_append_char(extract_alloc_t* alloc, extract_astring_t* content, char c) +{ + return extract_astring_catc(alloc, content, c); +} + +static int extract_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content) +/* Append an empty paragraph to *content. */ +{ + int e = -1; + if (extract_docx_paragraph_start(alloc, content)) goto end; + /* It seems like our choice of font size here doesn't make any difference + to the ammount of vertical space, unless we include a non-space + character. Presumably something to do with the styles in the template + document. */ + if (extract_docx_run_start( + alloc, + content, + "OpenSans", + 10 /*font_size*/, + 0 /*font_bold*/, + 0 /*font_italic*/ + )) goto end; + //docx_char_append_string(content, " "); /*   is non-break space. */ + if (extract_docx_run_finish(alloc, content)) goto end; + if (extract_docx_paragraph_finish(alloc, content)) goto end; + e = 0; + end: + return e; +} + + +/* Removes last <len> chars. */ +static int docx_char_truncate(extract_astring_t* content, int len) +{ + assert((size_t) len <= content->chars_num); + content->chars_num -= len; + content->chars[content->chars_num] = 0; + return 0; +} + +static int extract_docx_char_truncate_if(extract_astring_t* content, char c) +/* Removes last char if it is <c>. */ +{ + if (content->chars_num && content->chars[content->chars_num-1] == c) { + docx_char_truncate(content, 1); + } + return 0; +} + + +static double matrices_to_font_size(matrix_t* ctm, matrix_t* trm) +{ + double font_size = matrix_expansion(*trm) + * matrix_expansion(*ctm); + /* Round font_size to nearest 0.01. */ + font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f; + return font_size; +} + +typedef struct +{ + const char* font_name; + double font_size; + int font_bold; + int font_italic; + matrix_t* ctm_prev; +} content_state_t; +/* Used to keep track of font information when writing paragraphs of docx +content, e.g. so we know whether a font has changed so need to start a new docx +span. */ + + +static int extract_document_to_docx_content_paragraph( + extract_alloc_t* alloc, + content_state_t* state, + paragraph_t* paragraph, + extract_astring_t* content + ) +/* Append docx xml for <paragraph> to <content>. Updates *state if we change +font. */ +{ + int e = -1; + int l; + if (extract_docx_paragraph_start(alloc, content)) goto end; + + for (l=0; l<paragraph->lines_num; ++l) { + line_t* line = paragraph->lines[l]; + int s; + for (s=0; s<line->spans_num; ++s) { + int si; + span_t* span = line->spans[s]; + double font_size_new; + state->ctm_prev = &span->ctm; + font_size_new = matrices_to_font_size(&span->ctm, &span->trm); + if (!state->font_name + || strcmp(span->font_name, state->font_name) + || span->font_bold != state->font_bold + || span->font_italic != state->font_italic + || font_size_new != state->font_size + ) { + if (state->font_name) { + if (extract_docx_run_finish(alloc, content)) goto end; + } + state->font_name = span->font_name; + state->font_bold = span->font_bold; + state->font_italic = span->font_italic; + state->font_size = font_size_new; + if (extract_docx_run_start( + alloc, + content, + state->font_name, + state->font_size, + state->font_bold, + state->font_italic + )) goto end; + } + + for (si=0; si<span->chars_num; ++si) { + char_t* char_ = &span->chars[si]; + int c = char_->ucs; + + if (0) {} + + /* Escape XML special characters. */ + else if (c == '<') extract_docx_char_append_string(alloc, content, "<"); + else if (c == '>') extract_docx_char_append_string(alloc, content, ">"); + else if (c == '&') extract_docx_char_append_string(alloc, content, "&"); + else if (c == '"') extract_docx_char_append_string(alloc, content, """); + else if (c == '\'') extract_docx_char_append_string(alloc, content, "'"); + + /* Expand ligatures. */ + else if (c == 0xFB00) { + if (extract_docx_char_append_string(alloc, content, "ff")) goto end; + } + else if (c == 0xFB01) { + if (extract_docx_char_append_string(alloc, content, "fi")) goto end; + } + else if (c == 0xFB02) { + if (extract_docx_char_append_string(alloc, content, "fl")) goto end; + } + else if (c == 0xFB03) { + if (extract_docx_char_append_string(alloc, content, "ffi")) goto end; + } + else if (c == 0xFB04) { + if (extract_docx_char_append_string(alloc, content, "ffl")) goto end; + } + + /* Output ASCII verbatim. */ + else if (c >= 32 && c <= 127) { + if (extract_docx_char_append_char(alloc, content, (char) c)) goto end; + } + + /* Escape all other characters. */ + else { + char buffer[32]; + snprintf(buffer, sizeof(buffer), "&#x%x;", c); + if (extract_docx_char_append_string(alloc, content, buffer)) goto end; + } + } + /* Remove any trailing '-' at end of line. */ + if (extract_docx_char_truncate_if(content, '-')) goto end; + } + } + if (state->font_name) { + if (extract_docx_run_finish(alloc, content)) goto end; + state->font_name = NULL; + } + if (extract_docx_paragraph_finish(alloc, content)) goto end; + + e = 0; + + end: + return e; +} + +static int extract_document_append_image( + extract_alloc_t* alloc, + extract_astring_t* content, + image_t* image + ) +/* Write reference to image into docx content. */ +{ + extract_docx_char_append_string(alloc, content, "\n"); + extract_docx_char_append_string(alloc, content, " <w:p>\n"); + extract_docx_char_append_string(alloc, content, " <w:r>\n"); + extract_docx_char_append_string(alloc, content, " <w:rPr>\n"); + extract_docx_char_append_string(alloc, content, " <w:noProof/>\n"); + extract_docx_char_append_string(alloc, content, " </w:rPr>\n"); + extract_docx_char_append_string(alloc, content, " <w:drawing>\n"); + extract_docx_char_append_string(alloc, content, " <wp:inline distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" wp14:anchorId=\"7057A832\" wp14:editId=\"466EB3FB\">\n"); + extract_docx_char_append_string(alloc, content, " <wp:extent cx=\"2933700\" cy=\"2200275\"/>\n"); + extract_docx_char_append_string(alloc, content, " <wp:effectExtent l=\"0\" t=\"0\" r=\"0\" b=\"9525\"/>\n"); + extract_docx_char_append_string(alloc, content, " <wp:docPr id=\"1\" name=\"Picture 1\"/>\n"); + extract_docx_char_append_string(alloc, content, " <wp:cNvGraphicFramePr>\n"); + extract_docx_char_append_string(alloc, content, " <a:graphicFrameLocks xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\" noChangeAspect=\"1\"/>\n"); + extract_docx_char_append_string(alloc, content, " </wp:cNvGraphicFramePr>\n"); + extract_docx_char_append_string(alloc, content, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n"); + extract_docx_char_append_string(alloc, content, " <a:graphicData uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n"); + extract_docx_char_append_string(alloc, content, " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n"); + extract_docx_char_append_string(alloc, content, " <pic:nvPicPr>\n"); + extract_docx_char_append_string(alloc, content, " <pic:cNvPr id=\"1\" name=\"Picture 1\"/>\n"); + extract_docx_char_append_string(alloc, content, " <pic:cNvPicPr>\n"); + extract_docx_char_append_string(alloc, content, " <a:picLocks noChangeAspect=\"1\" noChangeArrowheads=\"1\"/>\n"); + extract_docx_char_append_string(alloc, content, " </pic:cNvPicPr>\n"); + extract_docx_char_append_string(alloc, content, " </pic:nvPicPr>\n"); + extract_docx_char_append_string(alloc, content, " <pic:blipFill>\n"); + extract_docx_char_append_stringf(alloc, content," <a:blip r:embed=\"%s\">\n", image->id); + extract_docx_char_append_string(alloc, content, " <a:extLst>\n"); + extract_docx_char_append_string(alloc, content, " <a:ext uri=\"{28A0092B-C50C-407E-A947-70E740481C1C}\">\n"); + extract_docx_char_append_string(alloc, content, " <a14:useLocalDpi xmlns:a14=\"http://schemas.microsoft.com/office/drawing/2010/main\" val=\"0\"/>\n"); + extract_docx_char_append_string(alloc, content, " </a:ext>\n"); + extract_docx_char_append_string(alloc, content, " </a:extLst>\n"); + extract_docx_char_append_string(alloc, content, " </a:blip>\n"); + //extract_docx_char_append_string(alloc, content, " <a:srcRect/>\n"); + extract_docx_char_append_string(alloc, content, " <a:stretch>\n"); + extract_docx_char_append_string(alloc, content, " <a:fillRect/>\n"); + extract_docx_char_append_string(alloc, content, " </a:stretch>\n"); + extract_docx_char_append_string(alloc, content, " </pic:blipFill>\n"); + extract_docx_char_append_string(alloc, content, " <pic:spPr bwMode=\"auto\">\n"); + extract_docx_char_append_string(alloc, content, " <a:xfrm>\n"); + extract_docx_char_append_string(alloc, content, " <a:off x=\"0\" y=\"0\"/>\n"); + extract_docx_char_append_string(alloc, content, " <a:ext cx=\"2933700\" cy=\"2200275\"/>\n"); + extract_docx_char_append_string(alloc, content, " </a:xfrm>\n"); + extract_docx_char_append_string(alloc, content, " <a:prstGeom prst=\"rect\">\n"); + extract_docx_char_append_string(alloc, content, " <a:avLst/>\n"); + extract_docx_char_append_string(alloc, content, " </a:prstGeom>\n"); + extract_docx_char_append_string(alloc, content, " <a:noFill/>\n"); + extract_docx_char_append_string(alloc, content, " <a:ln>\n"); + extract_docx_char_append_string(alloc, content, " <a:noFill/>\n"); + extract_docx_char_append_string(alloc, content, " </a:ln>\n"); + extract_docx_char_append_string(alloc, content, " </pic:spPr>\n"); + extract_docx_char_append_string(alloc, content, " </pic:pic>\n"); + extract_docx_char_append_string(alloc, content, " </a:graphicData>\n"); + extract_docx_char_append_string(alloc, content, " </a:graphic>\n"); + extract_docx_char_append_string(alloc, content, " </wp:inline>\n"); + extract_docx_char_append_string(alloc, content, " </w:drawing>\n"); + extract_docx_char_append_string(alloc, content, " </w:r>\n"); + extract_docx_char_append_string(alloc, content, " </w:p>\n"); + extract_docx_char_append_string(alloc, content, "\n"); + return 0; +} + + +static int extract_document_output_rotated_paragraphs( + extract_alloc_t* alloc, + page_t* page, + int paragraph_begin, + int paragraph_end, + int rot, + int x, + int y, + int w, + int h, + int text_box_id, + extract_astring_t* content, + content_state_t* state + ) +/* Writes paragraph to content inside rotated text box. */ +{ + int e = 0; + int p; + outf("x,y=%ik,%ik = %i,%i", x/1000, y/1000, x, y); + extract_docx_char_append_string(alloc, content, "\n"); + extract_docx_char_append_string(alloc, content, "\n"); + extract_docx_char_append_string(alloc, content, "<w:p>\n"); + extract_docx_char_append_string(alloc, content, " <w:r>\n"); + extract_docx_char_append_string(alloc, content, " <mc:AlternateContent>\n"); + extract_docx_char_append_string(alloc, content, " <mc:Choice Requires=\"wps\">\n"); + extract_docx_char_append_string(alloc, content, " <w:drawing>\n"); + extract_docx_char_append_string(alloc, content, " <wp:anchor distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" simplePos=\"0\" relativeHeight=\"0\" behindDoc=\"0\" locked=\"0\" layoutInCell=\"1\" allowOverlap=\"1\" wp14:anchorId=\"53A210D1\" wp14:editId=\"2B7E8016\">\n"); + extract_docx_char_append_string(alloc, content, " <wp:simplePos x=\"0\" y=\"0\"/>\n"); + extract_docx_char_append_string(alloc, content, " <wp:positionH relativeFrom=\"page\">\n"); + extract_docx_char_append_stringf(alloc, content," <wp:posOffset>%i</wp:posOffset>\n", x); + extract_docx_char_append_string(alloc, content, " </wp:positionH>\n"); + extract_docx_char_append_string(alloc, content, " <wp:positionV relativeFrom=\"page\">\n"); + extract_docx_char_append_stringf(alloc, content," <wp:posOffset>%i</wp:posOffset>\n", y); + extract_docx_char_append_string(alloc, content, " </wp:positionV>\n"); + extract_docx_char_append_stringf(alloc, content," <wp:extent cx=\"%i\" cy=\"%i\"/>\n", w, h); + extract_docx_char_append_string(alloc, content, " <wp:effectExtent l=\"381000\" t=\"723900\" r=\"371475\" b=\"723900\"/>\n"); + extract_docx_char_append_string(alloc, content, " <wp:wrapNone/>\n"); + extract_docx_char_append_stringf(alloc, content," <wp:docPr id=\"%i\" name=\"Text Box %i\"/>\n", text_box_id, text_box_id); + extract_docx_char_append_string(alloc, content, " <wp:cNvGraphicFramePr/>\n"); + extract_docx_char_append_string(alloc, content, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n"); + extract_docx_char_append_string(alloc, content, " <a:graphicData uri=\"http://schemas.microsoft.com/office/word/2010/wordprocessingShape\">\n"); + extract_docx_char_append_string(alloc, content, " <wps:wsp>\n"); + extract_docx_char_append_string(alloc, content, " <wps:cNvSpPr txBox=\"1\"/>\n"); + extract_docx_char_append_string(alloc, content, " <wps:spPr>\n"); + extract_docx_char_append_stringf(alloc, content," <a:xfrm rot=\"%i\">\n", rot); + extract_docx_char_append_string(alloc, content, " <a:off x=\"0\" y=\"0\"/>\n"); + extract_docx_char_append_string(alloc, content, " <a:ext cx=\"3228975\" cy=\"2286000\"/>\n"); + extract_docx_char_append_string(alloc, content, " </a:xfrm>\n"); + extract_docx_char_append_string(alloc, content, " <a:prstGeom prst=\"rect\">\n"); + extract_docx_char_append_string(alloc, content, " <a:avLst/>\n"); + extract_docx_char_append_string(alloc, content, " </a:prstGeom>\n"); + + /* Give box a solid background. */ + if (0) { + extract_docx_char_append_string(alloc, content, " <a:solidFill>\n"); + extract_docx_char_append_string(alloc, content, " <a:schemeClr val=\"lt1\"/>\n"); + extract_docx_char_append_string(alloc, content, " </a:solidFill>\n"); + } + + /* Draw line around box. */ + if (0) { + extract_docx_char_append_string(alloc, content, " <a:ln w=\"175\">\n"); + extract_docx_char_append_string(alloc, content, " <a:solidFill>\n"); + extract_docx_char_append_string(alloc, content, " <a:prstClr val=\"black\"/>\n"); + extract_docx_char_append_string(alloc, content, " </a:solidFill>\n"); + extract_docx_char_append_string(alloc, content, " </a:ln>\n"); + } + + extract_docx_char_append_string(alloc, content, " </wps:spPr>\n"); + extract_docx_char_append_string(alloc, content, " <wps:txbx>\n"); + extract_docx_char_append_string(alloc, content, " <w:txbxContent>"); + + #if 0 + if (0) { + /* Output inline text describing the rotation. */ + extract_docx_char_append_stringf(content, "<w:p>\n" + "<w:r><w:rPr><w:rFonts w:ascii=\"OpenSans\" w:hAnsi=\"OpenSans\"/><w:sz w:val=\"20.000000\"/><w:szCs w:val=\"15.000000\"/></w:rPr><w:t xml:space=\"preserve\">*** rotate: %f rad, %f deg. rot=%i</w:t></w:r>\n" + "</w:p>\n", + rotate, + rotate * 180 / pi, + rot + ); + } + #endif + + /* Output paragraphs p0..p2-1. */ + for (p=paragraph_begin; p<paragraph_end; ++p) { + paragraph_t* paragraph = page->paragraphs[p]; + if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; + } + + extract_docx_char_append_string(alloc, content, "\n"); + extract_docx_char_append_string(alloc, content, " </w:txbxContent>\n"); + extract_docx_char_append_string(alloc, content, " </wps:txbx>\n"); + extract_docx_char_append_string(alloc, content, " <wps:bodyPr rot=\"0\" spcFirstLastPara=\"0\" vertOverflow=\"overflow\" horzOverflow=\"overflow\" vert=\"horz\" wrap=\"square\" lIns=\"91440\" tIns=\"45720\" rIns=\"91440\" bIns=\"45720\" numCol=\"1\" spcCol=\"0\" rtlCol=\"0\" fromWordArt=\"0\" anchor=\"t\" anchorCtr=\"0\" forceAA=\"0\" compatLnSpc=\"1\">\n"); + extract_docx_char_append_string(alloc, content, " <a:prstTxWarp prst=\"textNoShape\">\n"); + extract_docx_char_append_string(alloc, content, " <a:avLst/>\n"); + extract_docx_char_append_string(alloc, content, " </a:prstTxWarp>\n"); + extract_docx_char_append_string(alloc, content, " <a:noAutofit/>\n"); + extract_docx_char_append_string(alloc, content, " </wps:bodyPr>\n"); + extract_docx_char_append_string(alloc, content, " </wps:wsp>\n"); + extract_docx_char_append_string(alloc, content, " </a:graphicData>\n"); + extract_docx_char_append_string(alloc, content, " </a:graphic>\n"); + extract_docx_char_append_string(alloc, content, " </wp:anchor>\n"); + extract_docx_char_append_string(alloc, content, " </w:drawing>\n"); + extract_docx_char_append_string(alloc, content, " </mc:Choice>\n"); + + /* This fallback is copied from a real Word document. Not sure + whether it works - both Libreoffice and Word use the above + choice. */ + extract_docx_char_append_string(alloc, content, " <mc:Fallback>\n"); + extract_docx_char_append_string(alloc, content, " <w:pict>\n"); + extract_docx_char_append_string(alloc, content, " <v:shapetype w14:anchorId=\"53A210D1\" id=\"_x0000_t202\" coordsize=\"21600,21600\" o:spt=\"202\" path=\"m,l,21600r21600,l21600,xe\">\n"); + extract_docx_char_append_string(alloc, content, " <v:stroke joinstyle=\"miter\"/>\n"); + extract_docx_char_append_string(alloc, content, " <v:path gradientshapeok=\"t\" o:connecttype=\"rect\"/>\n"); + extract_docx_char_append_string(alloc, content, " </v:shapetype>\n"); + extract_docx_char_append_stringf(alloc, content," <v:shape id=\"Text Box %i\" o:spid=\"_x0000_s1026\" type=\"#_x0000_t202\" style=\"position:absolute;margin-left:71.25pt;margin-top:48.75pt;width:254.25pt;height:180pt;rotation:-2241476fd;z-index:251659264;visibility:visible;mso-wrap-style:square;mso-wrap-distance-left:9pt;mso-wrap-distance-top:0;mso-wrap-distance-right:9pt;mso-wrap-distance-bottom:0;mso-position-horizontal:absolute;mso-position-horizontal-relative:text;mso-position-vertical:absolute;mso-position-vertical-relative:text;v-text-anchor:top\" o:gfxdata=\"UEsDBBQABgAIAAAAIQC2gziS/gAAAOEBAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbJSRQU7DMBBF 90jcwfIWJU67QAgl6YK0S0CoHGBkTxKLZGx5TGhvj5O2G0SRWNoz/78nu9wcxkFMGNg6quQqL6RA 0s5Y6ir5vt9lD1JwBDIwOMJKHpHlpr69KfdHjyxSmriSfYz+USnWPY7AufNIadK6MEJMx9ApD/oD OlTrorhX2lFEilmcO2RdNtjC5xDF9pCuTyYBB5bi6bQ4syoJ3g9WQ0ymaiLzg5KdCXlKLjvcW893 SUOqXwnz5DrgnHtJTxOsQfEKIT7DmDSUCaxw7Rqn8787ZsmRM9e2VmPeBN4uqYvTtW7jvijg9N/y JsXecLq0q+WD6m8AAAD//wMAUEsDBBQABgAIAAAAIQA4/SH/1gAAAJQBAAALAAAAX3JlbHMvLnJl bHOkkMFqwzAMhu+DvYPRfXGawxijTi+j0GvpHsDYimMaW0Yy2fr2M4PBMnrbUb/Q94l/f/hMi1qR JVI2sOt6UJgd+ZiDgffL8ekFlFSbvV0oo4EbChzGx4f9GRdb25HMsYhqlCwG5lrLq9biZkxWOiqY 22YiTra2kYMu1l1tQD30/bPm3wwYN0x18gb45AdQl1tp5j/sFB2T0FQ7R0nTNEV3j6o9feQzro1i OWA14Fm+Q8a1a8+Bvu/d/dMb2JY5uiPbhG/ktn4cqGU/er3pcvwCAAD//wMAUEsDBBQABgAIAAAA IQDQg5pQVgIAALEEAAAOAAAAZHJzL2Uyb0RvYy54bWysVE1v2zAMvQ/YfxB0X+2k+WiDOEXWosOA oi3QDj0rstwYk0VNUmJ3v35PipMl3U7DLgJFPj+Rj6TnV12j2VY5X5Mp+OAs50wZSWVtXgv+7fn2 0wVnPghTCk1GFfxNeX61+Phh3tqZGtKadKkcA4nxs9YWfB2CnWWZl2vVCH9GVhkEK3KNCLi616x0 ogV7o7Nhnk+yllxpHUnlPbw3uyBfJP6qUjI8VJVXgemCI7eQTpfOVTyzxVzMXp2w61r2aYh/yKIR tcGjB6obEQTbuPoPqqaWjjxV4UxSk1FV1VKlGlDNIH9XzdNaWJVqgTjeHmTy/49W3m8fHatL9I4z Ixq06Fl1gX2mjg2iOq31M4CeLGChgzsie7+HMxbdVa5hjiDu4HI8ml5MpkkLVMcAh+xvB6kjt4Tz fDi8uJyOOZOIwZ7keWpGtmOLrNb58EVRw6JRcIdeJlqxvfMBGQC6h0S4J12Xt7XW6RLnR11rx7YC ndch5YwvTlDasLbgk/NxnohPYpH68P1KC/k9Vn3KgJs2cEaNdlpEK3SrrhdoReUbdEvSQAZv5W0N 3jvhw6NwGDQ4sTzhAUelCclQb3G2Jvfzb/6IR/8R5azF4Bbc/9gIpzjTXw0m43IwGsVJT5fReDrE xR1HVscRs2muCQqh+8gumREf9N6sHDUv2LFlfBUhYSTeLnjYm9dht07YUamWywTCbFsR7syTlZF6 383n7kU42/czYBTuaT/iYvaurTts/NLQchOoqlPPo8A7VXvdsRepLf0Ox8U7vifU7z/N4hcAAAD/ /wMAUEsDBBQABgAIAAAAIQBh17L63wAAAAoBAAAPAAAAZHJzL2Rvd25yZXYueG1sTI9BT4NAEIXv Jv6HzZh4s0ubgpayNIboSW3Syg9Y2BGI7CyyS0v99Y4nPU3ezMub72W72fbihKPvHClYLiIQSLUz HTUKyvfnuwcQPmgyuneECi7oYZdfX2U6Ne5MBzwdQyM4hHyqFbQhDKmUvm7Rar9wAxLfPtxodWA5 NtKM+szhtperKEqk1R3xh1YPWLRYfx4nq8APVfz9VQxPb+WUNC+vZbGPDhelbm/mxy2IgHP4M8Mv PqNDzkyVm8h40bNer2K2Ktjc82RDEi+5XKVgHfNG5pn8XyH/AQAA//8DAFBLAQItABQABgAIAAAA IQC2gziS/gAAAOEBAAATAAAAAAAAAAAAAAAAAAAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAi0A FAAGAAgAAAAhADj9If/WAAAAlAEAAAsAAAAAAAAAAAAAAAAALwEAAF9yZWxzLy5yZWxzUEsBAi0A FAAGAAgAAAAhANCDmlBWAgAAsQQAAA4AAAAAAAAAAAAAAAAALgIAAGRycy9lMm9Eb2MueG1sUEsB Ai0AFAAGAAgAAAAhAGHXsvrfAAAACgEAAA8AAAAAAAAAAAAAAAAAsAQAAGRycy9kb3ducmV2Lnht bFBLBQYAAAAABAAEAPMAAAC8BQAAAAA= \" fillcolor=\"white [3201]\" strokeweight=\".5pt\">\n", text_box_id); + extract_docx_char_append_string(alloc, content, " <v:textbox>\n"); + extract_docx_char_append_string(alloc, content, " <w:txbxContent>"); + + for (p=paragraph_begin; p<paragraph_end; ++p) { + paragraph_t* paragraph = page->paragraphs[p]; + if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; + } + + extract_docx_char_append_string(alloc, content, "\n"); + extract_docx_char_append_string(alloc, content, "\n"); + extract_docx_char_append_string(alloc, content, " </w:txbxContent>\n"); + extract_docx_char_append_string(alloc, content, " </v:textbox>\n"); + extract_docx_char_append_string(alloc, content, " </v:shape>\n"); + extract_docx_char_append_string(alloc, content, " </w:pict>\n"); + extract_docx_char_append_string(alloc, content, " </mc:Fallback>\n"); + extract_docx_char_append_string(alloc, content, " </mc:AlternateContent>\n"); + extract_docx_char_append_string(alloc, content, " </w:r>\n"); + extract_docx_char_append_string(alloc, content, "</w:p>"); + e = 0; + end: + return e; +} + + +int extract_document_to_docx_content( + extract_alloc_t* alloc, + document_t* document, + int spacing, + int rotation, + int images, + extract_astring_t* content + ) +{ + int ret = -1; + int text_box_id = 0; + int p; + + /* Write paragraphs into <content>. */ + for (p=0; p<document->pages_num; ++p) { + page_t* page = document->pages[p]; + int p; + content_state_t state; + state.font_name = NULL; + state.font_size = 0; + state.font_bold = 0; + state.font_italic = 0; + state.ctm_prev = NULL; + + for (p=0; p<page->paragraphs_num; ++p) { + paragraph_t* paragraph = page->paragraphs[p]; + const matrix_t* ctm = ¶graph->lines[0]->spans[0]->ctm; + double rotate = atan2(ctm->b, ctm->a); + + if (spacing + && state.ctm_prev + && paragraph->lines_num + && paragraph->lines[0]->spans_num + && matrix_cmp4( + state.ctm_prev, + ¶graph->lines[0]->spans[0]->ctm + ) + ) { + /* Extra vertical space between paragraphs that were at + different angles in the original document. */ + if (extract_docx_paragraph_empty(alloc, content)) goto end; + } + + if (spacing) { + /* Extra vertical space between paragraphs. */ + if (extract_docx_paragraph_empty(alloc, content)) goto end; + } + + if (rotation && rotate != 0) { + + /* Find extent of paragraphs with this same rotation. extent + will contain max width and max height of paragraphs, in units + before application of ctm, i.e. before rotation. */ + point_t extent = {0, 0}; + int p0 = p; + int p1; + + outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", + rotate, rotate * 180 / pi, + ctm->e, + ctm->f, + ctm->a, + ctm->b, + ctm->c, + ctm->d + ); + + { + /* We assume that first span is at origin of text + block. This assumes left-to-right text. */ + double rotate0 = rotate; + const matrix_t* ctm0 = ctm; + point_t origin = { + paragraph->lines[0]->spans[0]->chars[0].x, + paragraph->lines[0]->spans[0]->chars[0].y + }; + matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0}; + double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c; + if (ctm_det != 0) { + ctm_inverse.a = +ctm->d / ctm_det; + ctm_inverse.b = -ctm->b / ctm_det; + ctm_inverse.c = -ctm->c / ctm_det; + ctm_inverse.d = +ctm->a / ctm_det; + } + else { + outf("cannot invert ctm=(%f %f %f %f)", + ctm->a, ctm->b, ctm->c, ctm->d); + } + + for (p=p0; p<page->paragraphs_num; ++p) { + paragraph = page->paragraphs[p]; + ctm = ¶graph->lines[0]->spans[0]->ctm; + rotate = atan2(ctm->b, ctm->a); + if (rotate != rotate0) { + break; + } + + /* Update <extent>. */ + { + int l; + for (l=0; l<paragraph->lines_num; ++l) { + line_t* line = paragraph->lines[l]; + span_t* span = line_span_last(line); + char_t* char_ = span_char_last(span); + double adv = char_->adv * matrix_expansion(span->trm); + double x = char_->x + adv * cos(rotate); + double y = char_->y + adv * sin(rotate); + + double dx = x - origin.x; + double dy = y - origin.y; + + /* Position relative to origin and before box rotation. */ + double xx = ctm_inverse.a * dx + ctm_inverse.b * dy; + double yy = ctm_inverse.c * dx + ctm_inverse.d * dy; + yy = -yy; + if (xx > extent.x) extent.x = xx; + if (yy > extent.y) extent.y = yy; + if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s", + rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span)); + } + } + } + p1 = p; + rotate = rotate0; + ctm = ctm0; + outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)", + rotate, p0, p1, extent.x, extent.y); + } + + /* Paragraphs p0..p1-1 have same rotation. We output them into + a single rotated text box. */ + + /* We need unique id for text box. */ + text_box_id += 1; + + { + /* Angles are in units of 1/60,000 degree. */ + int rot = (int) (rotate * 180 / pi * 60000); + + /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm. + relativeHeight is z-ordering. (wp:positionV:wp:posOffset, + wp:positionV:wp:posOffset) is position of origin of box in + EMU. + + The box rotates about its centre but we want to rotate + about the origin (top-left). So we correct the position of + box by subtracting the vector that the top-left moves when + rotated by angle <rotate> about the middle. */ + double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */ + int x = (int) (ctm->e * point_to_emu); + int y = (int) (ctm->f * point_to_emu); + int w = (int) (extent.x * point_to_emu); + int h = (int) (extent.y * point_to_emu); + int dx; + int dy; + + if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot); + + h *= 2; + /* We can't predict how much space Word will actually + require for the rotated text, so make the box have the + original width but allow text to take extra vertical + space. There doesn't seem to be a way to make the text box + auto-grow to contain the text. */ + + dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0); + dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0); + outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik", + ctm->e, + ctm->f, + rotate * 180/pi, + x/1000, + y/1000, + dx/1000, + dy/1000 + ); + x -= dx; + y -= -dy; + + if (extract_document_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, text_box_id, content, &state)) goto end; + } + p = p1 - 1; + //p = page->paragraphs_num - 1; + } + else { + if (extract_document_to_docx_content_paragraph(alloc, &state, paragraph, content)) goto end; + } + + } + + if (images) { + int i; + for (i=0; i<page->images_num; ++i) { + extract_document_append_image(alloc, content, &page->images[i]); + } + } + } + ret = 0; + + end: + + return ret; +} + + + +static int systemf(extract_alloc_t* alloc, const char* format, ...) +/* Like system() but takes printf-style format and args. Also, if we return +ve +we set errno to EIO. */ +{ + int e; + char* command; + va_list va; + va_start(va, format); + e = extract_vasprintf(alloc, &command, format, va); + va_end(va); + if (e < 0) return e; + outf("running: %s", command); + e = system(command); + extract_free(alloc, &command); + if (e > 0) { + errno = EIO; + } + return e; +} + +static int read_all(extract_alloc_t* alloc, FILE* in, char** o_out) +/* Reads until eof into zero-terminated malloc'd buffer. */ +{ + size_t len = 0; + size_t delta = 128; + for(;;) { + size_t n; + if (extract_realloc2(alloc, o_out, len, len + delta + 1)) { + extract_free(alloc, o_out); + return -1; + } + n = fread(*o_out + len, 1 /*size*/, delta /*nmemb*/, in); + len += n; + if (feof(in)) { + (*o_out)[len] = 0; + return 0; + } + if (ferror(in)) { + /* It's weird that fread() and ferror() don't set errno. */ + errno = EIO; + extract_free(alloc, o_out); + return -1; + } + } +} + +static int read_all_path(extract_alloc_t* alloc, const char* path, char** o_text) +/* Reads entire file into zero-terminated malloc'd buffer. */ +{ + int e = -1; + FILE* f = NULL; + f = fopen(path, "rb"); + if (!f) goto end; + if (read_all(alloc, f, o_text)) goto end; + e = 0; + end: + if (f) fclose(f); + if (e) extract_free(alloc, &o_text); + return e; +} + +static int write_all(const void* data, size_t data_size, const char* path) +{ + int e = -1; + FILE* f = fopen(path, "w"); + if (!f) goto end; + if (fwrite(data, data_size, 1 /*nmemb*/, f) != 1) goto end; + e = 0; + end: + if (f) fclose(f); + return e; +} + +static int extract_docx_content_insert( + extract_alloc_t* alloc, + const char* original, + const char* mid_begin_name, + const char* mid_end_name, + extract_astring_t* contentss, + int contentss_num, + char** o_out + ) +/* Creates a string consisting of <original> with all strings in <contentss> +inserted into <original>'s <mid_begin_name>...<mid_end_name> region, and +appends this string to *o_out. */ +{ + int e = -1; + const char* mid_begin; + const char* mid_end; + extract_astring_t out; + extract_astring_init(&out); + + mid_begin = strstr(original, mid_begin_name); + if (!mid_begin) { + outf("error: could not find '%s' in docx content", + mid_begin_name); + errno = ESRCH; + goto end; + } + mid_begin += strlen(mid_begin_name); + + mid_end = strstr(mid_begin, mid_end_name); + if (!mid_end) { + outf("error: could not find '%s' in docx content", + mid_end_name); + errno = ESRCH; + goto end; + } + + if (extract_astring_catl(alloc, &out, original, mid_begin - original)) goto end; + { + int i; + for (i=0; i<contentss_num; ++i) { + if (extract_astring_catl(alloc, &out, contentss[i].chars, contentss[i].chars_num)) goto end; + } + } + if (extract_astring_cat(alloc, &out, mid_end)) goto end; + + *o_out = out.chars; + out.chars = NULL; + e = 0; + + end: + if (e) { + extract_astring_free(alloc, &out); + *o_out = NULL; + } + return e; +} + +static int s_find_mid(const char* text, const char* begin, const char* end, const char** o_begin, const char** o_end) +/* Sets *o_begin to end of first occurrence of <begin> in <text>, and *o_end to +beginning of first occurtence of <end> in <text>. */ +{ + *o_begin = strstr(text, begin); + if (!*o_begin) goto fail; + *o_begin += strlen(begin); + *o_end = strstr(*o_begin, end); + if (!*o_end) goto fail; + return 0; + fail: + errno = ESRCH; + return -1; +} + + +int extract_docx_content_item( + extract_alloc_t* alloc, + extract_astring_t* contentss, + int contentss_num, + images_t* images, + const char* name, + const char* text, + char** text2 + ) +{ + int e = -1; + extract_astring_t temp; + extract_astring_init(&temp); + *text2 = NULL; + + if (0) + {} + else if (!strcmp(name, "[Content_Types].xml")) { + /* Add information about all image types that we are going to use. */ + const char* begin; + const char* end; + const char* insert; + int it; + extract_astring_free(alloc, &temp); + outf("text: %s", text); + if (s_find_mid(text, "<Types ", "</Types>", &begin, &end)) goto end; + + insert = begin; + insert = strchr(insert, '>'); + assert(insert); + insert += 1; + + if (extract_astring_catl(alloc, &temp, text, insert - text)) goto end; + outf("images->imagetypes_num=%i", images->imagetypes_num); + for (it=0; it<images->imagetypes_num; ++it) { + const char* imagetype = images->imagetypes[it]; + if (extract_astring_cat(alloc, &temp, "<Default Extension=\"")) goto end; + if (extract_astring_cat(alloc, &temp, imagetype)) goto end; + if (extract_astring_cat(alloc, &temp, "\" ContentType=\"image/")) goto end; + if (extract_astring_cat(alloc, &temp, imagetype)) goto end; + if (extract_astring_cat(alloc, &temp, "\"/>")) goto end; + } + if (extract_astring_cat(alloc, &temp, insert)) goto end; + *text2 = temp.chars; + extract_astring_init(&temp); + } + else if (!strcmp(name, "word/_rels/document.xml.rels")) { + /* Add relationships between image ids and image names within docx + archive. */ + const char* begin; + const char* end; + int j; + extract_astring_free(alloc, &temp); + if (s_find_mid(text, "<Relationships", "</Relationships>", &begin, &end)) goto end; + if (extract_astring_catl(alloc, &temp, text, end - text)) goto end; + outf("images.images_num=%i", images->images_num); + for (j=0; j<images->images_num; ++j) { + image_t* image = &images->images[j]; + if (extract_astring_cat(alloc, &temp, "<Relationship Id=\"")) goto end; + if (extract_astring_cat(alloc, &temp, image->id)) goto end; + if (extract_astring_cat(alloc, &temp, "\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/image\" Target=\"media/")) goto end; + if (extract_astring_cat(alloc, &temp, image->name)) goto end; + if (extract_astring_cat(alloc, &temp, "\"/>")) goto end; + } + if (extract_astring_cat(alloc, &temp, end)) goto end; + *text2 = temp.chars; + extract_astring_init(&temp); + } + else if (!strcmp(name, "word/document.xml")) { + /* Insert paragraphs content. */ + if (extract_docx_content_insert( + alloc, + text, + "<w:body>", + "</w:body>", + contentss, + contentss_num, + text2 + )) goto end; + } + else { + *text2 = NULL; + } + e = 0; + end: + if (e) { + /* We might have set <text2> to new content. */ + extract_free(alloc, text2); + /* We might have used <temp> as a temporary buffer. */ + extract_astring_free(alloc, &temp); + } + extract_astring_init(&temp); + return e; +} + + + +static int check_path_shell_safe(const char* path) +/* Returns -1 with errno=EINVAL if <path> contains sequences that could make it +unsafe in shell commands. */ +{ + if (0 + || strstr(path, "..") + || strchr(path, '\'') + || strchr(path, '"') + || strchr(path, ' ') + ) { + errno = EINVAL; + return -1; + } + return 0; +} + +static int remove_directory(extract_alloc_t* alloc, const char* path) +{ + if (check_path_shell_safe(path)) { + outf("path_out is unsafe: %s", path); + return -1; + } + return systemf(alloc, "rm -r '%s'", path); +} + +#ifdef _WIN32 +#include <direct.h> +static int s_mkdir(const char* path, int mode) +{ + (void) mode; + return _mkdir(path); +} +#else +static int s_mkdir(const char* path, int mode) +{ + return mkdir(path, mode); +} +#endif + + +int extract_docx_write_template( + extract_alloc_t* alloc, + extract_astring_t* contentss, + int contentss_num, + images_t* images, + const char* path_template, + const char* path_out, + int preserve_dir + ) +{ + int e = -1; + int i; + char* path_tempdir = NULL; + FILE* f = NULL; + char* path = NULL; + char* text = NULL; + char* text2 = NULL; + + assert(path_out); + assert(path_template); + + if (check_path_shell_safe(path_out)) { + outf("path_out is unsafe: %s", path_out); + goto end; + } + + outf("images->images_num=%i", images->images_num); + if (extract_asprintf(alloc, &path_tempdir, "%s.dir", path_out) < 0) goto end; + if (systemf(alloc, "rm -r '%s' 2>/dev/null", path_tempdir) < 0) goto end; + + if (s_mkdir(path_tempdir, 0777)) { + outf("Failed to create directory: %s", path_tempdir); + goto end; + } + + outf("Unzipping template document '%s' to tempdir: %s", + path_template, path_tempdir); + e = systemf(alloc, "unzip -q -d '%s' '%s'", path_tempdir, path_template); + if (e) { + outf("Failed to unzip %s into %s", + path_template, path_tempdir); + goto end; + } + + /* Might be nice to iterate through all items in path_tempdir, but for now + we look at just the items that we know extract_docx_content_item() will + modify. */ + + { + const char* names[] = { + "word/document.xml", + "[Content_Types].xml", + "word/_rels/document.xml.rels", + }; + int names_num = sizeof(names) / sizeof(names[0]); + for (i=0; i<names_num; ++i) { + const char* name = names[i]; + extract_free(alloc, &path); + extract_free(alloc, &text); + extract_free(alloc, &text2); + if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end; + if (read_all_path(alloc, path, &text)) goto end; + + if (extract_docx_content_item( + alloc, + contentss, + contentss_num, + images, + name, + text, + &text2 + )) goto end; + { + const char* text3 = (text2) ? text2 : text; + if (write_all(text3, strlen(text3), path)) goto end; + } + } + } + + /* Copy images into <path_tempdir>/media/. */ + extract_free(alloc, &path); + if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end; + if (s_mkdir(path, 0777)) goto end; + + for (i=0; i<images->images_num; ++i) { + image_t* image = &images->images[i]; + extract_free(alloc, &path); + if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end; + if (write_all(image->data, image->data_size, path)) goto end; + } + + outf("Zipping tempdir to create %s", path_out); + { + const char* path_out_leaf = strrchr(path_out, '/'); + if (!path_out_leaf) path_out_leaf = path_out; + e = systemf(alloc, "cd '%s' && zip -q -r -D '../%s' .", path_tempdir, path_out_leaf); + if (e) { + outf("Zip command failed to convert '%s' directory into output file: %s", + path_tempdir, path_out); + goto end; + } + } + + if (!preserve_dir) { + if (remove_directory(alloc, path_tempdir)) goto end; + } + + e = 0; + + end: + outf("e=%i", e); + extract_free(alloc, &path_tempdir); + extract_free(alloc, &path); + extract_free(alloc, &text); + extract_free(alloc, &text2); + if (f) fclose(f); + + if (e) { + outf("Failed to create %s", path_out); + } + return e; +} |