/* NOTE(review): the text of this file looks damaged by a markup-stripping step. The names of the system headers after #include are missing, most string literals that should carry docx XML are empty or whitespace-only, several loop bounds have lost their comparison (for example llines_num, sspans_num, ppages_num), and a few stretches of code are missing outright (inside extract_docx_content_insert() and the names loop of extract_docx_write_template()). Nothing has been repaired here, because the missing text cannot be recovered from this file alone: all code is left exactly as found and only comments were added. Restore the file from version control before building. */
/* These extract_docx_*() functions generate docx content and docx zip archive data. Caller must call things in a sensible order to create valid content - e.g. don't call docx_paragraph_start() twice without intervening call to docx_paragraph_finish(). */ #include "../include/extract.h" #include "docx_template.h" #include "astring.h" #include "document.h" #include "docx.h" #include "mem.h" #include "memento.h" #include "outf.h" #include "zip.h" #include #include #include #include #include #include #include #include /* Appends the paragraph-start literal to *content; returns the result of extract_astring_cat(). */ static int extract_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content) { return extract_astring_cat(alloc, content, "\n\n"); } /* Appends the paragraph-end literal to *content; returns the result of extract_astring_cat(). */ static int extract_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content) { return extract_astring_cat(alloc, content, "\n"); } static int extract_docx_run_start( extract_alloc_t* alloc, extract_astring_t* content, const char* font_name, double font_size, int bold, int italic ) /* Starts a new run. Caller must ensure that extract_docx_run_finish() was called to terminate any previous run. Appending stops at the first failing extract_astring_cat() call and that result is returned. NOTE(review): font_name and font_size_text are never used in the text below and most appended literals are empty - presumably lost content, confirm against version control. */ { int e = 0; if (!e) e = extract_astring_cat(alloc, content, "\n"); if (!e && bold) e = extract_astring_cat(alloc, content, ""); if (!e && italic) e = extract_astring_cat(alloc, content, ""); { char font_size_text[32]; if (0) font_size = 10; if (!e) e = extract_astring_cat(alloc, content, ""); if (!e) e = extract_astring_cat(alloc, content, ""); } if (!e) e = extract_astring_cat(alloc, content, ""); return e; } /* Appends the run-terminating literal to *content; returns the result of extract_astring_cat(). */ static int extract_docx_run_finish(extract_alloc_t* alloc, extract_astring_t* content) { return extract_astring_cat(alloc, content, ""); } /* Appends text to *content unchanged; returns the result of extract_astring_cat(). */ static int extract_docx_char_append_string(extract_alloc_t* alloc, extract_astring_t* content, const char* text) { return extract_astring_cat(alloc, content, text); } static int extract_docx_char_append_stringf(extract_alloc_t* alloc, extract_astring_t* content, const char* format, ...) 
/* Formats the printf-style arguments with extract_vasprintf() and appends the result to *content. Returns the negative value from extract_vasprintf() if formatting fails, otherwise the result of extract_astring_cat(); the temporary buffer is freed before returning. */ { char* buffer = NULL; int e; va_list va; va_start(va, format); e = extract_vasprintf(alloc, &buffer, format, va); va_end(va); if (e < 0) return e; e = extract_astring_cat(alloc, content, buffer); extract_free(alloc, &buffer); return e; } /* Appends the single char c to *content; returns the result of extract_astring_catc(). */ static int extract_docx_char_append_char(extract_alloc_t* alloc, extract_astring_t* content, char c) { return extract_astring_catc(alloc, content, c); } static int extract_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content) /* Append an empty paragraph to *content. Returns 0 on success, -1 if any of the start/finish calls fails. */ { int e = -1; if (extract_docx_paragraph_start(alloc, content)) goto end; /* It seems like our choice of font size here doesn't make any difference to the amount of vertical space, unless we include a non-space character. Presumably something to do with the styles in the template document. */ if (extract_docx_run_start( alloc, content, "OpenSans", 10 /*font_size*/, 0 /*font_bold*/, 0 /*font_italic*/ )) goto end; //docx_char_append_string(content, " "); /*   is non-break space. */ if (extract_docx_run_finish(alloc, content)) goto end; if (extract_docx_paragraph_finish(alloc, content)) goto end; e = 0; end: return e; } /* Removes last chars. */ /* Asserts that len does not exceed content->chars_num, shortens the string by len and re-terminates it; always returns 0. */ static int docx_char_truncate(extract_astring_t* content, int len) { assert((size_t) len <= content->chars_num); content->chars_num -= len; content->chars[content->chars_num] = 0; return 0; } static int extract_docx_char_truncate_if(extract_astring_t* content, char c) /* Removes last char if it is . */ { if (content->chars_num && content->chars[content->chars_num-1] == c) { docx_char_truncate(content, 1); } return 0; } /* Returns matrix_expansion(*trm) multiplied by matrix_expansion(*ctm), rounded to the nearest 0.01. */ static double matrices_to_font_size(matrix_t* ctm, matrix_t* trm) { double font_size = matrix_expansion(*trm) * matrix_expansion(*ctm); /* Round font_size to nearest 0.01. 
*/ font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f; return font_size; } typedef struct { const char* font_name; double font_size; int font_bold; int font_italic; matrix_t* ctm_prev; } content_state_t; /* Used to keep track of font information when writing paragraphs of docx content, e.g. so we know whether a font has changed so need to start a new docx span. */ static int extract_document_to_docx_content_paragraph( extract_alloc_t* alloc, content_state_t* state, paragraph_t* paragraph, extract_astring_t* content ) /* Append docx xml for paragraph to content. Updates *state if we change font: a new run is started whenever the span font name, bold, italic or computed size differs from *state, and any open run is finished (and state->font_name reset to NULL) before the paragraph is finished. Returns 0 on success, -1 on a failed append. NOTE(review): in the character loop the branches that escape XML special characters ignore the result of extract_docx_char_append_string(), whereas the ligature and fallback branches check it; confirm whether those results should be checked too. */ { int e = -1; int l; if (extract_docx_paragraph_start(alloc, content)) goto end; for (l=0; llines_num; ++l) { line_t* line = paragraph->lines[l]; int s; for (s=0; sspans_num; ++s) { int si; span_t* span = line->spans[s]; double font_size_new; state->ctm_prev = &span->ctm; font_size_new = matrices_to_font_size(&span->ctm, &span->trm); if (!state->font_name || strcmp(span->font_name, state->font_name) || span->font_bold != state->font_bold || span->font_italic != state->font_italic || font_size_new != state->font_size ) { if (state->font_name) { if (extract_docx_run_finish(alloc, content)) goto end; } state->font_name = span->font_name; state->font_bold = span->font_bold; state->font_italic = span->font_italic; state->font_size = font_size_new; if (extract_docx_run_start( alloc, content, state->font_name, state->font_size, state->font_bold, state->font_italic )) goto end; } for (si=0; sichars_num; ++si) { char_t* char_ = &span->chars[si]; int c = char_->ucs; if (0) {} /* Escape XML special characters. 
*/ else if (c == '<') extract_docx_char_append_string(alloc, content, "<"); else if (c == '>') extract_docx_char_append_string(alloc, content, ">"); else if (c == '&') extract_docx_char_append_string(alloc, content, "&"); else if (c == '"') extract_docx_char_append_string(alloc, content, """); else if (c == '\'') extract_docx_char_append_string(alloc, content, "'"); /* Expand ligatures. */ else if (c == 0xFB00) { if (extract_docx_char_append_string(alloc, content, "ff")) goto end; } else if (c == 0xFB01) { if (extract_docx_char_append_string(alloc, content, "fi")) goto end; } else if (c == 0xFB02) { if (extract_docx_char_append_string(alloc, content, "fl")) goto end; } else if (c == 0xFB03) { if (extract_docx_char_append_string(alloc, content, "ffi")) goto end; } else if (c == 0xFB04) { if (extract_docx_char_append_string(alloc, content, "ffl")) goto end; } /* Output ASCII verbatim. */ else if (c >= 32 && c <= 127) { if (extract_docx_char_append_char(alloc, content, (char) c)) goto end; } /* Escape all other characters. */ else { char buffer[32]; snprintf(buffer, sizeof(buffer), "&#x%x;", c); if (extract_docx_char_append_string(alloc, content, buffer)) goto end; } } /* Remove any trailing '-' at end of line. */ if (extract_docx_char_truncate_if(content, '-')) goto end; } } if (state->font_name) { if (extract_docx_run_finish(alloc, content)) goto end; state->font_name = NULL; } if (extract_docx_paragraph_finish(alloc, content)) goto end; e = 0; end: return e; } static int extract_document_append_image( extract_alloc_t* alloc, extract_astring_t* content, image_t* image ) /* Write reference to image into docx content. 
*/ /* NOTE(review): the result of every append call below is ignored and the function always returns 0, so a failed append is not reported to the caller. The one stringf call passes image->id to a format that shows no conversion - presumably lost content, confirm. */ { extract_docx_char_append_string(alloc, content, "\n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_stringf(alloc, content," \n", image->id); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); //extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); 
extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, "\n"); return 0; } static int extract_document_output_rotated_paragraphs( extract_alloc_t* alloc, page_t* page, int paragraph_begin, int paragraph_end, int rot, int x, int y, int w, int h, int text_box_id, extract_astring_t* content, content_state_t* state ) /* Writes paragraph to content inside rotated text box. 
*/ /* NOTE(review): e starts at 0 here, so the goto end taken when a paragraph append fails still returns 0; the results of the other append calls are ignored as well. Confirm whether failure should return non-zero as in the neighbouring functions. */ { int e = 0; int p; outf("x,y=%ik,%ik = %i,%i", x/1000, y/1000, x, y); extract_docx_char_append_string(alloc, content, "\n"); extract_docx_char_append_string(alloc, content, "\n"); extract_docx_char_append_string(alloc, content, "\n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_stringf(alloc, content," %i\n", x); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_stringf(alloc, content," %i\n", y); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_stringf(alloc, content," \n", w, h); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_stringf(alloc, content," \n", text_box_id, text_box_id); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_stringf(alloc, content," \n", rot); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); /* Give box a solid background. 
*/ if (0) { extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); } /* Draw line around box. */ if (0) { extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); } extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " "); #if 0 if (0) { /* Output inline text describing the rotation. */ extract_docx_char_append_stringf(content, "\n" "*** rotate: %f rad, %f deg. rot=%i\n" "\n", rotate, rotate * 180 / pi, rot ); } #endif /* Output paragraphs p0..p2-1. */ for (p=paragraph_begin; pparagraphs[p]; if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } extract_docx_char_append_string(alloc, content, "\n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); /* This fallback is copied from a real Word document. 
Not sure whether it works - both Libreoffice and Word use the above choice. */ extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_stringf(alloc, content," \n", text_box_id); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " "); for (p=paragraph_begin; pparagraphs[p]; if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } extract_docx_char_append_string(alloc, content, "\n"); extract_docx_char_append_string(alloc, content, "\n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, " \n"); extract_docx_char_append_string(alloc, content, ""); e = 0; end: return e; } /* Appends docx content for every page of document to *content. For each paragraph it optionally (spacing) emits empty paragraphs as extra vertical space; when rotation is set and the angle computed from the paragraph ctm is non-zero, the following paragraphs with the same angle are grouped and written into one rotated text box, otherwise the paragraph is appended directly. When images is set, the image references of each page are appended after its paragraphs. Returns 0 on success, -1 if a checked append fails. NOTE(review): ctm is read from paragraph->lines[0]->spans[0] before the lines_num and spans_num checks that follow, the inner int p shadows the outer loop variable, and the result of extract_document_append_image() is ignored; confirm these are intended. */ int extract_document_to_docx_content( extract_alloc_t* alloc, document_t* document, int spacing, int rotation, int images, extract_astring_t* content ) { int ret = -1; int text_box_id = 0; int p; /* Write paragraphs into . 
*/ for (p=0; ppages_num; ++p) { page_t* page = document->pages[p]; int p; content_state_t state; state.font_name = NULL; state.font_size = 0; state.font_bold = 0; state.font_italic = 0; state.ctm_prev = NULL; for (p=0; pparagraphs_num; ++p) { paragraph_t* paragraph = page->paragraphs[p]; const matrix_t* ctm = ¶graph->lines[0]->spans[0]->ctm; double rotate = atan2(ctm->b, ctm->a); if (spacing && state.ctm_prev && paragraph->lines_num && paragraph->lines[0]->spans_num && matrix_cmp4( state.ctm_prev, ¶graph->lines[0]->spans[0]->ctm ) ) { /* Extra vertical space between paragraphs that were at different angles in the original document. */ if (extract_docx_paragraph_empty(alloc, content)) goto end; } if (spacing) { /* Extra vertical space between paragraphs. */ if (extract_docx_paragraph_empty(alloc, content)) goto end; } if (rotation && rotate != 0) { /* Find extent of paragraphs with this same rotation. extent will contain max width and max height of paragraphs, in units before application of ctm, i.e. before rotation. */ point_t extent = {0, 0}; int p0 = p; int p1; outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", rotate, rotate * 180 / pi, ctm->e, ctm->f, ctm->a, ctm->b, ctm->c, ctm->d ); { /* We assume that first span is at origin of text block. This assumes left-to-right text. */ double rotate0 = rotate; const matrix_t* ctm0 = ctm; point_t origin = { paragraph->lines[0]->spans[0]->chars[0].x, paragraph->lines[0]->spans[0]->chars[0].y }; matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0}; double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c; if (ctm_det != 0) { ctm_inverse.a = +ctm->d / ctm_det; ctm_inverse.b = -ctm->b / ctm_det; ctm_inverse.c = -ctm->c / ctm_det; ctm_inverse.d = +ctm->a / ctm_det; } else { outf("cannot invert ctm=(%f %f %f %f)", ctm->a, ctm->b, ctm->c, ctm->d); } for (p=p0; pparagraphs_num; ++p) { paragraph = page->paragraphs[p]; ctm = ¶graph->lines[0]->spans[0]->ctm; rotate = atan2(ctm->b, ctm->a); if (rotate != rotate0) { break; } /* Update . 
*/ { int l; for (l=0; llines_num; ++l) { line_t* line = paragraph->lines[l]; span_t* span = line_span_last(line); char_t* char_ = span_char_last(span); double adv = char_->adv * matrix_expansion(span->trm); double x = char_->x + adv * cos(rotate); double y = char_->y + adv * sin(rotate); double dx = x - origin.x; double dy = y - origin.y; /* Position relative to origin and before box rotation. */ double xx = ctm_inverse.a * dx + ctm_inverse.b * dy; double yy = ctm_inverse.c * dx + ctm_inverse.d * dy; yy = -yy; if (xx > extent.x) extent.x = xx; if (yy > extent.y) extent.y = yy; if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s", rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span)); } } } p1 = p; rotate = rotate0; ctm = ctm0; outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)", rotate, p0, p1, extent.x, extent.y); } /* Paragraphs p0..p1-1 have same rotation. We output them into a single rotated text box. */ /* We need unique id for text box. */ text_box_id += 1; { /* Angles are in units of 1/60,000 degree. */ int rot = (int) (rotate * 180 / pi * 60000); /* about the middle. */ double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */ int x = (int) (ctm->e * point_to_emu); int y = (int) (ctm->f * point_to_emu); int w = (int) (extent.x * point_to_emu); int h = (int) (extent.y * point_to_emu); int dx; int dy; if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot); h *= 2; /* We can't predict how much space Word will actually require for the rotated text, so make the box have the original width but allow text to take extra vertical space. There doesn't seem to be a way to make the text box auto-grow to contain the text. 
*/ dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0); dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0); outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik", ctm->e, ctm->f, rotate * 180/pi, x/1000, y/1000, dx/1000, dy/1000 ); x -= dx; y -= -dy; if (extract_document_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, text_box_id, content, &state)) goto end; } p = p1 - 1; //p = page->paragraphs_num - 1; } else { if (extract_document_to_docx_content_paragraph(alloc, &state, paragraph, content)) goto end; } } if (images) { int i; for (i=0; iimages_num; ++i) { extract_document_append_image(alloc, content, &page->images[i]); } } } ret = 0; end: return ret; } static int systemf(extract_alloc_t* alloc, const char* format, ...) /* Like system() but takes printf-style format and args. Also, if we return +ve we set errno to EIO. */ { int e; char* command; va_list va; va_start(va, format); e = extract_vasprintf(alloc, &command, format, va); va_end(va); if (e < 0) return e; outf("running: %s", command); e = system(command); extract_free(alloc, &command); if (e > 0) { errno = EIO; } return e; } static int read_all(extract_alloc_t* alloc, FILE* in, char** o_out) /* Reads until eof into zero-terminated malloc'd buffer. */ { size_t len = 0; size_t delta = 128; for(;;) { size_t n; if (extract_realloc2(alloc, o_out, len, len + delta + 1)) { extract_free(alloc, o_out); return -1; } n = fread(*o_out + len, 1 /*size*/, delta /*nmemb*/, in); len += n; if (feof(in)) { (*o_out)[len] = 0; return 0; } if (ferror(in)) { /* It's weird that fread() and ferror() don't set errno. */ errno = EIO; extract_free(alloc, o_out); return -1; } } } static int read_all_path(extract_alloc_t* alloc, const char* path, char** o_text) /* Reads entire file into zero-terminated malloc'd buffer. 
*/ /* NOTE(review): on failure this passes &o_text (the address of the parameter itself) to extract_free(), whereas read_all() frees through o_out directly and has already done so on its own failure paths; confirm which pointer extract_free() is meant to receive here. */ { int e = -1; FILE* f = NULL; f = fopen(path, "rb"); if (!f) goto end; if (read_all(alloc, f, o_text)) goto end; e = 0; end: if (f) fclose(f); if (e) extract_free(alloc, &o_text); return e; } /* Writes data_size bytes from data to a newly opened file at path; returns 0 on success, -1 if fopen() or fwrite() fails. NOTE(review): the file is opened in text mode (w) although the caller visible in this file passes image data, and the result of fclose() is not checked; confirm whether mode wb is intended. */ static int write_all(const void* data, size_t data_size, const char* path) { int e = -1; FILE* f = fopen(path, "w"); if (!f) goto end; if (fwrite(data, data_size, 1 /*nmemb*/, f) != 1) goto end; e = 0; end: if (f) fclose(f); return e; } static int extract_docx_content_insert( extract_alloc_t* alloc, const char* original, const char* mid_begin_name, const char* mid_end_name, extract_astring_t* contentss, int contentss_num, char** o_out ) /* Creates a string consisting of original with all strings in contentss inserted into the region of original between mid_begin_name and mid_end_name, and appends this string to *o_out. Fails with errno=ESRCH if either marker is not found. NOTE(review): the tail of this function and the head of the next one are missing from the text below - presumably lost content; restore before relying on this description. */ { int e = -1; const char* mid_begin; const char* mid_end; extract_astring_t out; extract_astring_init(&out); mid_begin = strstr(original, mid_begin_name); if (!mid_begin) { outf("error: could not find '%s' in docx content", mid_begin_name); errno = ESRCH; goto end; } mid_begin += strlen(mid_begin_name); mid_end = strstr(mid_begin, mid_end_name); if (!mid_end) { outf("error: could not find '%s' in docx content", mid_end_name); errno = ESRCH; goto end; } if (extract_astring_catl(alloc, &out, original, mid_begin - original)) goto end; { int i; for (i=0; i in , and *o_end to beginning of first occurtence of in . */ { *o_begin = strstr(text, begin); if (!*o_begin) goto fail; *o_begin += strlen(begin); *o_end = strstr(*o_begin, end); if (!*o_end) goto fail; return 0; fail: errno = ESRCH; return -1; } int extract_docx_content_item( extract_alloc_t* alloc, extract_astring_t* contentss, int contentss_num, images_t* images, const char* name, const char* text, char** text2 ) { int e = -1; extract_astring_t temp; extract_astring_init(&temp); *text2 = NULL; if (0) {} else if (!strcmp(name, "[Content_Types].xml")) { /* Add information about all image types that we are going to use. 
*/ const char* begin; const char* end; const char* insert; int it; extract_astring_free(alloc, &temp); outf("text: %s", text); if (s_find_mid(text, "", &begin, &end)) goto end; insert = begin; insert = strchr(insert, '>'); assert(insert); insert += 1; if (extract_astring_catl(alloc, &temp, text, insert - text)) goto end; outf("images->imagetypes_num=%i", images->imagetypes_num); for (it=0; itimagetypes_num; ++it) { const char* imagetype = images->imagetypes[it]; if (extract_astring_cat(alloc, &temp, "")) goto end; } if (extract_astring_cat(alloc, &temp, insert)) goto end; *text2 = temp.chars; extract_astring_init(&temp); } else if (!strcmp(name, "word/_rels/document.xml.rels")) { /* Add relationships between image ids and image names within docx archive. */ const char* begin; const char* end; int j; extract_astring_free(alloc, &temp); if (s_find_mid(text, "", &begin, &end)) goto end; if (extract_astring_catl(alloc, &temp, text, end - text)) goto end; outf("images.images_num=%i", images->images_num); for (j=0; jimages_num; ++j) { image_t* image = &images->images[j]; if (extract_astring_cat(alloc, &temp, "id)) goto end; if (extract_astring_cat(alloc, &temp, "\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/image\" Target=\"media/")) goto end; if (extract_astring_cat(alloc, &temp, image->name)) goto end; if (extract_astring_cat(alloc, &temp, "\"/>")) goto end; } if (extract_astring_cat(alloc, &temp, end)) goto end; *text2 = temp.chars; extract_astring_init(&temp); } else if (!strcmp(name, "word/document.xml")) { /* Insert paragraphs content. */ if (extract_docx_content_insert( alloc, text, "", "", contentss, contentss_num, text2 )) goto end; } else { *text2 = NULL; } e = 0; end: if (e) { /* We might have set to new content. */ extract_free(alloc, text2); /* We might have used as a temporary buffer. 
*/ extract_astring_free(alloc, &temp); } extract_astring_init(&temp); return e; } static int check_path_shell_safe(const char* path) /* Returns -1 with errno=EINVAL if path contains a sequence that could make it unsafe inside a quoted shell argument: two consecutive dots, a single quote, a double quote or a space. Returns 0 otherwise. */ { if (0 || strstr(path, "..") || strchr(path, '\'') || strchr(path, '"') || strchr(path, ' ') ) { errno = EINVAL; return -1; } return 0; } /* Runs rm -r on path through systemf() after rejecting any path that fails check_path_shell_safe(); returns -1 for a rejected path, otherwise the result of systemf(). */ static int remove_directory(extract_alloc_t* alloc, const char* path) { if (check_path_shell_safe(path)) { outf("path_out is unsafe: %s", path); return -1; } return systemf(alloc, "rm -r '%s'", path); } #ifdef _WIN32 #include static int s_mkdir(const char* path, int mode) { (void) mode; return _mkdir(path); } #else static int s_mkdir(const char* path, int mode) { return mkdir(path, mode); } #endif /* Builds the docx file path_out from the template archive path_template: removes and recreates the directory path_out.dir, unzips the template into it with the external unzip command, then (in the remainder of this function) writes each image under word/media and zips the directory into path_out; the directory is removed afterwards unless preserve_dir is set. Returns 0 on success and non-zero on failure. NOTE(review): path_out is validated with check_path_shell_safe() but path_template is interpolated into the unzip command without that check; confirm callers only pass trusted template paths. */ int extract_docx_write_template( extract_alloc_t* alloc, extract_astring_t* contentss, int contentss_num, images_t* images, const char* path_template, const char* path_out, int preserve_dir ) { int e = -1; int i; char* path_tempdir = NULL; FILE* f = NULL; char* path = NULL; char* text = NULL; char* text2 = NULL; assert(path_out); assert(path_template); if (check_path_shell_safe(path_out)) { outf("path_out is unsafe: %s", path_out); goto end; } outf("images->images_num=%i", images->images_num); if (extract_asprintf(alloc, &path_tempdir, "%s.dir", path_out) < 0) goto end; if (systemf(alloc, "rm -r '%s' 2>/dev/null", path_tempdir) < 0) goto end; if (s_mkdir(path_tempdir, 0777)) { outf("Failed to create directory: %s", path_tempdir); goto end; } outf("Unzipping template document '%s' to tempdir: %s", path_template, path_tempdir); e = systemf(alloc, "unzip -q -d '%s' '%s'", path_tempdir, path_template); if (e) { outf("Failed to unzip %s into %s", path_template, path_tempdir); goto end; } /* Might be nice to iterate through all items in path_tempdir, but for now we look at just the items that we know extract_docx_content_item() will modify. 
*/ { const char* names[] = { "word/document.xml", "[Content_Types].xml", "word/_rels/document.xml.rels", }; int names_num = sizeof(names) / sizeof(names[0]); for (i=0; i/media/. */ extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end; if (s_mkdir(path, 0777)) goto end; for (i=0; iimages_num; ++i) { image_t* image = &images->images[i]; extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end; if (write_all(image->data, image->data_size, path)) goto end; } outf("Zipping tempdir to create %s", path_out); { const char* path_out_leaf = strrchr(path_out, '/'); if (!path_out_leaf) path_out_leaf = path_out; e = systemf(alloc, "cd '%s' && zip -q -r -D '../%s' .", path_tempdir, path_out_leaf); if (e) { outf("Zip command failed to convert '%s' directory into output file: %s", path_tempdir, path_out); goto end; } } if (!preserve_dir) { if (remove_directory(alloc, path_tempdir)) goto end; } e = 0; end: outf("e=%i", e); extract_free(alloc, &path_tempdir); extract_free(alloc, &path); extract_free(alloc, &text); extract_free(alloc, &text2); if (f) fclose(f); if (e) { outf("Failed to create %s", path_out); } return e; }