Loading generateBaseline/pandocFilter.py +21 −10 Original line number Diff line number Diff line Loading @@ -267,8 +267,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li _taskID = progress.add_task('[blue]Inserting line after table or code block', total=0) # progress.update() matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) # Also support quoted code fences in examples, e.g. "> ```sparql" matchCodefenceStart = re.compile(r'^\s*(?:>\s*)*```\s?.*', re.IGNORECASE) matchCodefenceEnd = re.compile(r'^\s*(?:>\s*)*```\s?$', re.IGNORECASE) matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) Loading @@ -291,13 +292,20 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li # Note, that longer codefences are allowed by the markdown specification. if matchCodefenceStart.match(line) and not inCodefence: # For quoted fences (e.g. "> ```"), add an empty quoted line before # the fence so the previous quoted text is not absorbed as code. if line.lstrip().startswith('>'): qprefix = re.match(r'^(\s*(?:>\s*)+)', line) if qprefix: _lines.append(f'{qprefix.group(1)}\n') inCodefence = True _lines.append(line) continue if matchCodefenceEnd.match(line): inCodefence = False _lines.append(line) _lines.append("<br />") # insert a blank line after the code block _lines.append("\n") _lines.append("<br />") continue if inCodefence: _lines.append(line) Loading @@ -320,9 +328,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li inTable = False tableHasSeparator = False # Mark the previous line as the last row in the table _lines.append("<br />") # insert a blank line after the table continue # continue with other matches _lines.append("\n") _lines.append("<br />") # Continue processing the current line (first line after table). #Detect grid tables and convert them to html table if matchGridTable.match(line) and not inGridTable: Loading @@ -344,8 +352,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li else: inGridTable = False #processGridTable() _lines.append("<br />") # insert a blank line after the grid table continue _lines.append("\n") _lines.append("<br />") # Continue processing the current line (first line after grid table). # continue with other matches # Detect notes Loading @@ -358,7 +367,8 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li if inNote and line == "": inNote = False _lines.append(line) _lines.append("<br />") # insert a blank line after the note _lines.append("\n") _lines.append("<br />") continue if inNote: _lines.append(line) Loading Loading @@ -408,6 +418,7 @@ def process(args) -> None: mdLines = correctTableSeparators(progress, mdLines) mdLines = instertLineBeforeStartOfList(progress, mdLines) mdLines = replaceBulletedLettersLists(progress, mdLines) # Spacing after code/table blocks is now handled in DOCX postprocessing. # mdLines = instertLineAfterTableOrCodeBlock(progress, mdLines) mdLines = replaceLineBreaks(progress, mdLines) writeMDFile(progress, mdLines, args.document, args.outDirectory) Loading generateBaseline/postprocessing.py +100 −0 Original line number Diff line number Diff line Loading @@ -2376,6 +2376,105 @@ def update_source_code_style(docx_input, docx_output): if os.path.exists(tmp_path): os.remove(tmp_path) def add_break_after_code_blocks_and_tables(docx_input, docx_output): """ Insert minimal separators after source-code/example paragraphs (styles 'PL'/'EW') and after tables to improve separation from following body text in Word. """ ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} with zipfile.ZipFile(docx_input, 'r') as zin: xml_data = zin.read("word/document.xml") root = etree.fromstring(xml_data) body = root.find(f".//{{{ns['w']}}}body") if body is None: return w_p = f"{{{ns['w']}}}p" w_pPr = f"{{{ns['w']}}}pPr" w_spacing = f"{{{ns['w']}}}spacing" w_r = f"{{{ns['w']}}}r" w_br = f"{{{ns['w']}}}br" w_tbl = f"{{{ns['w']}}}tbl" w_val = f"{{{ns['w']}}}val" w_before = f"{{{ns['w']}}}before" w_after = f"{{{ns['w']}}}after" def _has_target_block_style(elem): if elem is None or elem.tag != w_p: return False style_elems = elem.xpath('./w:pPr/w:pStyle', namespaces=ns) return bool(style_elems and style_elems[0].get(w_val) in {"PL", "EW"}) def _is_empty_paragraph(elem): if elem is None or elem.tag != w_p: return False texts = ''.join(elem.xpath('.//w:t/text()', namespaces=ns)).strip() return texts == '' and not elem.xpath('.//w:drawing', namespaces=ns) code_break_paras = 0 table_break_paras = 0 for elem in list(body): # After PL/EW paragraphs: insert one empty paragraph with zero spacing. if _has_target_block_style(elem): nxt = elem.getnext() if _is_empty_paragraph(nxt): continue sep_p = etree.Element(w_p) sep_pPr = etree.Element(w_pPr) sep_spacing = etree.Element(w_spacing) sep_spacing.set(w_before, "0") sep_spacing.set(w_after, "0") sep_pPr.append(sep_spacing) sep_p.append(sep_pPr) body.insert(body.index(elem) + 1, sep_p) code_break_paras += 1 continue # After tables: insert one empty paragraph with zero spacing. if elem.tag == w_tbl: nxt = elem.getnext() if _is_empty_paragraph(nxt): continue sep_p = etree.Element(w_p) sep_pPr = etree.Element(w_pPr) sep_spacing = etree.Element(w_spacing) sep_spacing.set(w_before, "0") sep_spacing.set(w_after, "0") sep_pPr.append(sep_spacing) sep_p.append(sep_pPr) body.insert(body.index(elem) + 1, sep_p) table_break_paras += 1 print( f'Inserted minimal separators: post-code/example paragraphs={code_break_paras}, ' f'post-table paragraphs={table_break_paras}' ) xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes") tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx") os.close(tmp_fd) try: with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout: for item in zin.infolist(): if item.filename != "word/document.xml": data = zin.read(item.filename) zout.writestr(item.filename, data) zout.writestr("word/document.xml", xml_data) shutil.move(tmp_path, docx_output) os.chmod(docx_output, 0o644) finally: if os.path.exists(tmp_path): os.remove(tmp_path) def update_equation_style(docx_input, docx_output): """ Updates equation styles from 'Equation' to 'EQ' in a DOCX file. Loading Loading @@ -2539,6 +2638,7 @@ def update_format_styles_cli(): add_no_break_hyphens(args.docx_input, args.docx_output) update_references_style(args.docx_input, args.docx_output) update_source_code_style(args.docx_input, args.docx_output) add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output) update_equation_style(args.docx_input, args.docx_output) correct_quotes_docx(args.docx_input, args.docx_output) Loading Loading
generateBaseline/pandocFilter.py +21 −10 Original line number Diff line number Diff line Loading @@ -267,8 +267,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li _taskID = progress.add_task('[blue]Inserting line after table or code block', total=0) # progress.update() matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) # Also support quoted code fences in examples, e.g. "> ```sparql" matchCodefenceStart = re.compile(r'^\s*(?:>\s*)*```\s?.*', re.IGNORECASE) matchCodefenceEnd = re.compile(r'^\s*(?:>\s*)*```\s?$', re.IGNORECASE) matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) Loading @@ -291,13 +292,20 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li # Note, that longer codefences are allowed by the markdown specification. if matchCodefenceStart.match(line) and not inCodefence: # For quoted fences (e.g. "> ```"), add an empty quoted line before # the fence so the previous quoted text is not absorbed as code. if line.lstrip().startswith('>'): qprefix = re.match(r'^(\s*(?:>\s*)+)', line) if qprefix: _lines.append(f'{qprefix.group(1)}\n') inCodefence = True _lines.append(line) continue if matchCodefenceEnd.match(line): inCodefence = False _lines.append(line) _lines.append("<br />") # insert a blank line after the code block _lines.append("\n") _lines.append("<br />") continue if inCodefence: _lines.append(line) Loading @@ -320,9 +328,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li inTable = False tableHasSeparator = False # Mark the previous line as the last row in the table _lines.append("<br />") # insert a blank line after the table continue # continue with other matches _lines.append("\n") _lines.append("<br />") # Continue processing the current line (first line after table). #Detect grid tables and convert them to html table if matchGridTable.match(line) and not inGridTable: Loading @@ -344,8 +352,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li else: inGridTable = False #processGridTable() _lines.append("<br />") # insert a blank line after the grid table continue _lines.append("\n") _lines.append("<br />") # Continue processing the current line (first line after grid table). # continue with other matches # Detect notes Loading @@ -358,7 +367,8 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li if inNote and line == "": inNote = False _lines.append(line) _lines.append("<br />") # insert a blank line after the note _lines.append("\n") _lines.append("<br />") continue if inNote: _lines.append(line) Loading Loading @@ -408,6 +418,7 @@ def process(args) -> None: mdLines = correctTableSeparators(progress, mdLines) mdLines = instertLineBeforeStartOfList(progress, mdLines) mdLines = replaceBulletedLettersLists(progress, mdLines) # Spacing after code/table blocks is now handled in DOCX postprocessing. # mdLines = instertLineAfterTableOrCodeBlock(progress, mdLines) mdLines = replaceLineBreaks(progress, mdLines) writeMDFile(progress, mdLines, args.document, args.outDirectory) Loading
generateBaseline/postprocessing.py +100 −0 Original line number Diff line number Diff line Loading @@ -2376,6 +2376,105 @@ def update_source_code_style(docx_input, docx_output): if os.path.exists(tmp_path): os.remove(tmp_path) def add_break_after_code_blocks_and_tables(docx_input, docx_output): """ Insert minimal separators after source-code/example paragraphs (styles 'PL'/'EW') and after tables to improve separation from following body text in Word. """ ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} with zipfile.ZipFile(docx_input, 'r') as zin: xml_data = zin.read("word/document.xml") root = etree.fromstring(xml_data) body = root.find(f".//{{{ns['w']}}}body") if body is None: return w_p = f"{{{ns['w']}}}p" w_pPr = f"{{{ns['w']}}}pPr" w_spacing = f"{{{ns['w']}}}spacing" w_r = f"{{{ns['w']}}}r" w_br = f"{{{ns['w']}}}br" w_tbl = f"{{{ns['w']}}}tbl" w_val = f"{{{ns['w']}}}val" w_before = f"{{{ns['w']}}}before" w_after = f"{{{ns['w']}}}after" def _has_target_block_style(elem): if elem is None or elem.tag != w_p: return False style_elems = elem.xpath('./w:pPr/w:pStyle', namespaces=ns) return bool(style_elems and style_elems[0].get(w_val) in {"PL", "EW"}) def _is_empty_paragraph(elem): if elem is None or elem.tag != w_p: return False texts = ''.join(elem.xpath('.//w:t/text()', namespaces=ns)).strip() return texts == '' and not elem.xpath('.//w:drawing', namespaces=ns) code_break_paras = 0 table_break_paras = 0 for elem in list(body): # After PL/EW paragraphs: insert one empty paragraph with zero spacing. if _has_target_block_style(elem): nxt = elem.getnext() if _is_empty_paragraph(nxt): continue sep_p = etree.Element(w_p) sep_pPr = etree.Element(w_pPr) sep_spacing = etree.Element(w_spacing) sep_spacing.set(w_before, "0") sep_spacing.set(w_after, "0") sep_pPr.append(sep_spacing) sep_p.append(sep_pPr) body.insert(body.index(elem) + 1, sep_p) code_break_paras += 1 continue # After tables: insert one empty paragraph with zero spacing. if elem.tag == w_tbl: nxt = elem.getnext() if _is_empty_paragraph(nxt): continue sep_p = etree.Element(w_p) sep_pPr = etree.Element(w_pPr) sep_spacing = etree.Element(w_spacing) sep_spacing.set(w_before, "0") sep_spacing.set(w_after, "0") sep_pPr.append(sep_spacing) sep_p.append(sep_pPr) body.insert(body.index(elem) + 1, sep_p) table_break_paras += 1 print( f'Inserted minimal separators: post-code/example paragraphs={code_break_paras}, ' f'post-table paragraphs={table_break_paras}' ) xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes") tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx") os.close(tmp_fd) try: with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout: for item in zin.infolist(): if item.filename != "word/document.xml": data = zin.read(item.filename) zout.writestr(item.filename, data) zout.writestr("word/document.xml", xml_data) shutil.move(tmp_path, docx_output) os.chmod(docx_output, 0o644) finally: if os.path.exists(tmp_path): os.remove(tmp_path) def update_equation_style(docx_input, docx_output): """ Updates equation styles from 'Equation' to 'EQ' in a DOCX file. Loading Loading @@ -2539,6 +2638,7 @@ def update_format_styles_cli(): add_no_break_hyphens(args.docx_input, args.docx_output) update_references_style(args.docx_input, args.docx_output) update_source_code_style(args.docx_input, args.docx_output) add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output) update_equation_style(args.docx_input, args.docx_output) correct_quotes_docx(args.docx_input, args.docx_output) Loading