Commit 56d86df3 authored by Miguel Angel Reina Ortega
Browse files

Add the insertLineAfterTableOrCodeBlock function

parent 088a94e4
Loading
Loading
Loading
Loading
Loading
+21 −10
Original line number Diff line number Diff line
@@ -267,8 +267,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
	_taskID = progress.add_task('[blue]Inserting line after table or code block', total=0)
	# progress.update()

	matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
	matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
	# Also support quoted code fences in examples, e.g. "> ```sparql"
	matchCodefenceStart = re.compile(r'^\s*(?:>\s*)*```\s?.*', re.IGNORECASE)
	matchCodefenceEnd = re.compile(r'^\s*(?:>\s*)*```\s?$', re.IGNORECASE)
	matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
	matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
	matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
@@ -291,13 +292,20 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
		# Note, that longer codefences are allowed by the markdown specification.
  
		if matchCodefenceStart.match(line) and not inCodefence:
			# For quoted fences (e.g. "> ```"), add an empty quoted line before
			# the fence so the previous quoted text is not absorbed as code.
			if line.lstrip().startswith('>'):
				qprefix = re.match(r'^(\s*(?:>\s*)+)', line)
				if qprefix:
					_lines.append(f'{qprefix.group(1)}\n')
			inCodefence = True
			_lines.append(line)
			continue
		if matchCodefenceEnd.match(line):
			inCodefence = False
			_lines.append(line)
			_lines.append("<br />") # insert a blank line after the code block
			_lines.append("\n")
			_lines.append("<br />")
			continue
		if inCodefence:
			_lines.append(line)
@@ -320,9 +328,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
				inTable = False
				tableHasSeparator = False
				# Mark the previous line as the last row in the table
				_lines.append("<br />") # insert a blank line after the table
				continue
				# continue with other matches
				_lines.append("\n")
				_lines.append("<br />")
				# Continue processing the current line (first line after table).

		#Detect grid tables and convert them to html table
		if matchGridTable.match(line) and not inGridTable:
@@ -344,8 +352,9 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
			else:
				inGridTable = False
				#processGridTable()
				_lines.append("<br />") # insert a blank line after the grid table
				continue
				_lines.append("\n")
				_lines.append("<br />")
				# Continue processing the current line (first line after grid table).
		# continue with other matches

		# Detect notes
@@ -358,7 +367,8 @@ def instertLineAfterTableOrCodeBlock(progress:Progress, mdLines:list[str]) -> li
		if inNote and line == "":
			inNote = False
			_lines.append(line)
			_lines.append("<br />") # insert a blank line after the note
			_lines.append("\n")
			_lines.append("<br />")
			continue
		if inNote:
			_lines.append(line)
@@ -408,6 +418,7 @@ def process(args) -> None:
			mdLines = correctTableSeparators(progress, mdLines)
		mdLines = instertLineBeforeStartOfList(progress, mdLines)
		mdLines = replaceBulletedLettersLists(progress, mdLines)
		# Spacing after code/table blocks is now handled in DOCX postprocessing.
		# mdLines = instertLineAfterTableOrCodeBlock(progress, mdLines)
		mdLines = replaceLineBreaks(progress, mdLines)
		writeMDFile(progress, mdLines, args.document, args.outDirectory)
+100 −0
Original line number Diff line number Diff line
@@ -2376,6 +2376,105 @@ def update_source_code_style(docx_input, docx_output):
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def add_break_after_code_blocks_and_tables(docx_input, docx_output):
    """
    Insert an empty separator paragraph after code/example paragraphs and tables.

    Every direct child of the document body that is either a paragraph with
    style 'PL' or 'EW', or a table, gets one empty paragraph (spacing
    before/after set to "0") inserted right after it. Elements that are already
    followed by an empty paragraph (no text and no drawing) are left alone.

    The result is written to a temporary file first and then moved to
    ``docx_output``; all other archive members are copied unchanged.

    Args:
        docx_input: Path of the DOCX file to read.
        docx_output: Path of the DOCX file to write (may equal ``docx_input``).

    Returns:
        None. If ``word/document.xml`` has no body element the function returns
        early and ``docx_output`` is not written.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    body = root.find(f".//{{{ns['w']}}}body")
    if body is None:
        # Nothing to process; note that docx_output is not produced here.
        return

    # Fully qualified (Clark notation) names of the WordprocessingML
    # elements and attributes used below.
    w_p = f"{{{ns['w']}}}p"
    w_pPr = f"{{{ns['w']}}}pPr"
    w_spacing = f"{{{ns['w']}}}spacing"
    w_tbl = f"{{{ns['w']}}}tbl"
    w_val = f"{{{ns['w']}}}val"
    w_before = f"{{{ns['w']}}}before"
    w_after = f"{{{ns['w']}}}after"

    def _has_target_block_style(elem):
        """Return True if elem is a paragraph styled 'PL' or 'EW'."""
        if elem is None or elem.tag != w_p:
            return False
        style_elems = elem.xpath('./w:pPr/w:pStyle', namespaces=ns)
        return bool(style_elems and style_elems[0].get(w_val) in {"PL", "EW"})

    def _is_empty_paragraph(elem):
        """Return True if elem is a paragraph without text and without drawings."""
        if elem is None or elem.tag != w_p:
            return False
        texts = ''.join(elem.xpath('.//w:t/text()', namespaces=ns)).strip()
        return texts == '' and not elem.xpath('.//w:drawing', namespaces=ns)

    def _new_separator_paragraph():
        """Build an empty paragraph with zero spacing before and after."""
        sep_p = etree.Element(w_p)
        sep_pPr = etree.Element(w_pPr)
        sep_spacing = etree.Element(w_spacing)
        sep_spacing.set(w_before, "0")
        sep_spacing.set(w_after, "0")
        sep_pPr.append(sep_spacing)
        sep_p.append(sep_pPr)
        return sep_p

    code_break_paras = 0
    table_break_paras = 0
    # Iterate over a snapshot because separators are inserted into the body
    # while walking it.
    for elem in list(body):
        is_code_paragraph = _has_target_block_style(elem)
        if not is_code_paragraph and elem.tag != w_tbl:
            continue

        # Do not stack separators: skip if an empty paragraph already follows.
        if _is_empty_paragraph(elem.getnext()):
            continue

        # addnext() places the separator as the immediately following sibling
        # without having to look up the element's index in the body first.
        elem.addnext(_new_separator_paragraph())
        if is_code_paragraph:
            code_break_paras += 1
        else:
            table_break_paras += 1

    print(
        f'Inserted minimal separators: post-code/example paragraphs={code_break_paras}, '
        f'post-table paragraphs={table_break_paras}'
    )

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Write to a temporary archive first so that docx_input can safely be the
    # same path as docx_output.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)

    try:
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            # Copy every member except the document part, then add the
            # modified document part.
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)

        shutil.move(tmp_path, docx_output)
        os.chmod(docx_output, 0o644)
    finally:
        # Only still present if something failed before the move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_equation_style(docx_input, docx_output):
    """
    Updates equation styles from 'Equation' to 'EQ' in a DOCX file.
@@ -2539,6 +2638,7 @@ def update_format_styles_cli():
    add_no_break_hyphens(args.docx_input, args.docx_output)
    update_references_style(args.docx_input, args.docx_output)
    update_source_code_style(args.docx_input, args.docx_output)
    add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output)
    update_equation_style(args.docx_input, args.docx_output)
    correct_quotes_docx(args.docx_input, args.docx_output)