From 757d182cc3aea7dfeb6afd6b2b0788f330400363 Mon Sep 17 00:00:00 2001 From: Naum Spaseski Date: Mon, 15 Dec 2025 16:05:43 +0100 Subject: [PATCH 1/2] URI and other corrections, mostly inspired from NFV docs --- spec2md.py | 327 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 318 insertions(+), 9 deletions(-) diff --git a/spec2md.py b/spec2md.py index c2f9d58..0d8da69 100644 --- a/spec2md.py +++ b/spec2md.py @@ -153,7 +153,8 @@ class DocumentConfiguration(object): self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False) self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True) self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True) - + self.pandocTableWarnings = config.getboolean('general', 'pandocTableWarnings', fallback = True) + self.escapeQuotations = config.getboolean('general', 'escapeQuotations', fallback = False) # Paragraphs self.paragraphs = { c : config.getlist('paragraphs', c) # type: ignore [attr-defined] for c in config['paragraphs'] } @@ -248,6 +249,7 @@ def processDocuments(documents:list[str], headers:list[Tuple[int, str]] = [] emfFiles:list[str] = [] referencedImages:list[str] = [] + lastListIndent:Dict[str, int] = {} footnotes:dict[str, str] = {} global _print @@ -356,6 +358,50 @@ def processDocuments(documents:list[str], ) newParagraphs = 0 + def _isURILike(text:str) -> bool: + """ Check if text appears to be a URI or code-like structure. + """ + if not text: + return False + # Check for URI patterns: contains slashes, curly braces, or common URI components + uri_indicators = ['/', '{', '}', 'apiRoot', 'apiMajorVersion', 'http', '://'] + # Also check for variable patterns like {variableName} + has_curly_braces = '{' in text and '}' in text + has_slashes = '/' in text + has_uri_keywords = any(indicator.lower() in text.lower() for indicator in uri_indicators) + return has_curly_braces or (has_slashes and has_uri_keywords) + + def _isAPIReference(text:str) -> bool: + """ Check if text appears to be an API reference (e.g., VnfInstances.Post.201, VnfInstancesPostRequest). + """ + if not text: + return False + # Pattern: ComponentName.Method.StatusCode (e.g., VnfInstances.Post.201) + if re.match(r'^[A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)\.\d+$', text.strip()): + return True + # Pattern: ComponentNameMethodRequest/Response (e.g., VnfInstancesPostRequest) + if re.match(r'^[A-Z][a-zA-Z0-9]+(Post|Get|Put|Delete|Patch|Head|Options)(Request|Response)$', text.strip()): + return True + # Pattern: ComponentName with method (e.g., VnfInstances.Post) + if re.match(r'^[A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)$', text.strip()): + return True + return False + + def _getRunFormatting(runElement:ET.Element) -> Tuple[bool, bool]: + """ Extract bold and italic formatting from a run element. + """ + bold = False + italic = False + for e in runElement: + if strippedTag(e.tag) == 'rPr': + for ep in e: + match strippedTag(ep.tag): + case 'b' if ep.attrib.get(_val, 'true') == 'true': + bold = True + case 'i' if ep.attrib.get(_val, 'true') == 'true': + italic = True + return (bold, italic) + def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str: """ Recursively parse a document paragraph. """ @@ -365,12 +411,101 @@ def processDocuments(documents:list[str], tag = strippedTag(element.tag) # remove namespaces for easier handlings match tag: case 'p': - for x in element: - _result += _parseXML(x, inCell) + # Collect all runs first to merge consecutive runs with same formatting + current_run_text = '' + current_bold = False + current_italic = False + + def _outputCurrentRun() -> str: + """ Output the current accumulated run with proper formatting. + """ + nonlocal current_run_text, current_bold, current_italic + if not current_run_text: + return '' + + # Determine if this looks like a URI or API reference + is_uri = _isURILike(current_run_text) + is_api_ref = _isAPIReference(current_run_text) + + # Process text + _s = str(toMD(current_run_text)) + # Don't strip whitespace for URIs or if not bold/italic + if not is_uri and (current_bold or current_italic): + _s = _s.strip() + + # For URIs and API references that are bold, use inline code instead + if (is_uri or is_api_ref) and current_bold: + # Use inline code instead of bold for URIs and API references + # Replace single * or _ (but not needed in code blocks) + _s = _s.replace('`', '\\`') # Escape backticks + result = f'`{_s}` ' + else: + # Replace single * or _ + _s = _s.replace('_', '\\_') + _s = _s.replace('*', '\\*') + if docConfig.escapeQuotations: + _s = _s.replace('"', '\\"') # Escape double quotes + + # Apply formatting + _bold_marker = '**' if current_bold else '' + _italic_marker = '_' if current_italic else '' + + # Add trailing space only if formatting is applied and not URI-like + _postfix = ' ' if (current_bold or current_italic) and not is_uri else '' + result = f'{_bold_marker}{_italic_marker}{_s}{_italic_marker}{_bold_marker}{_postfix}' + + # Reset current run + current_run_text = '' + current_bold = False + current_italic = False + + return result + + for child in element: + child_tag = strippedTag(child.tag) + if child_tag == 'r': + # Get formatting for this run + run_bold, run_italic = _getRunFormatting(child) + # Extract text from this run + run_text = '' + for text_elem in child: + if strippedTag(text_elem.tag) == 't': + run_text += str(text_elem.text) if text_elem.text else '' + elif strippedTag(text_elem.tag) == 'br': + run_text += _linebreak + elif strippedTag(text_elem.tag) == 'tab': + run_text += ' ' + # Handle other inline elements recursively + else: + run_text += _parseXML(text_elem, inCell) + + # Merge with previous run if formatting matches + if run_text and (run_bold == current_bold and run_italic == current_italic and current_run_text): + current_run_text += run_text + else: + # Output previous run if any + if current_run_text: + _result += _outputCurrentRun() + # Start new run + current_run_text = run_text + current_bold = run_bold + current_italic = run_italic + else: + # Non-run element - output current run first, then process the element + if current_run_text: + _result += _outputCurrentRun() + _result += _parseXML(child, inCell) + + # Output last run if any + if current_run_text: + _result += _outputCurrentRun() + case 'r': + # For runs not in paragraphs (shouldn't happen normally, but handle gracefully) for x in element: _result += _parseXML(x, inCell) case 't': + # Fallback for text elements not processed in paragraph merge _bold = '' _italics = '' for e in element.getparent(): @@ -384,13 +519,18 @@ def processDocuments(documents:list[str], # case _: # _print(f'[yellow]unsupported style: {ep.tag}') - # Strip white spaces if bold or italics - _s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text))) + _text = str(toMD(str(element.text))) if element.text else '' + is_uri = _isURILike(_text) + + # Strip white spaces if bold or italics, but not for URIs + _s = _text.strip() if (_bold or _italics) and not is_uri else _text # Replace single * or _ _s = _s.replace('_', '\\_') _s = _s.replace('*', '\\*') - # Add trailing white space when bold or italics - _postfix = ' ' if _bold or _italics else '' + if docConfig.escapeQuotations: + _s = _s.replace('"', '\\"') # Escape double quotes + # Add trailing white space when bold or italics, but not for URIs + _postfix = ' ' if (_bold or _italics) and not is_uri else '' _result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}' # print(_result) @@ -681,23 +821,128 @@ def processDocuments(documents:list[str], elif style in docConfig.ul1: checkSameStyle(Style.unorderedlist, lambda:lines.append('')) if len(elem.text): # ignore empty + if (match := re.match(r'^(\s*\d+\.\s+)', text)): + # Numbered item: capture indentation length + lastListIndent['ul1'] = len(match.group(1)) + lines.append(text) + elif re.match(r'^\[[\w\.]+\]', text): + # Reference + lines.append('') + lines.append(text) + elif text.startswith(' '): + # Continuation: use captured indentation + lines.append('') + indent = lastListIndent.get('ul1', 4) + lines.append(' ' * indent + text.lstrip()) + elif text.lstrip().startswith(('-', '*')): + # Explicit bullet in text + lastListIndent['ul1'] = 4 + lines.append(f'- {text.lstrip("- *").strip()}') + elif 'bullet' in style or style.startswith('b'): + # Style implies bullet + lastListIndent['ul1'] = 4 lines.append(f'- {text}') + else: + # Fallback: treat as continuation paragraph (e.g. List Paragraph without number/bullet) + lines.append('') + indent = lastListIndent.get('ul1', 4) + lines.append(' ' * indent + text.lstrip()) + elif style in docConfig.ul2: checkSameStyle(Style.unorderedlist2, lambda:lines.append('')) if len(elem.text): # ignore empty + if (match := re.match(r'^(\s*\d+\.\s+)', text)): + lastListIndent['ul2'] = len(match.group(1)) + lines.append(f'{" "*1}{text}') + elif re.match(r'^\[[\w\.]+\]', text): + lines.append('') + lines.append(f'{" "*1}{text}') + elif text.startswith(' '): + lines.append('') + indent = lastListIndent.get('ul2', 4) + lines.append(f'{" "*1}{" " * indent}{text.lstrip()}') + elif text.lstrip().startswith(('-', '*')): + lastListIndent['ul2'] = 4 + lines.append(f'{" "*1}- {text.lstrip("- *").strip()}') + elif 'bullet' in style or style.startswith('b'): + lastListIndent['ul2'] = 4 lines.append(f'{" "*1}- {text}') + else: + lines.append('') + indent = lastListIndent.get('ul2', 4) + lines.append(f'{" "*1}{" " * indent}{text.lstrip()}') + elif style in docConfig.ul3: checkSameStyle(Style.unorderedlist3, lambda:lines.append('')) if len(elem.text): # ignore empty + if (match := re.match(r'^(\s*\d+\.\s+)', text)): + lastListIndent['ul3'] = len(match.group(1)) + lines.append(f'{" "*2}{text}') + elif re.match(r'^\[[\w\.]+\]', text): + lines.append('') + lines.append(f'{" "*2}{text}') + elif text.startswith(' '): + lines.append('') + indent = lastListIndent.get('ul3', 4) + lines.append(f'{" "*2}{" " * indent}{text.lstrip()}') + elif text.lstrip().startswith(('-', '*')): + lastListIndent['ul3'] = 4 + lines.append(f'{" "*2}- {text.lstrip("- *").strip()}') + elif 'bullet' in style or style.startswith('b'): + lastListIndent['ul3'] = 4 lines.append(f'{" "*2}- {text}') + else: + lines.append('') + indent = lastListIndent.get('ul3', 4) + lines.append(f'{" "*2}{" " * indent}{text.lstrip()}') + elif style in docConfig.ul4: checkSameStyle(Style.unorderedlist4, lambda:lines.append('')) if len(elem.text): # ignore empty + if (match := re.match(r'^(\s*\d+\.\s+)', text)): + lastListIndent['ul4'] = len(match.group(1)) + lines.append(f'{" "*3}{text}') + elif re.match(r'^\[[\w\.]+\]', text): + lines.append('') + lines.append(f'{" "*3}{text}') + elif text.startswith(' '): + lines.append('') + indent = lastListIndent.get('ul4', 4) + lines.append(f'{" "*3}{" " * indent}{text.lstrip()}') + elif text.lstrip().startswith(('-', '*')): + lastListIndent['ul4'] = 4 + lines.append(f'{" "*3}- {text.lstrip("- *").strip()}') + elif 'bullet' in style or style.startswith('b'): + lastListIndent['ul4'] = 4 lines.append(f'{" "*3}- {text}') + else: + lines.append('') + indent = lastListIndent.get('ul4', 4) + lines.append(f'{" "*3}{" " * indent}{text.lstrip()}') + elif style in docConfig.ul5: checkSameStyle(Style.unorderedlist5, lambda:lines.append('')) if len(elem.text): # ignore empty + if (match := re.match(r'^(\s*\d+\.\s+)', text)): + lastListIndent['ul5'] = len(match.group(1)) + lines.append(f'{" "*4}{text}') + elif re.match(r'^\[[\w\.]+\]', text): + lines.append('') + lines.append(f'{" "*4}{text}') + elif text.startswith(' '): + lines.append('') + indent = lastListIndent.get('ul5', 4) + lines.append(f'{" "*4}{" " * indent}{text.lstrip()}') + elif text.lstrip().startswith(('-', '*')): + lastListIndent['ul5'] = 4 + lines.append(f'{" "*4}- {text.lstrip("- *").strip()}') + elif 'bullet' in style or style.startswith('b'): + lastListIndent['ul5'] = 4 lines.append(f'{" "*4}- {text}') + else: + lines.append('') + indent = lastListIndent.get('ul5', 4) + lines.append(f'{" "*4}{" " * indent}{text.lstrip()}') # Table Caption elif style in docConfig.tablecaption: @@ -739,6 +984,11 @@ def processDocuments(documents:list[str], # Example elif style in docConfig.example: checkSameStyle(Style.example, lambda:lines.append('')) + + # Special handling for references to ensure paragraph separation + if re.match(r'^\s*\[[\w\.]+\]', text): + lines.append('') + # Replace linebreaks for _t in text.split(_linebreak): lines.append(f'`{_t if _t else " "}` ') # at least an empty space. And 2 spaces at the end for newline @@ -883,12 +1133,71 @@ def processDocuments(documents:list[str], progress.update(processTask, advance = 1) # progress update for i in range(len(lines)): line = lines[i] + + # Remove empty formatting markers line = line.replace('__', '') line = line.replace('****', '') line = line.replace('** ', '** ') line = line.replace('_ ', '_ ') - line = line.replace('** **', ' ') - #line = line.replace(' ', ' ') + + # Merge consecutive bold markers: **text1****text2** -> **text1 text2** + # This handles cases where formatting was split across runs + # Pattern: **text1****text2** (four asterisks between text) + line = re.sub(r'\*\*([^*]+)\*\*\*\*([^*]+)\*\*', r'**\1 \2**', line) + # Pattern: **text1**** (trailing four asterisks) + line = re.sub(r'\*\*([^*]+)\*\*\*\*(\s|$)', r'**\1**\2', line) + # Pattern: ****text1** (leading four asterisks) + line = re.sub(r'(\s|^)\*\*\*\*([^*]+)\*\*', r'\1**\2**', line) + # Pattern: **text** (trailing double asterisks not part of a pair) + # Remove trailing ** that's not part of a valid bold pair + line = re.sub(r'\*\*([^*\s]+)\*\*(\*\*)(\s|$)', r'**\1**\3', line) + + # Remove any remaining sequences of 4+ asterisks (shouldn't happen after merge, but safety) + line = re.sub(r'\*{4,}', '**', line) + + lines[i] = line + + # + # Post-process to convert bold URIs and API references to inline code + # This catches any cases that weren't handled during initial parsing + # + progress.update(processTask, advance = 1) # progress update + in_code_block = False + for i in range(len(lines)): + line = lines[i] + stripped = line.strip() + + # Track code block state + if stripped.startswith('```'): + in_code_block = not in_code_block + continue + + # Skip processing if inside a code block + if in_code_block: + continue + + # Pattern 1: Bold URIs - **{apiRoot}/path/to/resource** -> `{apiRoot}/path/to/resource` + # Matches URIs with slashes and optional curly braces + line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+\})\*\*', r'`\1`', line) + line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+)\*\*', r'`\1`', line) + + # Pattern 2: API references like VnfInstances.Post.201 + line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)\.\d+)\*\*', r'`\1`', line) + + # Pattern 3: API component names like VnfInstancesPostRequest + line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+(Post|Get|Put|Delete|Patch|Head|Options)(Request|Response))\*\*', r'`\1`', line) + + # Pattern 4: API references like VnfInstances.Post (without status code) + line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options))\*\*', r'`\1`', line) + + # Pattern 5: Paths ending with ** (e.g., /path/to/resource**:) + # This handles cases where the colon is outside the bold markers + line = re.sub(r'(\S+)\*\*:', r'\1:', line) + + # Pattern 6: References with ** in the middle (e.g., VnfInstances**.Post.201) + line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*\.', r'\1.', line) + line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*([A-Z])', r'\1\2', line) + lines[i] = line -- GitLab From 5c2c277163b98d79cb48b34b59ffb3a3c55b83fc Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega Date: Thu, 18 Dec 2025 13:38:46 +0100 Subject: [PATCH 2/2] Fix --- spec2md.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spec2md.py b/spec2md.py index 0d8da69..7d7536c 100644 --- a/spec2md.py +++ b/spec2md.py @@ -841,7 +841,7 @@ def processDocuments(documents:list[str], elif 'bullet' in style or style.startswith('b'): # Style implies bullet lastListIndent['ul1'] = 4 - lines.append(f'- {text}') + lines.append(f'- {text}') else: # Fallback: treat as continuation paragraph (e.g. List Paragraph without number/bullet) lines.append('') @@ -866,7 +866,7 @@ def processDocuments(documents:list[str], lines.append(f'{" "*1}- {text.lstrip("- *").strip()}') elif 'bullet' in style or style.startswith('b'): lastListIndent['ul2'] = 4 - lines.append(f'{" "*1}- {text}') + lines.append(f'{" "*1}- {text}') else: lines.append('') indent = lastListIndent.get('ul2', 4) @@ -890,7 +890,7 @@ def processDocuments(documents:list[str], lines.append(f'{" "*2}- {text.lstrip("- *").strip()}') elif 'bullet' in style or style.startswith('b'): lastListIndent['ul3'] = 4 - lines.append(f'{" "*2}- {text}') + lines.append(f'{" "*2}- {text}') else: lines.append('') indent = lastListIndent.get('ul3', 4) @@ -914,7 +914,7 @@ def processDocuments(documents:list[str], lines.append(f'{" "*3}- {text.lstrip("- *").strip()}') elif 'bullet' in style or style.startswith('b'): lastListIndent['ul4'] = 4 - lines.append(f'{" "*3}- {text}') + lines.append(f'{" "*3}- {text}') else: lines.append('') indent = lastListIndent.get('ul4', 4) @@ -938,7 +938,7 @@ def processDocuments(documents:list[str], lines.append(f'{" "*4}- {text.lstrip("- *").strip()}') elif 'bullet' in style or style.startswith('b'): lastListIndent['ul5'] = 4 - lines.append(f'{" "*4}- {text}') + lines.append(f'{" "*4}- {text}') else: lines.append('') indent = lastListIndent.get('ul5', 4) -- GitLab