Commit 212ec405 authored by Naum Spaseski's avatar Naum Spaseski
Browse files

URI and other corrections, mostly inspired from NFV docs

parent 01d67db5
Loading
Loading
Loading
Loading
+353 −16
Original line number Diff line number Diff line
@@ -154,7 +154,8 @@ class DocumentConfiguration(object):
		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
		self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True)
		self.frontmatterHeading = config.get('general', 'frontmatterHeading', fallback = None)
		
		self.pandocTableWarnings = config.getboolean('general', 'pandocTableWarnings', fallback = True)
		self.escapeQuotations = config.getboolean('general', 'escapeQuotations', fallback = False)
		# Frontmatter - can be specified as multiline string or file path
		# Note: ConfigParser supports multiline values if subsequent lines are indented
		frontmatterConfig = config.get('general', 'frontmatter', fallback = None)
@@ -272,6 +273,7 @@ def processDocuments(documents:list[str],
	headers:list[Tuple[int, str]]									= []
	emfFiles:list[str]												= []
	referencedImages:list[str]										= []
	lastListIndent:Dict[str, int]									= {}
	footnotes:dict[str, str]										= {}

	global _print
@@ -380,6 +382,50 @@ def processDocuments(documents:list[str],
			)
			newParagraphs = 0

			def _isURILike(text:str) -> bool:
				"""	Check if text appears to be a URI, a path or a URI template.

					Text counts as URI-like when it either

					- contains both an opening and a closing curly brace, as in a
					  `{variableName}` placeholder, or
					- contains at least one forward slash.

					Args:
						text: Plain text of a run. May be empty.

					Return:
						True if the text is URI-like, False otherwise (including for
						empty text).
				"""
				if not text:
					return False
				# NOTE: Keywords such as 'apiRoot' or 'http' are not consulted here.
				# Any text containing a slash already qualifies, so a keyword test on
				# top of the slash test could never change the outcome.
				has_curly_braces = '{' in text and '}' in text
				return has_curly_braces or '/' in text

			def _isAPIReference(text:str) -> bool:
				"""	Tell whether text is shaped like an API reference.

					Surrounding whitespace is ignored. Recognized shapes:

					- ComponentName.Method.StatusCode	(e.g., VnfInstances.Post.201)
					- ComponentNameMethodRequest/Response	(e.g., VnfInstancesPostRequest)
					- ComponentName.Method				(e.g., VnfInstances.Post)

					Return:
						True if one of the shapes matches the whole text, False otherwise.
				"""
				if not text:
					return False
				candidate = text.strip()
				shapes = (
					r'^[A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)\.\d+$',
					r'^[A-Z][a-zA-Z0-9]+(Post|Get|Put|Delete|Patch|Head|Options)(Request|Response)$',
					r'^[A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)$',
				)
				return any(re.match(shape, candidate) is not None for shape in shapes)

			def _getRunFormatting(runElement:ET.Element) -> Tuple[bool, bool]:
				"""	Extract bold and italic formatting from a run element.
				"""
				bold = False
				italic = False
				for e in runElement:
					if strippedTag(e.tag) == 'rPr':
						for ep in e:
							match strippedTag(ep.tag):
								case 'b' if ep.attrib.get(_val, 'true') == 'true':
									bold = True
								case 'i' if ep.attrib.get(_val, 'true') == 'true':
									italic = True
				return (bold, italic)

			def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
				"""	Recursively parse a document paragraph.
				"""
@@ -389,12 +435,101 @@ def processDocuments(documents:list[str],
				tag = strippedTag(element.tag)	# remove namespaces for easier handlings
				match tag:
					case 'p':
						for x in element:
							_result += _parseXML(x, inCell)
						# Collect all runs first to merge consecutive runs with same formatting
						current_run_text = ''
						current_bold = False
						current_italic = False
						
						def _outputCurrentRun() -> str:
							"""	Output the current accumulated run with proper formatting.
							"""
							nonlocal current_run_text, current_bold, current_italic
							if not current_run_text:
								return ''
							
							# Determine if this looks like a URI or API reference
							is_uri = _isURILike(current_run_text)
							is_api_ref = _isAPIReference(current_run_text)
							
							# Process text
							_s = str(toMD(current_run_text))
							# Don't strip whitespace for URIs or if not bold/italic
							if not is_uri and (current_bold or current_italic):
								_s = _s.strip()
							
							# For URIs and API references that are bold, use inline code instead
							if (is_uri or is_api_ref) and current_bold:
								# Use inline code instead of bold for URIs and API references
								# Replace single * or _ (but not needed in code blocks)
								_s = _s.replace('`', '\\`')  # Escape backticks
								result = f'`{_s}` '
							else:
								# Replace single * or _
								_s = _s.replace('_', '\\_')
								_s = _s.replace('*', '\\*')
								if docConfig.escapeQuotations:
									_s = _s.replace('"', '\\"') # Escape double quotes
								
								# Apply formatting
								_bold_marker = '**' if current_bold else ''
								_italic_marker = '_' if current_italic else ''
								
								# Add trailing space only if formatting is applied and not URI-like
								_postfix = ' ' if (current_bold or current_italic) and not is_uri else ''
								result = f'{_bold_marker}{_italic_marker}{_s}{_italic_marker}{_bold_marker}{_postfix}'
							
							# Reset current run
							current_run_text = ''
							current_bold = False
							current_italic = False
							
							return result
						
						for child in element:
							child_tag = strippedTag(child.tag)
							if child_tag == 'r':
								# Get formatting for this run
								run_bold, run_italic = _getRunFormatting(child)
								# Extract text from this run
								run_text = ''
								for text_elem in child:
									if strippedTag(text_elem.tag) == 't':
										run_text += str(text_elem.text) if text_elem.text else ''
									elif strippedTag(text_elem.tag) == 'br':
										run_text += _linebreak
									elif strippedTag(text_elem.tag) == 'tab':
										run_text += '    '
									# Handle other inline elements recursively
									else:
										run_text += _parseXML(text_elem, inCell)
								
								# Merge with previous run if formatting matches
								if run_text and (run_bold == current_bold and run_italic == current_italic and current_run_text):
									current_run_text += run_text
								else:
									# Output previous run if any
									if current_run_text:
										_result += _outputCurrentRun()
									# Start new run
									current_run_text = run_text
									current_bold = run_bold
									current_italic = run_italic
							else:
								# Non-run element - output current run first, then process the element
								if current_run_text:
									_result += _outputCurrentRun()
								_result += _parseXML(child, inCell)
						
						# Output last run if any
						if current_run_text:
							_result += _outputCurrentRun()
					
					case 'r':
						# For runs not in paragraphs (shouldn't happen normally, but handle gracefully)
						for x in element:
							_result += _parseXML(x, inCell)
					case 't':
						# Fallback for text elements not processed in paragraph merge
						_bold = ''
						_italics = ''
						for e in element.getparent():
@@ -408,13 +543,18 @@ def processDocuments(documents:list[str],
										# case _:
										# 	_print(f'[yellow]unsupported style: {ep.tag}')
						
						# Strip white spaces if bold or italics
						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
						_text = str(toMD(str(element.text))) if element.text else ''
						is_uri = _isURILike(_text)
						
						# Strip white spaces if bold or italics, but not for URIs
						_s = _text.strip() if (_bold or _italics) and not is_uri else _text
						# Replace single * or _
						_s = _s.replace('_', '\\_')
						_s = _s.replace('*', '\\*')
						# Add trailing white space when bold or italics
						_postfix = ' ' if _bold or _italics else ''
						if docConfig.escapeQuotations:
							_s = _s.replace('"', '\\"') # Escape double quotes
						# Add trailing white space when bold or italics, but not for URIs
						_postfix = ' ' if (_bold or _italics) and not is_uri else ''
						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
						# print(_result)

@@ -705,23 +845,128 @@ def processDocuments(documents:list[str],
						elif style in docConfig.ul1:
							checkSameStyle(Style.unorderedlist, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									# Numbered item: capture indentation length
									lastListIndent['ul1'] = len(match.group(1))
									lines.append(text)
								elif re.match(r'^\[[\w\.]+\]', text):
									# Reference
									lines.append('')
									lines.append(text)
								elif text.startswith(' '):
									# Continuation: use captured indentation
									lines.append('')
									indent = lastListIndent.get('ul1', 4)
									lines.append(' ' * indent + text.lstrip())
								elif text.lstrip().startswith(('-', '*')):
									# Explicit bullet in text
									lastListIndent['ul1'] = 4
									lines.append(f'- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									# Style implies bullet
									lastListIndent['ul1'] = 4
									lines.append(f'- {text}')
								else:
									# Fallback: treat as continuation paragraph (e.g. List Paragraph without number/bullet)
									lines.append('')
									indent = lastListIndent.get('ul1', 4)
									lines.append(' ' * indent + text.lstrip())

						elif style in docConfig.ul2:
							checkSameStyle(Style.unorderedlist2, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul2'] = len(match.group(1))
									lines.append(f'{"    "*1}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*1}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul2', 4)
									lines.append(f'{"    "*1}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul2'] = 4
									lines.append(f'{"    "*1}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul2'] = 4
									lines.append(f'{"    "*1}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul2', 4)
									lines.append(f'{"    "*1}{" " * indent}{text.lstrip()}')

						elif style in docConfig.ul3:
							checkSameStyle(Style.unorderedlist3, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul3'] = len(match.group(1))
									lines.append(f'{"    "*2}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*2}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul3', 4)
									lines.append(f'{"    "*2}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul3'] = 4
									lines.append(f'{"    "*2}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul3'] = 4
									lines.append(f'{"    "*2}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul3', 4)
									lines.append(f'{"    "*2}{" " * indent}{text.lstrip()}')

						elif style in docConfig.ul4:
							checkSameStyle(Style.unorderedlist4, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul4'] = len(match.group(1))
									lines.append(f'{"    "*3}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*3}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul4', 4)
									lines.append(f'{"    "*3}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul4'] = 4
									lines.append(f'{"    "*3}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul4'] = 4
									lines.append(f'{"    "*3}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul4', 4)
									lines.append(f'{"    "*3}{" " * indent}{text.lstrip()}')

						elif style in docConfig.ul5:
							checkSameStyle(Style.unorderedlist5, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul5'] = len(match.group(1))
									lines.append(f'{"    "*4}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*4}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul5', 4)
									lines.append(f'{"    "*4}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul5'] = 4
									lines.append(f'{"    "*4}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul5'] = 4
									lines.append(f'{"    "*4}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul5', 4)
									lines.append(f'{"    "*4}{" " * indent}{text.lstrip()}')

						#	Table Caption
						elif style in docConfig.tablecaption:
@@ -763,6 +1008,11 @@ def processDocuments(documents:list[str],
						#	Example
						elif style in docConfig.example:
							checkSameStyle(Style.example, lambda:lines.append(''))

							# Special handling for references to ensure paragraph separation
							if re.match(r'^\s*\[[\w\.]+\]', text):
								lines.append('')

							# Replace linebreaks
							for _t in text.split(_linebreak):
								lines.append(f'`{_t if _t else " "}`  ') # at least an empty space. And 2 spaces at the end for newline
@@ -874,6 +1124,7 @@ def processDocuments(documents:list[str],
							spanType = "colspans" if colSpanDetected else ("rowspans" if rowSpanDetected else "merged cells")
							if colSpanDetected and rowSpanDetected:
								spanType = "colspans and rowspans"
							if docConfig.pandocTableWarnings:
								lines.append(f'<mark>Table with {spanType} converted to grid table. Please check and adjust manually if necessary.</mark>')
							tableLines = markdownToGrid(tableLines)
						
@@ -913,12 +1164,71 @@ def processDocuments(documents:list[str],
			progress.update(processTask, advance = 1)	# progress update
			for i in range(len(lines)):
				line = lines[i]
				
				# Remove empty formatting markers
				line = line.replace('__', '')
				line = line.replace('** **', ' ')
				line = line.replace('**  ', '** ')
				line = line.replace('_  ', '_ ')
				line = line.replace('** **', ' ')
				#line = line.replace('  ', ' ')
				
				# Merge consecutive bold markers: **text1****text2** -> **text1 text2**
				# This handles cases where formatting was split across runs
				# Pattern: **text1****text2** (four asterisks between text)
				line = re.sub(r'\*\*([^*]+)\*\*\*\*([^*]+)\*\*', r'**\1 \2**', line)
				# Pattern: **text1**** (trailing four asterisks)
				line = re.sub(r'\*\*([^*]+)\*\*\*\*(\s|$)', r'**\1**\2', line)
				# Pattern: ****text1** (leading four asterisks)
				line = re.sub(r'(\s|^)\*\*\*\*([^*]+)\*\*', r'\1**\2**', line)
				# Pattern: **text** (trailing double asterisks not part of a pair)
				# Remove trailing ** that's not part of a valid bold pair
				line = re.sub(r'\*\*([^*\s]+)\*\*(\*\*)(\s|$)', r'**\1**\3', line)
				
				# Remove any remaining sequences of 4+ asterisks (shouldn't happen after merge, but safety)
				line = re.sub(r'\*{4,}', '**', line)
				
				lines[i] = line

			#
			#	Post-process to convert bold URIs and API references to inline code
			#	This catches any cases that weren't handled during initial parsing
			#
			#	Every line outside of fenced code blocks is run through a fixed sequence
			#	of regex substitutions. The order matters: Patterns 1-4 turn complete
			#	`**...**` spans into backtick spans, Patterns 5-6 then remove `**`
			#	markers that are still present in the line.
			#
			progress.update(processTask, advance = 1)	# progress update
			in_code_block = False
			for i in range(len(lines)):
				line = lines[i]
				stripped = line.strip()
				
				# Track code block state
				# A line whose stripped form starts with ``` toggles the state. The fence
				# line itself is left untouched.
				if stripped.startswith('```'):
					in_code_block = not in_code_block
					continue
				
				# Skip processing if inside a code block
				if in_code_block:
					continue
				
				# Pattern 1: Bold URIs - **{apiRoot}/path/to/resource** -> `{apiRoot}/path/to/resource`
				# Matches URIs with slashes and optional curly braces
				# The bold text must start with an (optionally brace-wrapped) run of
				# letters, digits or underscores that is directly followed by a slash.
				# Bold text starting with '/' or with a scheme such as 'http://' is
				# therefore not matched here. Leading whitespace after the opening `**`
				# is part of the capture group and ends up inside the backticks.
				# The first substitution only handles spans ending in '}', the second
				# one handles the remaining spans.
				line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+\})\*\*', r'`\1`', line)
				line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+)\*\*', r'`\1`', line)
				
				# Pattern 2: API references like VnfInstances.Post.201
				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)\.\d+)\*\*', r'`\1`', line)
				
				# Pattern 3: API component names like VnfInstancesPostRequest
				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+(Post|Get|Put|Delete|Patch|Head|Options)(Request|Response))\*\*', r'`\1`', line)
				
				# Pattern 4: API references like VnfInstances.Post (without status code)
				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options))\*\*', r'`\1`', line)
				
				# Pattern 5: Paths ending with ** (e.g., /path/to/resource**:)
				# This handles cases where the colon is outside the bold markers
				# NOTE(review): The pattern is not limited to paths. It removes the `**`
				# in front of any colon that follows non-whitespace text, so regular bold
				# text such as `**Note**:` becomes `**Note:` and keeps an unbalanced
				# opening marker. Confirm that this is intended.
				line = re.sub(r'(\S+)\*\*:', r'\1:', line)
				
				# Pattern 6: References with ** in the middle (e.g., VnfInstances**.Post.201)
				# NOTE(review): Both substitutions apply to any capitalized word of two or
				# more characters that is directly followed by `**`, e.g. `**Note**.`
				# becomes `**Note.` and `**Foo**Bar` becomes `**FooBar`. Confirm that
				# ordinary bold text at the end of a sentence should be affected.
				line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*\.', r'\1.', line)
				line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*([A-Z])', r'\1\2', line)
				
				lines[i] = line


@@ -1083,6 +1393,33 @@ def processDocuments(documents:list[str],
					frontmatterLines = docConfig.frontmatter.split('\n')
					lines = frontmatterLines + [''] + lines
			
			#
			#	Handle line breaks in descriptions (convert <br /> to spaces)
			#	Keep <br /> in table cells and in fenced code blocks, but convert it
			#	in regular text.
			#
			in_table = False
			in_fenced_code = False
			for i, line in enumerate(lines):
				stripped = line.strip()

				# Track fenced code blocks. Without this only the fence line itself
				# would be skipped and the code between the fences would be rewritten.
				if stripped.startswith('```'):
					in_fenced_code = not in_fenced_code
					in_table = False	# a fence always ends a table
					continue
				if in_fenced_code:
					continue

				# Detect table context: pipe-table rows start with '|', grid-table
				# borders start with '+'. Empty lines keep the current state, any other
				# non-empty line ends the table.
				if stripped.startswith(('|', '+')):
					in_table = True
				elif stripped:
					in_table = False
				if in_table:
					continue

				# Convert <br /> to spaces in descriptions (for OpenAPI compatibility).
				# Runs of whitespace that result from this are collapsed to one space.
				if '<br />' in line:
					# Keep the leading indentation as it is. It carries the nesting level
					# of list items and continuation paragraphs and must not be collapsed.
					content = line.lstrip()
					indentation = line[:len(line) - len(content)]
					content = re.sub(r'<br\s*/?>', ' ', content)
					content = re.sub(r'\s+', ' ', content)
					line = indentation + content
					lines[i] = line

			#
			#	Clean up redundant spaces and normalize blank lines
			#