Commit 757d182c authored by Naum Spaseski's avatar Naum Spaseski Committed by Miguel Angel Reina Ortega
Browse files

URI and other corrections, mostly inspired by NFV docs

parent c7254b09
Loading
Loading
Loading
Loading
Loading
+318 −9
Original line number Diff line number Diff line
@@ -153,7 +153,8 @@ class DocumentConfiguration(object):
		self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False)
		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
		self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True)

		self.pandocTableWarnings = config.getboolean('general', 'pandocTableWarnings', fallback = True)
		self.escapeQuotations = config.getboolean('general', 'escapeQuotations', fallback = False)
		#	Paragraphs
		self.paragraphs = { c : config.getlist('paragraphs', c)	# type: ignore [attr-defined]
							for c in config['paragraphs'] }
@@ -248,6 +249,7 @@ def processDocuments(documents:list[str],
	headers:list[Tuple[int, str]]									= []
	emfFiles:list[str]												= []
	referencedImages:list[str]										= []
	lastListIndent:Dict[str, int]									= {}
	footnotes:dict[str, str]										= {}

	global _print
@@ -356,6 +358,50 @@ def processDocuments(documents:list[str],
			)
			newParagraphs = 0

			def _isURILike(text:str) -> bool:
				"""	Check if text appears to be a URI or code-like structure.
				"""
				if not text:
					return False
				# Check for URI patterns: contains slashes, curly braces, or common URI components
				uri_indicators = ['/', '{', '}', 'apiRoot', 'apiMajorVersion', 'http', '://']
				# Also check for variable patterns like {variableName}
				has_curly_braces = '{' in text and '}' in text
				has_slashes = '/' in text
				has_uri_keywords = any(indicator.lower() in text.lower() for indicator in uri_indicators)
				return has_curly_braces or (has_slashes and has_uri_keywords)

			def _isAPIReference(text:str) -> bool:
				"""	Check if text appears to be an API reference (e.g., VnfInstances.Post.201, VnfInstancesPostRequest).
				"""
				if not text:
					return False
				# Pattern: ComponentName.Method.StatusCode (e.g., VnfInstances.Post.201)
				if re.match(r'^[A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)\.\d+$', text.strip()):
					return True
				# Pattern: ComponentNameMethodRequest/Response (e.g., VnfInstancesPostRequest)
				if re.match(r'^[A-Z][a-zA-Z0-9]+(Post|Get|Put|Delete|Patch|Head|Options)(Request|Response)$', text.strip()):
					return True
				# Pattern: ComponentName with method (e.g., VnfInstances.Post)
				if re.match(r'^[A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)$', text.strip()):
					return True
				return False

			def _getRunFormatting(runElement:ET.Element) -> Tuple[bool, bool]:
				"""	Extract bold and italic formatting from a run element.
				"""
				bold = False
				italic = False
				for e in runElement:
					if strippedTag(e.tag) == 'rPr':
						for ep in e:
							match strippedTag(ep.tag):
								case 'b' if ep.attrib.get(_val, 'true') == 'true':
									bold = True
								case 'i' if ep.attrib.get(_val, 'true') == 'true':
									italic = True
				return (bold, italic)

			def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
				"""	Recursively parse a document paragraph.
				"""
@@ -365,12 +411,101 @@ def processDocuments(documents:list[str],
				tag = strippedTag(element.tag)	# remove namespaces for easier handlings
				match tag:
					case 'p':
						for x in element:
							_result += _parseXML(x, inCell)
						# Collect all runs first to merge consecutive runs with same formatting
						current_run_text = ''
						current_bold = False
						current_italic = False
						
						def _outputCurrentRun() -> str:
							"""	Output the current accumulated run with proper formatting.
							"""
							nonlocal current_run_text, current_bold, current_italic
							if not current_run_text:
								return ''
							
							# Determine if this looks like a URI or API reference
							is_uri = _isURILike(current_run_text)
							is_api_ref = _isAPIReference(current_run_text)
							
							# Process text
							_s = str(toMD(current_run_text))
							# Don't strip whitespace for URIs or if not bold/italic
							if not is_uri and (current_bold or current_italic):
								_s = _s.strip()
							
							# For URIs and API references that are bold, use inline code instead
							if (is_uri or is_api_ref) and current_bold:
								# Use inline code instead of bold for URIs and API references
								# Replace single * or _ (but not needed in code blocks)
								_s = _s.replace('`', '\\`')  # Escape backticks
								result = f'`{_s}` '
							else:
								# Replace single * or _
								_s = _s.replace('_', '\\_')
								_s = _s.replace('*', '\\*')
								if docConfig.escapeQuotations:
									_s = _s.replace('"', '\\"') # Escape double quotes
								
								# Apply formatting
								_bold_marker = '**' if current_bold else ''
								_italic_marker = '_' if current_italic else ''
								
								# Add trailing space only if formatting is applied and not URI-like
								_postfix = ' ' if (current_bold or current_italic) and not is_uri else ''
								result = f'{_bold_marker}{_italic_marker}{_s}{_italic_marker}{_bold_marker}{_postfix}'
							
							# Reset current run
							current_run_text = ''
							current_bold = False
							current_italic = False
							
							return result
						
						for child in element:
							child_tag = strippedTag(child.tag)
							if child_tag == 'r':
								# Get formatting for this run
								run_bold, run_italic = _getRunFormatting(child)
								# Extract text from this run
								run_text = ''
								for text_elem in child:
									if strippedTag(text_elem.tag) == 't':
										run_text += str(text_elem.text) if text_elem.text else ''
									elif strippedTag(text_elem.tag) == 'br':
										run_text += _linebreak
									elif strippedTag(text_elem.tag) == 'tab':
										run_text += '    '
									# Handle other inline elements recursively
									else:
										run_text += _parseXML(text_elem, inCell)
								
								# Merge with previous run if formatting matches
								if run_text and (run_bold == current_bold and run_italic == current_italic and current_run_text):
									current_run_text += run_text
								else:
									# Output previous run if any
									if current_run_text:
										_result += _outputCurrentRun()
									# Start new run
									current_run_text = run_text
									current_bold = run_bold
									current_italic = run_italic
							else:
								# Non-run element - output current run first, then process the element
								if current_run_text:
									_result += _outputCurrentRun()
								_result += _parseXML(child, inCell)
						
						# Output last run if any
						if current_run_text:
							_result += _outputCurrentRun()
					
					case 'r':
						# For runs not in paragraphs (shouldn't happen normally, but handle gracefully)
						for x in element:
							_result += _parseXML(x, inCell)
					case 't':
						# Fallback for text elements not processed in paragraph merge
						_bold = ''
						_italics = ''
						for e in element.getparent():
@@ -384,13 +519,18 @@ def processDocuments(documents:list[str],
										# case _:
										# 	_print(f'[yellow]unsupported style: {ep.tag}')
						
						# Strip white spaces if bold or italics
						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
						_text = str(toMD(str(element.text))) if element.text else ''
						is_uri = _isURILike(_text)
						
						# Strip white spaces if bold or italics, but not for URIs
						_s = _text.strip() if (_bold or _italics) and not is_uri else _text
						# Replace single * or _
						_s = _s.replace('_', '\\_')
						_s = _s.replace('*', '\\*')
						# Add trailing white space when bold or italics
						_postfix = ' ' if _bold or _italics else ''
						if docConfig.escapeQuotations:
							_s = _s.replace('"', '\\"') # Escape double quotes
						# Add trailing white space when bold or italics, but not for URIs
						_postfix = ' ' if (_bold or _italics) and not is_uri else ''
						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
						# print(_result)

@@ -681,23 +821,128 @@ def processDocuments(documents:list[str],
						elif style in docConfig.ul1:
							checkSameStyle(Style.unorderedlist, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									# Numbered item: capture indentation length
									lastListIndent['ul1'] = len(match.group(1))
									lines.append(text)
								elif re.match(r'^\[[\w\.]+\]', text):
									# Reference
									lines.append('')
									lines.append(text)
								elif text.startswith(' '):
									# Continuation: use captured indentation
									lines.append('')
									indent = lastListIndent.get('ul1', 4)
									lines.append(' ' * indent + text.lstrip())
								elif text.lstrip().startswith(('-', '*')):
									# Explicit bullet in text
									lastListIndent['ul1'] = 4
									lines.append(f'- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									# Style implies bullet
									lastListIndent['ul1'] = 4
								lines.append(f'- {text}')
								else:
									# Fallback: treat as continuation paragraph (e.g. List Paragraph without number/bullet)
									lines.append('')
									indent = lastListIndent.get('ul1', 4)
									lines.append(' ' * indent + text.lstrip())

						elif style in docConfig.ul2:
							checkSameStyle(Style.unorderedlist2, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul2'] = len(match.group(1))
									lines.append(f'{"    "*1}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*1}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul2', 4)
									lines.append(f'{"    "*1}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul2'] = 4
									lines.append(f'{"    "*1}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul2'] = 4
								lines.append(f'{"    "*1}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul2', 4)
									lines.append(f'{"    "*1}{" " * indent}{text.lstrip()}')

						elif style in docConfig.ul3:
							checkSameStyle(Style.unorderedlist3, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul3'] = len(match.group(1))
									lines.append(f'{"    "*2}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*2}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul3', 4)
									lines.append(f'{"    "*2}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul3'] = 4
									lines.append(f'{"    "*2}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul3'] = 4
								lines.append(f'{"    "*2}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul3', 4)
									lines.append(f'{"    "*2}{" " * indent}{text.lstrip()}')

						elif style in docConfig.ul4:
							checkSameStyle(Style.unorderedlist4, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul4'] = len(match.group(1))
									lines.append(f'{"    "*3}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*3}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul4', 4)
									lines.append(f'{"    "*3}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul4'] = 4
									lines.append(f'{"    "*3}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul4'] = 4
								lines.append(f'{"    "*3}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul4', 4)
									lines.append(f'{"    "*3}{" " * indent}{text.lstrip()}')

						elif style in docConfig.ul5:
							checkSameStyle(Style.unorderedlist5, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								if (match := re.match(r'^(\s*\d+\.\s+)', text)):
									lastListIndent['ul5'] = len(match.group(1))
									lines.append(f'{"    "*4}{text}')
								elif re.match(r'^\[[\w\.]+\]', text):
									lines.append('')
									lines.append(f'{"    "*4}{text}')
								elif text.startswith(' '):
									lines.append('')
									indent = lastListIndent.get('ul5', 4)
									lines.append(f'{"    "*4}{" " * indent}{text.lstrip()}')
								elif text.lstrip().startswith(('-', '*')):
									lastListIndent['ul5'] = 4
									lines.append(f'{"    "*4}- {text.lstrip("- *").strip()}')
								elif 'bullet' in style or style.startswith('b'):
									lastListIndent['ul5'] = 4
								lines.append(f'{"    "*4}- {text}')
								else:
									lines.append('')
									indent = lastListIndent.get('ul5', 4)
									lines.append(f'{"    "*4}{" " * indent}{text.lstrip()}')

						#	Table Caption
						elif style in docConfig.tablecaption:
@@ -739,6 +984,11 @@ def processDocuments(documents:list[str],
						#	Example
						elif style in docConfig.example:
							checkSameStyle(Style.example, lambda:lines.append(''))

							# Special handling for references to ensure paragraph separation
							if re.match(r'^\s*\[[\w\.]+\]', text):
								lines.append('')

							# Replace linebreaks
							for _t in text.split(_linebreak):
								lines.append(f'`{_t if _t else " "}`  ') # at least an empty space. And 2 spaces at the end for newline
@@ -883,12 +1133,71 @@ def processDocuments(documents:list[str],
			progress.update(processTask, advance = 1)	# progress update
			for i in range(len(lines)):
				line = lines[i]
				
				# Remove empty formatting markers
				line = line.replace('__', '')
				line = line.replace('****', '')
				line = line.replace('**  ', '** ')
				line = line.replace('_  ', '_ ')
				line = line.replace('** **', ' ')
				#line = line.replace('  ', ' ')
				
				# Merge consecutive bold markers: **text1****text2** -> **text1 text2**
				# This handles cases where formatting was split across runs
				# Pattern: **text1****text2** (four asterisks between text)
				line = re.sub(r'\*\*([^*]+)\*\*\*\*([^*]+)\*\*', r'**\1 \2**', line)
				# Pattern: **text1**** (trailing four asterisks)
				line = re.sub(r'\*\*([^*]+)\*\*\*\*(\s|$)', r'**\1**\2', line)
				# Pattern: ****text1** (leading four asterisks)
				line = re.sub(r'(\s|^)\*\*\*\*([^*]+)\*\*', r'\1**\2**', line)
				# Pattern: **text** (trailing double asterisks not part of a pair)
				# Remove trailing ** that's not part of a valid bold pair
				line = re.sub(r'\*\*([^*\s]+)\*\*(\*\*)(\s|$)', r'**\1**\3', line)
				
				# Remove any remaining sequences of 4+ asterisks (shouldn't happen after merge, but safety)
				line = re.sub(r'\*{4,}', '**', line)
				
				lines[i] = line

			#
			#	Post-process to convert bold URIs and API references to inline code
			#	This catches any cases that weren't handled during initial parsing
			#
			progress.update(processTask, advance = 1)	# progress update
			in_code_block = False
			for i in range(len(lines)):
				line = lines[i]
				stripped = line.strip()

				# Track code block state
				if stripped.startswith('```'):
					in_code_block = not in_code_block
					continue
				
				# Skip processing if inside a code block
				if in_code_block:
					continue
				
				# Pattern 1: Bold URIs - **{apiRoot}/path/to/resource** -> `{apiRoot}/path/to/resource`
				# Matches URIs with slashes and optional curly braces
				line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+\})\*\*', r'`\1`', line)
				line = re.sub(r'\*\*(\s*\{?[a-zA-Z0-9_]+\}?/[^*]+)\*\*', r'`\1`', line)
				
				# Pattern 2: API references like VnfInstances.Post.201
				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options)\.\d+)\*\*', r'`\1`', line)
				
				# Pattern 3: API component names like VnfInstancesPostRequest
				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+(Post|Get|Put|Delete|Patch|Head|Options)(Request|Response))\*\*', r'`\1`', line)
				
				# Pattern 4: API references like VnfInstances.Post (without status code)
				line = re.sub(r'\*\*([A-Z][a-zA-Z0-9]+\.(Post|Get|Put|Delete|Patch|Head|Options))\*\*', r'`\1`', line)
				
				# Pattern 5: Paths ending with ** (e.g., /path/to/resource**:)
				# This handles cases where the colon is outside the bold markers
				line = re.sub(r'(\S+)\*\*:', r'\1:', line)
				
				# Pattern 6: References with ** in the middle (e.g., VnfInstances**.Post.201)
				line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*\.', r'\1.', line)
				line = re.sub(r'([A-Z][a-zA-Z0-9]+)\*\*([A-Z])', r'\1\2', line)
				
				lines[i] = line