Commit 96ab91ee authored by Serafino's avatar Serafino
Browse files

updated to work with pandoc chunked HTML option (Spezzatino)

parent 80913c53
Loading
Loading
Loading
Loading
+21 −3
Original line number Diff line number Diff line
--THIS LUA FILTER MUST BE APPLIED TO PANDOC IN THE CORRECT ORDER


--global table for reference links
references = {}
annex_headers = {}
@@ -145,6 +146,13 @@ if FORMAT:match 'html' then
        -- fix annex headers
        if el.level == 8 then
            annex_headers[el.content[3].text] =  "#"..el.attr.identifier -- annex letter / link correspondance
            el.level = 1
            return el
        end
        -- fix header level of sub-clauses of annexes 
        if el.content[1].t == "Str" and el.content[1].text:find("^%u%.%d") then
            el.level = el.level + 1
            return el
        end
        --search for style
        return Style(el) 
@@ -152,6 +160,13 @@ if FORMAT:match 'html' then


    function Image(el)
        -- retrieve aspect ratio of image
        local width = tonumber(el.attr.attributes.width:sub(1,-3))
        local height = tonumber(el.attr.attributes.height:sub(1,-3))
        local ratio = height/width
        -- set height
        el.attr = { style = "width: 100%; height: calc(100%*"..ratio..");"}
        
        local filePath, extension = el.src:match("(.*)%.(.*)$") -- image.png, jpeg or emf
        --fixes extensions
        if extension == "emf" then 
@@ -160,10 +175,13 @@ if FORMAT:match 'html' then
        end
        --adds yellow overlay on top of the image
        if extension == "png" then
            local overlay = pandoc.Span({}, { style = "position: absolute; top: 0; right: 0; bottom: 0; left: 0; background-color: rgba(255, 255, 0, 0.5); pointer-events: none; z-index: 1;", class = "image_overlay"})
            el.attr = { style = "width: 100%; height: auto; position: relative;"}
            local overlay = pandoc.Span({}, { 
                style = "position: absolute; top: 0; right: 0; bottom: 0; left: 0; background-color: rgba(255, 255, 0, 0.5); pointer-events: none; z-index: 1;",
                class = "image_overlay"
            })
            return pandoc.Span({el, overlay}, { style = "position: relative; display: inline-flex;" })
        end
        return el;
    end

    function Pandoc(el)
+1 −1
Original line number Diff line number Diff line
@@ -238,7 +238,7 @@ if FORMAT:match 'html' then

    --- Render emphasised text as a raw HTML <i> element.
    -- The inline is first passed through Linking (defined elsewhere in this
    -- filter), then flattened to plain text and wrapped in <i>...</i>.
    -- Only one return statement is kept: in Lua a `return` must be the last
    -- statement of its block, so a second return after it does not compile.
    -- @param el the Emph inline handed to the filter by pandoc
    -- @return a pandoc RawInline in 'html' format
    function Emph(el)
        el = Linking(el)
        -- stringify yields plain text; escape the characters that would
        -- otherwise be read as markup once emitted as raw HTML
        local html_escapes = { ['&'] = '&amp;', ['<'] = '&lt;', ['>'] = '&gt;' }
        -- assigning to a single local keeps only gsub's first result (the string)
        local text = pandoc.utils.stringify(el):gsub('[&<>]', html_escapes)
        return pandoc.RawInline('html', '<i>' .. text .. '</i>')
    end

    function Underline(el)
+38 −28
Original line number Diff line number Diff line
import os
import re
import sys
from dataclasses import dataclass
@@ -9,7 +10,7 @@ from docx.api import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Emu, Pt, RGBColor

DEBUG = False #print debug info switch
DEBUG = True #print debug info switch


@dataclass
@@ -199,7 +200,7 @@ def insert_missing_space(para, inline_text):
    return ' '+inline_text



#use new document to try word diff
doc = Document("ETSI_GS_skeleton.docx")

cssrules = {} #collection of parsed css rules as Style objects
@@ -218,6 +219,7 @@ list_level = 0 # used to resolve multiple nested lists
def handle_tag(HTML_tag : bs4.element.Tag):
    global para, run, cell, style, list_level, para_style, prev_inline, curr_inline

    ##THIS HANDLES TEXT LITERALS
    if isinstance(HTML_tag, bs4.element.NavigableString):
        if DEBUG:
            print(HTML_tag)
@@ -239,16 +241,16 @@ def handle_tag(HTML_tag : bs4.element.Tag):
        return

    curr_inline = None     
    #resolve blockquotes
    ##THIS HANDLES BLOCKQUOTES
    if HTML_tag.name == "blockquote":
        for child in HTML_tag.children:
            handle_tag(child)

    #resolve table
    ##THIS HANDLES TABLES
    elif HTML_tag.name == "table":
        handle_table(HTML_tag)
    
    #resolve header
    ##THIS HANDLES HEADERS
    elif header := re.match(r"h(\d)", HTML_tag.name):
        if  HTML_tag.string:
            heading = doc.add_paragraph(style= "Heading"+header.group(1))
@@ -262,7 +264,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):
            if DEBUG:
                print("ERROR: header string is not available", list(HTML_tag.children))

    #resolve lists
    ##THIS HANDLES LISTS
    elif HTML_tag.name in ["ol", "ul"]:
        list_level += 1 #start of the list
        
@@ -276,7 +278,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):
        list_level -= 1 #end of the list
    

    #start a new paragraph
    ##THIS HANDLES DIVS AND PARAGRAPHS (BLOCKS)
    elif HTML_tag.name in blocks: 

        #reset default
@@ -337,7 +339,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):
            handle_tag(child)


    #handle links
    ##THIS HANDLES LINKS
    elif HTML_tag.name == "a":
        old_run = None
        if run: #we wont write here, but we want to save the style info, if present
@@ -367,7 +369,7 @@ def handle_tag(HTML_tag : bs4.element.Tag):



    # add a new run
    ##THIS HANDLES INLINES
    elif HTML_tag.name in inlines:
        curr_inline = HTML_tag

@@ -424,7 +426,8 @@ def handle_tag(HTML_tag : bs4.element.Tag):
def handle_table(table: bs4.element.Tag):
    global cell, para
    
    col_widths = [int(col.attrs["style"][-3:-1]) for col in table.find_all(lambda tag: tag.name == "col")] #list of each column width in percentages
    #each col tag has a style attribute containing something like "width: xx%"; this extracts the xx as an int
    col_widths = [int(re.search(r'(\d+)%',col.attrs["style"]).group(1)) for col in table.find_all(lambda tag: tag.name == "col")] #list of each column width in percentages
    rows = [row for row in table.find_all(lambda tag: tag.name == "tr")]

    #create an empty table in docx document 
@@ -480,17 +483,25 @@ def handle_table(table: bs4.element.Tag):

if __name__ == "__main__":
    
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Usage: html_to_docx.py file.html [style.css]")
    if len(sys.argv) != 2:
        print("Usage: html_to_docx.py <Diretory with html and css files>")
        sys.exit(1)

    # Define the directory path
    dir_path = sys.argv[1]
    directory = os.scandir(dir_path)

    # Get list of files (htmls are sorted based on their number)
    html_files = sorted([entry.path for entry in directory if entry.is_file() and entry.name.endswith('.html') if entry.name != "index.html" ], key = lambda x: int(x.split("/")[-1].split("-")[0]))
    css_files = [entry.path for entry in directory if entry.is_file() and entry.name.endswith('.css')]

    if len(sys.argv) == 3:
    #use first css file in alphabetical order (this should by default be API.css)
    if css_files:
        #keeps long color definition
        cssutils.ser.prefs.minimizeColorHash = False
        #parse css
        parser = cssutils.CSSParser()
        sheet = parser.parseFile(sys.argv[2])
        sheet = parser.parseFile(css_files[0])
        for rule in sheet:
            if DEBUG:
                print(rule.selectorText)
@@ -527,18 +538,17 @@ if __name__ == "__main__":
                    
                cssrules[rule.selectorText[1:]] = _style # remove ".' form selector and uses it as key to add style to our rules dictionary

    #parse html
    with open(sys.argv[1], "r") as file:
    #parse htmls
    for html_file in html_files:
        if DEBUG:
            print(f"Parsing: {html_file}")
        with open(html_file, "r") as file:
            html_content = file.read()
            soup = bs4.BeautifulSoup(html_content, "html.parser")
    
    skip = True

        #construct doc
        for tag in soup.body.children:
        #skip to executive summary, everything before that is already in the skeleton
        if tag.string == "Executive summary":
            skip = False
        if not skip:
            handle_tag(tag)


+75 −2
Original line number Diff line number Diff line
@@ -32,12 +32,84 @@ $if(mathjax)$
$endif$
  $math$
$endif$
<script src="dist/bundle.js" defer></script>
  <!--[if lt IE 9]>
    <script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
  <![endif]-->
</head>
<body>
  <button style="position: fixed; bottom: 10px; right: 10px;" onclick="toggleStyles()">Toggle Background highlighting</button>
  <button style="position: fixed; bottom: 50px; right: 10px;" id="download_btn">Download HTML</button>
  <label class="switch">Editing
    <input type="checkbox" id="editing">
    <span class="slider round"></span>
  </label>
  <style>
    /* Styles for the "Editing" toggle: a label.switch wrapping a hidden
       checkbox and a span.slider that is drawn as the visible track. */

    /* The switch - the box around the slider.
       Fixed to the viewport's bottom-right corner at bottom: 90px, above the
       two fixed buttons placed at bottom: 10px and bottom: 50px. */
    .switch {
      position: fixed;
      bottom: 90px;
      right: 10px;
      display: inline-block;
      width: 120px;
      height: 34px;
    }

    /* Hide default HTML checkbox.
       It stays in the DOM, so the input:checked / input:focus selectors
       below still react to its state. */
    .switch input {
      opacity: 0;
      width: 0;
      height: 0;
    }

    /* The slider track.
       left: 60px makes the track cover only the right 60px of the 120px-wide
       switch; presumably the left part is left free for the label text. */
    .slider {
      position: absolute;
      cursor: pointer;
      top: 0;
      left: 60px;
      right: 0;
      bottom: 0;
      background-color: #ccc;
      -webkit-transition: .4s;
      transition: .4s;
    }

    /* The knob: a 26px square inset 4px from the track's left and bottom edges */
    .slider:before {
      position: absolute;
      content: "";
      height: 26px;
      width: 26px;
      left: 4px;
      bottom: 4px;
      background-color: white;
      -webkit-transition: .4s;
      transition: .4s;
    }

    /* Checked state: the track turns blue */
    input:checked+.slider {
      background-color: #2196F3;
    }

    /* Focus state: faint blue outline around the track */
    input:focus+.slider {
      box-shadow: 0 0 1px #2196F3;
    }

    /* Checked state: the knob moves 26px to the right */
    input:checked+.slider:before {
      -webkit-transform: translateX(26px);
      -ms-transform: translateX(26px);
      transform: translateX(26px);
    }

    /* Rounded sliders: the radius equals the switch height (34px) */
    .slider.round {
      border-radius: 34px;
    }

    /* Rounded sliders: 50% radius turns the square knob into a circle */
    .slider.round:before {
      border-radius: 50%;
    }
  </style>

  <script>
      let styleToggle = false;
@@ -78,17 +150,18 @@ $abstract$
$endif$
</header>
$endif$
$if(toc)$
<nav id="$idprefix$TOC" role="doc-toc">
$if(toc-title)$
<h2 id="$idprefix$toc-title">$toc-title$</h2>
$endif$
<ul><li><h1><a href="0-.html">ETSI title</a></h1></li></ul>
$table-of-contents$
</nav>
$endif$
<div id="editor">
$body$
$for(include-after)$
$include-after$
$endfor$
</div>
</body>
</html>