Commit 9f4232a5 authored by Serafino's avatar Serafino
Browse files

Initial commit

parents
Loading
Loading
Loading
Loading

API.docx

0 → 100644
+7.52 MiB

File added.

No diff preview for this file type.

ETSI_GS_skeleton.docx

0 → 100644
+105 KiB

File added.

No diff preview for this file type.

filter_1.lua

0 → 100644
+182 −0
Original line number Diff line number Diff line
--THIS LUA FILTER MUST BE APPLIED TO PANDOC IN THE CORRECT ORDER

--global tables shared by the filter functions of this file; Pandoc() serialises both into the mediabag
--references: reference key (e.g. "1" or "i.1") -> "#"..key anchor, filled by Reference()
references = {}
--annex_headers: text of the third inline of a level-8 header -> "#"..header identifier, filled by Header()
annex_headers = {}

--helper function to recognize and parse word runs with proper style
-- A Str containing the marker "@#!" toggles the "inside a run" state.
-- While inside a run, the first Str met becomes the style tag (inlines seen
-- before it are skipped) and every later inline is collected into a Span whose
-- class is that tag. Inlines outside of runs are copied unchanged.
-- @param el element whose `content` list of inlines is scanned
-- @return pandoc.List with the rebuilt inline content
function Run(el)
    local MARK = "@#!"
    local insideRun = false
    local tag, span
    local newContent = pandoc.List({})

    for _, elem in ipairs(el.content) do

        if elem.t == "Str" and elem.text:find(MARK) then -- elem is a string and contains MARK
            -- the match always succeeds here because find() just located MARK
            local before, after = elem.text:match("(.-)"..MARK.."(.*)")
            insideRun = not insideRun
            if insideRun then -- we found a new run section
                if before ~= "" then -- keep what is written before MARK
                    newContent:insert(pandoc.Str(before))
                end
                tag = "" -- reset tag
                span = pandoc.Span("") -- create new span
            else -- we just reached the end of a run, so we must use our span
                if before ~= "" then -- what precedes the closing MARK belongs to the span
                    span.content:insert(pandoc.Str(before))
                end
                -- remove space after tag if the span is not only composed of a single space
                if #span.content > 1 then
                    span.content:remove(1)
                end
                span.attr = { class = tag } -- set style
                -- an empty list is truthy in Lua, so test the length to really skip empty spans
                if #span.content > 0 then
                    newContent:insert(span) -- insert our styled text in the parent element content
                end
                -- kept local: these used to leak into the global environment
                local secondStart = after:find(MARK)
                if secondStart then -- another run starts right after this one
                    local between = after:sub(1, secondStart - 1)
                    if between ~= "" then -- text between the two runs is copied as-is
                        newContent:insert(pandoc.Str(between))
                    end
                    -- reset everything and mark that we are inside a run again
                    insideRun = true
                    tag = ""
                    span = pandoc.Span("")
                elseif after ~= "" then -- keep what is written after MARK
                    newContent:insert(pandoc.Str(after))
                end
            end
        elseif insideRun then -- we incorporate everything in the span as it is
            if tag == "" then -- tag is the 1st string after MARK
                if elem.t == "Str" then
                    tag = elem.text
                end
            else -- everything after the tag is content of the span
                span.content:insert(elem)
            end
        else -- not in a special run: copy the element unchanged
            newContent:insert(elem)
        end
    end
    return newContent
end

--helper function to apply paragraph styles
-- Looks for a trailing style marker of the form [{[--tag--]}] (pattern used in
-- preprocessing.py to mark paragraphs) in the stringified element. When the
-- marker closes the text, the last two inlines (the marker and the inline
-- before it) are removed and the remaining content is wrapped in a Div whose
-- class is the tag. For "TF" (figure caption) and "TH" (table title) the Div
-- also gets an id built from the first and third inline, e.g. Figure_x.x.x.
-- @param el Para, Plain or Header element
-- @return a Div carrying the style class, or `el` itself when no marker ends the text
function Style(el)
    local text = pandoc.utils.stringify(el)
    local _, endIndex, tag = text:find("%[{%[%-%-(.-)%-%-%]}%]")

    -- paragraph styles are always tagged at the end of the paragraph
    if endIndex ~= #text then
        return el
    end

    table.remove(el.content) -- remove the tag string [{[--tag--]}] from para or header
    table.remove(el.content) -- remove the space before tag from para or header

    local attr = { class = tag }
    if tag == "TF" or tag == "TH" then
        local label, number = el.content[1], el.content[3]
        -- only Str inlines have a `text` field; any other caption shape falls
        -- back to a Div without id instead of raising on a nil value
        if label and number and label.t == "Str" and number.t == "Str" then
            -- drop the last character of the number (the trailing separator)
            attr.id = label.text.."_"..number.text:sub(1, -2)
        end
    end
    return pandoc.Div(el.content, attr)
end

--helper function to generate references
-- When the stringified element starts with a bracketed key such as "[12]" or
-- "[i.3]", the key is recorded in the global `references` table as "#"..key
-- and the element content is wrapped in a Div whose id is the key.
-- @param el block element, typically the result of Style()
-- @return a Div with id = key (class = first class of `el`, if any), or `el` unchanged
function Reference(el)
    local text = pandoc.utils.stringify(el)
    for prefix, number in text:gmatch("%[(i?%.?)(%d+)%]") do
        local key = prefix..number -- just the number, or prefix + number
        if text:sub(2, #key + 1) == key then -- the text itself opens with "[key"
            references[key] = "#"..key -- we save the reference in the global table
            -- `el` has no attr when Style() returned it unchanged (e.g. a bare
            -- Para); guard so such a paragraph does not raise
            local classes = el.attr and el.attr["classes"]
            return pandoc.Div(el.content, { id = key, class = classes and classes[1] })
        end
    end
    return el
end

--Pandoc filters
if FORMAT:match 'html' then
    function Str(el)
        -- turn every {{{{TAB}}}} placeholder back into a real tab character
        local restored = string.gsub(el.text, "{{{{TAB}}}}", "\t")
        return pandoc.Str(restored)
    end

    function Para(el)
        -- first rebuild the inlines so that runs marked by @#! become spans
        el.content = Run(el)
        -- then apply the paragraph style and, on its result, the reference handling
        return Reference(Style(el))
    end

    function Plain(el)
        -- rebuild the inlines so that runs marked by @#! become spans
        local inlines = Run(el)
        el.content = inlines
        -- the paragraph style lookup decides what is returned
        local styled = Style(el)
        return styled
    end

    function Emph(el)
        -- replace the content with the version where @#! runs are spans
        local inlines = Run(el)
        el.content = inlines
        return el
    end

    function Strong(el)
        -- replace the content with the version where @#! runs are spans
        local inlines = Run(el)
        el.content = inlines
        return el
    end

    function Superscript(el)
        -- replace the content with the version where @#! runs are spans
        local inlines = Run(el)
        el.content = inlines
        return el
    end

    function Header(el)
        -- rebuild the inlines so that runs marked by @#! become spans
        el.content = Run(el)
        -- fix annex headers: level-8 headers are recorded in annex_headers,
        -- keyed by the text of their third inline (annex letter / link correspondence)
        if el.level == 8 then
            local letter = el.content[3]
            -- a header with fewer than three inlines, or whose third inline is
            -- not a Str, has no letter to record; skip it instead of raising
            if letter and letter.t == "Str" then
                annex_headers[letter.text] = "#"..el.attr.identifier
            end
        end
        --search for style
        return Style(el)
    end


    function Image(el)
        -- split the source at its last dot: everything before it, and the extension
        local stem, ext = el.src:match("(.*)%.(.*)$")

        if ext == "emf" then
            -- emf sources are pointed at a png with the same path instead
            el.src = stem..".png"
            return el
        elseif ext == "png" then
            -- png images are wrapped together with a half-transparent yellow overlay span
            local overlayAttr = { style = "position: absolute; top: 0; right: 0; bottom: 0; left: 0; background-color: rgba(255, 255, 0, 0.5); pointer-events: none; z-index: 1;", class = "image_overlay"}
            local wrapperAttr = { style = "position: relative; display: inline-flex;" }
            local overlay = pandoc.Span({}, overlayAttr)
            el.attr = { style = "width: 100%; height: auto; position: relative;"}
            return pandoc.Span({el, overlay}, wrapperAttr)
        end
        -- any other extension (or none): nothing is returned
    end

    function Pandoc(el)
        local mime = 'text/json'
        -- JSON-encode `value` and store it in the mediabag under `path`
        local function stash(path, value)
            pandoc.mediabag.insert(path, mime, pandoc.json.encode(value))
        end

        -- references and annex headers collected by this filter, saved for the second filter
        stash('media/references.json', references)
        stash('media/annex.json', annex_headers)
        -- table of contents of the document, limited to depth 4
        stash('media/toc.json', pandoc.structure.table_of_contents(el, { toc_depth = 4}))
    end
end
    

filter_2.lua

0 → 100644
+262 −0
Original line number Diff line number Diff line
--THIS LUA FILTER MUST BE APPLIED TO PANDOC IN THE CORRECT ORDER
--when true, the linking helpers below print diagnostics
--NOTE(review): this local shadows Lua's standard `debug` library for the rest of the file
local debug = false

--Load the three JSON entries from the mediabag. filter_1.lua inserts entries under these same paths;
--NOTE(review): assumes both filters run in the same pandoc invocation so the mediabag is shared - confirm
local mt, references = pandoc.mediabag.fetch("media/references.json") --references generated in the first filter
references = pandoc.json.decode(references, false) --false means "use lua tables, instead of pandoc objects"

local mt, annex_headers = pandoc.mediabag.fetch("media/annex.json") --annex_headers generated in the first filter
annex_headers = pandoc.json.decode(annex_headers, false) --false means "use lua tables, instead of pandoc objects"

--pandoc generated toc
local mt, toc = pandoc.mediabag.fetch("media/toc.json") --table of contents generated in the first filter
toc = pandoc.json.decode(toc, true) --true means "use pandoc objects"
--process it to have a nicer, easier to access, table: clause number -> link target
local tocLinks = {}
pandoc.walk_block(toc, {
    Link = function(el)
        --only links whose first inline is a Str are considered
        if el.content[1].t == "Str" then
            --same number pattern as the one used by Substitute and MultipleClauses
            local startIndex, endIndex, number =  el.content[1].text:find("(%w*%.?%d*%.?%d*%.?%d+%-?%d*)") 
            if number then
                tocLinks[number] = el.target --save links to the corresponding clause number
            end
        end
        --nothing is returned, so the walked toc itself is not used afterwards
    end
})


--generates link for clauses
-- @param text label of the link; ASCII hyphens are replaced by non-breaking hyphens
-- @param number clause number used as key into tocLinks
-- @return a Link to the clause, or a plain Str with the same label when the
--         clause has no entry in tocLinks (pandoc.Link needs a target string)
function ClauseLink(text, number)
    text = text:gsub("%-", "‑") --always use non-breaking hyphens for links
    local target = tocLinks[number]
    if target == nil then
        return pandoc.Str(text)
    end
    return pandoc.Link(text, target)
end

--builds the link pointing at a figure anchor ("#Figure_" followed by the number)
function FigureLink(text, number)
    --links always use non-breaking hyphens in their label
    local label = string.gsub(text, "%-", "‑")
    local target = "#Figure_"..number
    return pandoc.Link(label, target)
end

--builds the link pointing at a table anchor ("#Table_" followed by the number)
function TableLink(text, number)
    --links always use non-breaking hyphens in their label
    local label = string.gsub(text, "%-", "‑")
    local anchor = "#Table_"..number
    return pandoc.Link(label, anchor)
end

--generates link for annexes
-- @param text label of the link
-- @param number annex letter used as key into annex_headers
-- @return a Link to the annex header, or a plain Str with the same label when
--         the letter has no entry in annex_headers (pandoc.Link needs a target string)
function AnnexLink(text, number)
    local target = annex_headers[number]
    if target == nil then
        return pandoc.Str(text)
    end
    return pandoc.Link(text, target)
end


--Translates a byte offset measured in the hyphen-normalised copy of `original`
--(every non-breaking hyphen replaced by "-") back to the offset in `original`.
local function nbhAwareOffset(original, offset)
    local NBH = "‑" -- non-breaking hyphen, several bytes long in UTF-8
    local pos = 1
    while true do
        local s, e = original:find(NBH, pos, true)
        if not s or s > offset then
            return offset
        end
        offset = offset + (#NBH - 1) -- this hyphen is wider in the original
        pos = e + 1
    end
end

--Builds the link for `word` ("clause", "figure", "table" or "annex"); returns
--nil when no link can be built (unknown word, or no known target).
local function makeReferenceLink(word, label, number)
    if word == "clause" then
        if tocLinks[number] ~= nil then
            return ClauseLink(label, number)
        end
    elseif word == "figure" then
        return FigureLink(label, number)
    elseif word == "table" then
        return TableLink(label, number)
    elseif word == "annex" then
        if annex_headers[number] ~= nil then
            return AnnexLink(label, number)
        end
    end
    if debug then print("Unkown behavior for "..word.." or link does not exists") end
    return nil
end

--helper function that uses the generated toc to link clauses and figures to the respective header
-- Scans el.content for a Str containing `word` (case-insensitive). If that Str
-- also contains a number known to tocLinks, the Str up to the number becomes a
-- link. Otherwise the Str two positions later is inspected: when it holds a
-- number (an upper-case letter for "annex") and is not followed by "of"
-- (reference to another document), word + number are replaced by one link.
-- Whenever no link can be built the original inlines are kept untouched.
-- @param el element whose `content` is scanned
-- @param word "clause", "figure", "table" or "annex"
-- @return pandoc.List with the new content
function Substitute(el, word)
    local newContent = pandoc.List({})
    local pattern = "(%w*%.?%d*%.?%d*%.?%d+%-?%d*)"
    if word == "annex" then
        pattern = "(%u)"
    end

    local i = 1
    while el.content[i] do
        local elem = el.content[i]
        local consumed = nil -- number of source elements replaced by a link, nil when none

        if elem.t == "Str" and elem.text:lower():find(word) then
            -- numbers are matched on a copy using ASCII hyphens
            local normalised = elem.text:gsub("‑", "-")
            local _, endIndex, number = normalised:find(pattern) -- is the number in the same Str elem?
            if number and tocLinks[number] then -- create link
                -- map the match end back onto the original bytes before slicing
                local cut = nbhAwareOffset(elem.text, endIndex)
                newContent:insert(pandoc.Link(elem.text:sub(1, cut), tocLinks[number]))
                if cut < #elem.text then -- if something remains (like a comma or a bracket), append it too
                    newContent:insert(pandoc.Str(elem.text:sub(cut + 1, -1)))
                end
                consumed = 1
            else
                local succ = el.content[i+2] -- next string, if it exists, should be the number
                if succ and succ.t == "Str" then
                    local succText = succ.text:gsub("‑", "-")
                    local _, succEnd, succNumber = succText:find(pattern)
                    if succNumber then
                        local succ_succ = el.content[i+4] -- "of" here means the number refers to another document
                        if succ_succ and succ_succ.t == "Str" and succ_succ.text == "of" then
                            if debug then print(word.." referring to another document", succ.text, succ_succ.text, el.content[i+6]) end
                        else
                            local link = makeReferenceLink(word, elem.text.." "..succNumber, succNumber)
                            if debug then print(succ.text, succNumber) end
                            -- without a link nothing is consumed, so the word and
                            -- the number stay in the output instead of vanishing
                            if link then
                                newContent:insert(link)
                                local cut = nbhAwareOffset(succ.text, succEnd)
                                if cut < #succ.text then -- if something remains (like a comma or a bracket), append it too
                                    newContent:insert(pandoc.Str(succ.text:sub(cut + 1, -1)))
                                end
                                consumed = 3 -- the word, the space and the number are now inside the link
                            end
                        end
                    elseif debug then
                        print("Error: type is not Str or number does not follow")
                        print(elem.text, succ.t, succ.text, succNumber)
                    end
                end
            end
        end

        if consumed then
            i = i + consumed
        else
            --append the other elements normally
            newContent:insert(elem)
            i = i + 1
        end
    end
    return newContent
end

--helper function that links the numbers of an enumeration such as "clauses 5.1, 5.2 and 5.3"
-- After a Str containing "clauses", each following Str holding a clause number
-- known to tocLinks is replaced by a link (text around the number inside the
-- same Str is kept as plain Str). A lone comma or a Str containing "and" is
-- treated as a separator; any other Str without a number ends the enumeration.
-- @param el element whose `content` is scanned
-- @return pandoc.List with the new content
function MultipleClauses(el)
    local newContent = pandoc.List({})
    local pattern = "(%w*%.?%d*%.?%d*%.?%d+%-?%d*)"
    local inClauseList = false -- true from the word "clauses" for as long as clause numbers follow

    for _, elem in ipairs(el.content) do
        local replaced = false

        if elem.t == "Str" then
            if elem.text:lower():find("clauses") then
                --this may be the start of a list of multiple clauses
                inClauseList = true
            elseif inClauseList then
                --check if we found a clause number
                local startIndex, endIndex, number = elem.text:find(pattern)

                if elem.text:match("^,$") or elem.text:match("and") then
                    --this is just a comma or the word 'and' separating the clauses
                    --do nothing
                elseif number then
                    -- only numbers present in the toc can be linked; unknown ones
                    -- are copied unchanged instead of building a link without target
                    if tocLinks[number] ~= nil then
                        if startIndex > 1 then
                            --keep what precedes the number (like an opening bracket)
                            newContent:insert(pandoc.Str(elem.text:sub(1, startIndex - 1)))
                        end
                        newContent:insert(ClauseLink(elem.text:sub(startIndex, endIndex), number))
                        if endIndex < #elem.text then
                            --add the rest of the string too
                            newContent:insert(pandoc.Str(elem.text:sub(endIndex + 1, -1)))
                        end
                        replaced = true
                    end
                else
                    --we found something that is neither a comma, the word 'and' or a clause number,
                    --so we can say this is not a list of multiple clauses
                    inClauseList = false
                end
            end
        end

        if not replaced then
            newContent:insert(elem)
        end
    end
    return newContent
end

--helper functions to fix Plain encapsulated div contents, this is an artefact due to how pandoc creates custom divs in filter_1.lua
--isAllPlain: false when `el` has no content field; otherwise true exactly when no child has a type other than Plain
function isAllPlain(el)
    local children = el.content
    if not children then
        return false
    end
    local onlyPlain = true
    for _, child in ipairs(children) do
        if child.t ~= "Plain" then
            onlyPlain = false
            break
        end
    end
    return onlyPlain
end

--returns el.content untouched unless every child is a Plain; in that case the
--children of all those Plain elements are returned as one flat list
function Normalize(el)
    if not isAllPlain(el) then
        return el.content
    end

    local flattened = pandoc.List()
    for _, plain in ipairs(el.content) do
        for _, inline in ipairs(plain.content) do
            flattened:insert(inline)
        end
    end
    return flattened
end

--runs every linking pass on el.content: clause enumerations first, then the
--single references to clauses, figures, tables and annexes, in that order
function Linking(el)
    el.content = MultipleClauses(el)
    for _, word in ipairs({ "clause", "figure", "table", "annex" }) do
        el.content = Substitute(el, word)
    end
    return el
end

--Pandoc filters
if FORMAT:match 'html' then
    function Div(el)
        -- flatten Plain-only content before any linking pass
        el.content = Normalize(el)

        -- clause enumerations first, then single clause references
        el.content = MultipleClauses(el)
        el.content = Substitute(el, "clause")

        -- figure links are skipped in divs whose first class is TF, table links in TH ones
        local firstClass = el.classes[1]
        if firstClass ~= "TF" then
            el.content = Substitute(el, "figure")
        end
        if firstClass ~= "TH" then
            el.content = Substitute(el, "table")
        end

        el.content = Substitute(el, "annex")
        return el
    end

    function Para(el)
        -- normalize the content, then hand the paragraph to the linking passes
        el.content = Normalize(el)
        return Linking(el)
    end

    
    function Plain(el)
        -- the element is returned after all linking passes ran on it
        return Linking(el)
    end

    function Strong(el)
        -- the element is returned after all linking passes ran on it
        return Linking(el)
    end

    function Emph(el)
        -- the element is returned after all linking passes ran on it
        return Linking(el)
    end

    function Underline(el)
        -- the element is returned after all linking passes ran on it
        return Linking(el)
    end

    function Span(el)
        -- the element is returned after all linking passes ran on it
        return Linking(el)
    end

    function Str(el)
        -- look for the first bracketed reference such as [1] or [i.1] inside the string
        local first, last, reference, key = el.text:find("(%[(i?%.?%d+)%])")
        if not (reference and references[key]) then
            return -- no known reference here: nothing is returned
        end
        -- start of the string + Link + last bit of string, wrapped in one Span
        local head = pandoc.Str(el.text:sub(1, first - 1))
        local link = pandoc.Link(reference, references[key])
        local tail = pandoc.Str(el.text:sub(last + 1))
        return pandoc.Span({head, link, tail})
    end
end
 No newline at end of file

html_to_docx.py

0 → 100644
+0 −0

File added.

Preview size limit exceeded, changes collapsed.