maintainers/scripts/doc/escape-code-markup.py at master · pyrox.dev/nixpkgs

pyrox.dev / nixpkgs
lol
nixpkgs / maintainers / scripts / doc / escape-code-markup.py
at master 3.0 kB view raw
 1#! /usr/bin/env nix-shell
 2#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
 3
 4"""
 5Pandoc will strip any markup within code elements so
 6let’s escape them so that they can be handled manually.
 7"""
 8
 9import lxml.etree as ET
10import re
11import sys
12
def replace_element_by_text(el: ET.Element, text: str) -> None:
    """
    Swap ``el`` out of its tree, leaving ``text`` in its place.

    The element's own tail is kept: it is appended to ``text`` before
    the result is glued onto whatever precedes ``el`` (the tail of the
    previous sibling, or the parent's leading text when ``el`` is the
    first child). An element without a parent is left untouched.

    Author: bernulf
    Source: https://stackoverflow.com/a/10520552/160386
    SPDX-License-Identifier: CC-BY-SA-3.0
    """
    replacement = text + (el.tail or "")

    container = el.getparent()
    if container is None:
        # Nothing to splice the text into.
        return

    sibling = el.getprevious()
    if sibling is None:
        # First child: the text belongs to the parent's leading text.
        container.text = (container.text or "") + replacement
    else:
        # Otherwise it continues the tail of the preceding sibling.
        sibling.tail = (sibling.tail or "") + replacement

    container.remove(el)
28
# Namespace URI used to recognise DocBook elements in the XPath query.
DOCBOOK_NS = "http://docbook.org/ns/docbook"

# Elements whose inner markup is stripped by pandoc’s DocBook reader.
# See https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs
code_elements = [
    # Block level (CodeBlock)
    "literallayout", "screen", "programlisting",
    # Inline (Code)
    "classname", "code", "filename", "envar", "literal",
    "computeroutput", "prompt", "parameter", "option", "markup",
    "wordasword", "command", "varname", "function", "type",
    "symbol", "constant", "userinput", "systemitem",
]

# A single xmlns="…" or xmlns:prefix="…" attribute, including the
# whitespace in front of it.
XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')
# The first tag of a string, optionally preceded by whitespace.
ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')
62
def remove_xmlns(match: re.Match) -> str:
    """
    Return the matched text with all xmlns attributes stripped.

    Intended as a replacement callback for a regex substitution; the
    match is expected to span an opening tag.
    """
    opening_tag = match.group(0)
    return XMLNS_REGEX.sub('', opening_tag)
70
if __name__ == '__main__':
    # Validate the command line explicitly: an `assert` would be stripped
    # when Python runs with -O, turning a usage error into an IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: escape-code-markup.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    name_predicate = " or ".join(f"local-name()='{el}'" for el in code_elements)

    # Every element that is a direct child of a DocBook code element.
    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
        # Serialise the element WITHOUT its tail. By default lxml's
        # tostring includes the text that follows the element, and
        # replace_element_by_text appends el.tail itself, so the trailing
        # text would otherwise end up in the output twice.
        text = ET.tostring(markup, encoding=str, with_tail=False)

        # tostring adds xmlns attributes to the element we want to stringify
        # as if it was supposed to be usable standalone.
        # We are just turning it into plain text so we do not care.
        # Let’s strip the namespace declarations to keep the code clean.
        #
        # Note that this removes even namespaces that were potentially
        # in the original file. Though, that should be very rare –
        # most of the time, we will stringify empty DocBook elements
        # like <xref> or <co> or, at worst, <link> with xlink:href attribute.
        #
        # Also note that the regex expects the root element to be first
        # thing in the string. But that should be fine, the tostring method
        # does not produce XML declaration or doctype by default.
        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)

        # Swap the element for its escaped source text in the tree.
        replace_element_by_text(markup, text)

    tree.write(sys.argv[2])