"""Social media crossposting tool — third attempt.

Crossposts between Mastodon, Misskey, and Bluesky.
"""
1from html.parser import HTMLParser
2
3import cross
4
5
6class HTMLPostTokenizer(HTMLParser):
7 def __init__(self) -> None:
8 super().__init__()
9 self.tokens: list[cross.Token] = []
10
11 self.mentions: list[tuple[str, str]]
12 self.tags: list[str]
13
14 self.in_pre = False
15 self.in_code = False
16
17 self.current_tag_stack = []
18 self.list_stack = []
19
20 self.anchor_stack = []
21 self.anchor_data = []
22
23 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
24 attrs_dict = dict(attrs)
25
26 def append_newline():
27 if self.tokens:
28 last_token = self.tokens[-1]
29 if isinstance(
30 last_token, cross.TextToken
31 ) and not last_token.text.endswith("\n"):
32 self.tokens.append(cross.TextToken("\n"))
33
34 match tag:
35 case "br":
36 self.tokens.append(cross.TextToken(" \n"))
37 case "a":
38 href = attrs_dict.get("href", "")
39 self.anchor_stack.append(href)
40 case "strong", "b":
41 self.tokens.append(cross.TextToken("**"))
42 case "em", "i":
43 self.tokens.append(cross.TextToken("*"))
44 case "del", "s":
45 self.tokens.append(cross.TextToken("~~"))
46 case "code":
47 if not self.in_pre:
48 self.tokens.append(cross.TextToken("`"))
49 self.in_code = True
50 case "pre":
51 append_newline()
52 self.tokens.append(cross.TextToken("```\n"))
53 self.in_pre = True
54 case "blockquote":
55 append_newline()
56 self.tokens.append(cross.TextToken("> "))
57 case "ul", "ol":
58 self.list_stack.append(tag)
59 append_newline()
60 case "li":
61 indent = " " * (len(self.list_stack) - 1)
62 if self.list_stack and self.list_stack[-1] == "ul":
63 self.tokens.append(cross.TextToken(f"{indent}- "))
64 elif self.list_stack and self.list_stack[-1] == "ol":
65 self.tokens.append(cross.TextToken(f"{indent}1. "))
66 case _:
67 if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
68 level = int(tag[1])
69 self.tokens.append(cross.TextToken("\n" + "#" * level + " "))
70
71 self.current_tag_stack.append(tag)
72
73 def handle_data(self, data: str) -> None:
74 if self.anchor_stack:
75 self.anchor_data.append(data)
76 else:
77 self.tokens.append(cross.TextToken(data))
78
79 def handle_endtag(self, tag: str) -> None:
80 if not self.current_tag_stack:
81 return
82
83 if tag in self.current_tag_stack:
84 self.current_tag_stack.remove(tag)
85
86 match tag:
87 case "p":
88 self.tokens.append(cross.TextToken("\n\n"))
89 case "a":
90 href = self.anchor_stack.pop()
91 anchor_data = "".join(self.anchor_data)
92 self.anchor_data = []
93
94 if anchor_data.startswith("#"):
95 as_tag = anchor_data[1:].lower()
96 if any(as_tag == block for block in self.tags):
97 self.tokens.append(cross.TagToken(anchor_data[1:]))
98 elif anchor_data.startswith("@"):
99 match = next(
100 (pair for pair in self.mentions if anchor_data in pair), None
101 )
102
103 if match:
104 self.tokens.append(cross.MentionToken(match[1], ""))
105 else:
106 self.tokens.append(cross.LinkToken(href, anchor_data))
107 case "strong", "b":
108 self.tokens.append(cross.TextToken("**"))
109 case "em", "i":
110 self.tokens.append(cross.TextToken("*"))
111 case "del", "s":
112 self.tokens.append(cross.TextToken("~~"))
113 case "code":
114 if not self.in_pre and self.in_code:
115 self.tokens.append(cross.TextToken("`"))
116 self.in_code = False
117 case "pre":
118 self.tokens.append(cross.TextToken("\n```\n"))
119 self.in_pre = False
120 case "blockquote":
121 self.tokens.append(cross.TextToken("\n"))
122 case "ul", "ol":
123 if self.list_stack:
124 self.list_stack.pop()
125 self.tokens.append(cross.TextToken("\n"))
126 case "li":
127 self.tokens.append(cross.TextToken("\n"))
128 case _:
129 if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
130 self.tokens.append(cross.TextToken("\n"))
131
132 def get_tokens(self) -> list[cross.Token]:
133 if not self.tokens:
134 return []
135
136 combined: list[cross.Token] = []
137 buffer: list[str] = []
138
139 def flush_buffer():
140 if buffer:
141 merged = "".join(buffer)
142 combined.append(cross.TextToken(text=merged))
143 buffer.clear()
144
145 for token in self.tokens:
146 if isinstance(token, cross.TextToken):
147 buffer.append(token.text)
148 else:
149 flush_buffer()
150 combined.append(token)
151
152 flush_buffer()
153
154 if combined and isinstance(combined[-1], cross.TextToken):
155 if combined[-1].text.endswith("\n\n"):
156 combined[-1] = cross.TextToken(combined[-1].text[:-2])
157 return combined
158
159 def reset(self):
160 """Reset the parser state for reuse."""
161 super().reset()
162 self.tokens = []
163
164 self.mentions = []
165 self.tags = []
166
167 self.in_pre = False
168 self.in_code = False
169
170 self.current_tag_stack = []
171 self.anchor_stack = []
172 self.list_stack = []