social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1from html.parser import HTMLParser
2from typing import override
3
4from cross.tokens import LinkToken, TextToken, Token
5from util.splitter import canonical_label
6
7
8class HTMLToTokensParser(HTMLParser):
9 def __init__(self) -> None:
10 super().__init__()
11 self.tokens: list[Token] = []
12
13 self._tag_stack: dict[str, tuple[str, dict[str, str | None]]] = {}
14 self.in_pre: bool = False
15 self.in_code: bool = False
16 self.invisible: bool = False
17
18 def handle_a_endtag(self):
19 label, _attr = self._tag_stack.pop("a")
20
21 href = _attr.get("href")
22 if href:
23 if canonical_label(label, href):
24 self.tokens.append(LinkToken(href=href))
25 else:
26 self.tokens.append(LinkToken(href=href, label=label))
27
28 def append_text(self, text: str):
29 self.tokens.append(TextToken(text=text))
30
31 def append_newline(self):
32 if self.tokens:
33 last_token = self.tokens[-1]
34 if isinstance(last_token, TextToken) and not last_token.text.endswith("\n"):
35 self.tokens.append(TextToken(text="\n"))
36
37 @override
38 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
39 _attr = dict(attrs)
40
41 if self.invisible:
42 return
43
44 match tag:
45 case "p":
46 cls = _attr.get("class", "")
47 if cls and "quote-inline" in cls:
48 self.invisible = True
49 case "a":
50 self._tag_stack["a"] = ("", _attr)
51 case "code":
52 if not self.in_pre:
53 self.append_text("`")
54 self.in_code = True
55 case "pre":
56 self.append_newline()
57 self.append_text("```\n")
58 self.in_pre = True
59 case "blockquote":
60 self.append_newline()
61 self.append_text("> ")
62 case "strong" | "b":
63 self.append_text("**")
64 case "em" | "i":
65 self.append_text("*")
66 case "del" | "s":
67 self.append_text("~~")
68 case "br":
69 self.append_text("\n")
70 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6":
71 level = int(tag[1])
72 self.append_text("\n" + "#" * level + " ")
73 case _:
74 # self.builder.extend(f"<{tag}>".encode("utf-8"))
75 pass
76
77 @override
78 def handle_endtag(self, tag: str) -> None:
79 if self.invisible:
80 if tag == "p":
81 self.invisible = False
82 return
83
84 match tag:
85 case "a":
86 if "a" in self._tag_stack:
87 self.handle_a_endtag()
88 case "code":
89 if not self.in_pre and self.in_code:
90 self.append_text("`")
91 self.in_code = False
92 case "pre":
93 self.append_newline()
94 self.append_text("```\n")
95 self.in_pre = False
96 case "blockquote":
97 self.append_text("\n")
98 case "strong" | "b":
99 self.append_text("**")
100 case "em" | "i":
101 self.append_text("*")
102 case "del" | "s":
103 self.append_text("~~")
104 case "p":
105 self.append_text("\n\n")
106 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6":
107 self.append_text("\n")
108 case _:
109 # self.builder.extend(f"</{tag}>".encode("utf-8"))
110 pass
111
112 @override
113 def handle_data(self, data: str) -> None:
114 if self.invisible:
115 return
116
117 if self._tag_stack.get('a'):
118 label, _attr = self._tag_stack.pop("a")
119 self._tag_stack["a"] = (label + data, _attr)
120 return
121
122 def get_result(self) -> list[Token]:
123 if not self.tokens:
124 return []
125
126 combined: list[Token] = []
127 buffer: list[str] = []
128
129 def flush_buffer():
130 if buffer:
131 merged = "".join(buffer)
132 combined.append(TextToken(text=merged))
133 buffer.clear()
134
135 for token in self.tokens:
136 if isinstance(token, TextToken):
137 buffer.append(token.text)
138 else:
139 flush_buffer()
140 combined.append(token)
141
142 flush_buffer()
143
144 if combined and isinstance(combined[-1], TextToken):
145 if combined[-1].text.endswith("\n\n"):
146 combined[-1] = TextToken(text=combined[-1].text[:-2])
147
148 if combined[-1].text.endswith("\n"):
149 combined[-1] = TextToken(text=combined[-1].text[:-1])
150 return combined