"""Social media crossposting tool — third attempt.

Crossposts between Mastodon, Misskey, and Bluesky.
"""
1from html.parser import HTMLParser
2
3import cross
4
5
6class HTMLPostTokenizer(HTMLParser):
7 def __init__(self) -> None:
8 super().__init__()
9 self.tokens: list[cross.Token] = []
10
11 self.mentions: list[tuple[str, str]]
12 self.tags: list[str]
13
14 self.in_pre = False
15 self.in_code = False
16
17 self.current_tag_stack = []
18 self.list_stack = []
19
20 self.anchor_stack = []
21 self.anchor_data = []
22
23 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
24 attrs_dict = dict(attrs)
25
26 def append_newline():
27 if self.tokens:
28 last_token = self.tokens[-1]
29 if isinstance(
30 last_token, cross.TextToken
31 ) and not last_token.text.endswith("\n"):
32 self.tokens.append(cross.TextToken("\n"))
33
34 match tag:
35 case "br":
36 self.tokens.append(cross.TextToken(" \n"))
37 case "a":
38 href = attrs_dict.get("href", "")
39 self.anchor_stack.append(href)
40 case "strong", "b":
41 self.tokens.append(cross.TextToken("**"))
42 case "em", "i":
43 self.tokens.append(cross.TextToken("*"))
44 case "del", "s":
45 self.tokens.append(cross.TextToken("~~"))
46 case "code":
47 if not self.in_pre:
48 self.tokens.append(cross.TextToken("`"))
49 self.in_code = True
50 case "pre":
51 append_newline()
52 self.tokens.append(cross.TextToken("```\n"))
53 self.in_pre = True
54 case "blockquote":
55 append_newline()
56 self.tokens.append(cross.TextToken("> "))
57 case "ul", "ol":
58 self.list_stack.append(tag)
59 append_newline()
60 case "li":
61 indent = " " * (len(self.list_stack) - 1)
62 if self.list_stack and self.list_stack[-1] == "ul":
63 self.tokens.append(cross.TextToken(f"{indent}- "))
64 elif self.list_stack and self.list_stack[-1] == "ol":
65 self.tokens.append(cross.TextToken(f"{indent}1. "))
66 case _:
67 if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
68 level = int(tag[1])
69 self.tokens.append(cross.TextToken("\n" + "#" * level + " "))
70
71 self.current_tag_stack.append(tag)
72
73 def handle_data(self, data: str) -> None:
74 if self.anchor_stack:
75 self.anchor_data.append(data)
76 else:
77 self.tokens.append(cross.TextToken(data))
78
79 def handle_endtag(self, tag: str) -> None:
80 if not self.current_tag_stack:
81 return
82
83 if tag in self.current_tag_stack:
84 self.current_tag_stack.remove(tag)
85
86 match tag:
87 case "p":
88 self.tokens.append(cross.TextToken("\n\n"))
89 case "a":
90 href = self.anchor_stack.pop()
91 anchor_data = "".join(self.anchor_data)
92 self.anchor_data = []
93
94 if anchor_data.startswith("#"):
95 as_tag = anchor_data[1:].lower()
96 if any(as_tag == block for block in self.tags):
97 self.tokens.append(cross.TagToken(anchor_data[1:]))
98 elif anchor_data.startswith("@"):
99 match = next(
100 (pair for pair in self.mentions if anchor_data in pair), None
101 )
102
103 if match:
104 self.tokens.append(cross.MentionToken(match[1], ""))
105 else:
106 self.tokens.append(cross.LinkToken(href, anchor_data))
107 case "strong", "b":
108 self.tokens.append(cross.TextToken("**"))
109 case "em", "i":
110 self.tokens.append(cross.TextToken("*"))
111 case "del", "s":
112 self.tokens.append(cross.TextToken("~~"))
113 case "code":
114 if not self.in_pre and self.in_code:
115 self.tokens.append(cross.TextToken("`"))
116 self.in_code = False
117 case "pre":
118 self.tokens.append(cross.TextToken("\n```\n"))
119 self.in_pre = False
120 case "blockquote":
121 self.tokens.append(cross.TextToken("\n"))
122 case "ul", "ol":
123 if self.list_stack:
124 self.list_stack.pop()
125 self.tokens.append(cross.TextToken("\n"))
126 case "li":
127 self.tokens.append(cross.TextToken("\n"))
128 case _:
129 if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
130 self.tokens.append(cross.TextToken("\n"))
131
132 def get_tokens(self) -> list[cross.Token]:
133 if not self.tokens:
134 return []
135
136 combined: list[cross.Token] = []
137 buffer: list[str] = []
138
139 def flush_buffer():
140 if buffer:
141 merged = "".join(buffer)
142 combined.append(cross.TextToken(text=merged))
143 buffer.clear()
144
145 for token in self.tokens:
146 if isinstance(token, cross.TextToken):
147 buffer.append(token.text)
148 else:
149 flush_buffer()
150 combined.append(token)
151
152 flush_buffer()
153
154 if combined and isinstance(combined[-1], cross.TextToken):
155 if combined[-1].text.endswith("\n\n"):
156 combined[-1] = cross.TextToken(combined[-1].text[:-2])
157 return combined
158
159 def reset(self):
160 """Reset the parser state for reuse."""
161 super().reset()
162 self.tokens = []
163
164 self.mentions = []
165 self.tags = []
166
167 self.in_pre = False
168 self.in_code = False
169
170 self.current_tag_stack = []
171 self.anchor_stack = []
172 self.list_stack = []