social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1from html.parser import HTMLParser
2import cross
3
class HTMLPostTokenizer(HTMLParser):
    """Convert a post's HTML into a flat list of ``cross`` tokens.

    Formatting tags are rewritten as Markdown-style text (``**``, ``*``,
    ``~~``, backticks, fenced blocks, ``> ``, list bullets, ``#`` headings)
    carried by ``cross.TextToken``.  Anchors become ``cross.TagToken``,
    ``cross.MentionToken`` or ``cross.LinkToken`` depending on their text.

    ``mentions`` and ``tags`` are empty after ``reset()`` and nothing in this
    class fills them, so callers presumably assign them before ``feed()``.
    Call ``get_tokens()`` after feeding to obtain the merged result.
    """

    # Inline tags whose opening and closing marker is the same text.
    _INLINE_MARKERS = {
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
        'del': '~~',
        's': '~~',
    }
    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})
    _LIST_TAGS = frozenset({'ul', 'ol'})

    def __init__(self) -> None:
        # HTMLParser.__init__() calls self.reset(), which creates every piece
        # of mutable parsing state; the lines below only declare types.
        super().__init__()
        self.tokens: list[cross.Token]
        self.status: dict  # declared only; never assigned in this class

        self.mentions: list[tuple[str, str]]
        self.tags: list[str]

        self.in_pre: bool
        self.in_code: bool

        self.current_tag_stack: list[str]
        self.list_stack: list[str]

        self.anchor_stack: list
        self.anchor_data: list[str]

    def _emit(self, text: str) -> None:
        """Append *text* to the token stream as a ``cross.TextToken``."""
        self.tokens.append(cross.TextToken(text))

    def _ensure_newline(self) -> None:
        """Emit a newline unless the stream is empty or already ends in one.

        Only a trailing ``cross.TextToken`` is inspected; if the last token
        is of another kind nothing is emitted.
        """
        if not self.tokens:
            return
        last = self.tokens[-1]
        if isinstance(last, cross.TextToken) and not last.text.endswith('\n'):
            self._emit('\n')

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Emit the opening marker for *tag* and record it as open.

        Args:
            tag: Tag name as reported by ``HTMLParser``.
            attrs: ``(name, value)`` attribute pairs; only ``href`` on
                ``<a>`` is read.
        """
        if tag == 'br':
            # NOTE(review): SOURCE shows a single space before the newline;
            # the paste may have collapsed whitespace -- confirm.
            self._emit(' \n')

        elif tag == 'a':
            # Anchor text is buffered by handle_data() until the closing tag.
            self.anchor_stack.append(dict(attrs).get('href', ''))

        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])

        elif tag == 'code':
            # Inside <pre> the fence already marks the code block.
            if not self.in_pre:
                self._emit('`')
                self.in_code = True

        elif tag == 'pre':
            self._ensure_newline()
            self._emit('```\n')
            self.in_pre = True

        elif tag == 'blockquote':
            self._ensure_newline()
            self._emit('\n> ')

        elif tag in self._LIST_TAGS:
            self.list_stack.append(tag)
            self._ensure_newline()

        elif tag == 'li':
            # NOTE(review): indent unit is a single space as shown in SOURCE;
            # the paste may have collapsed whitespace -- confirm.
            indent = ' ' * (len(self.list_stack) - 1)
            if self.list_stack and self.list_stack[-1] == 'ul':
                self._emit(f'{indent}- ')
            elif self.list_stack and self.list_stack[-1] == 'ol':
                self._emit(f'{indent}1. ')

        elif tag in self._HEADING_TAGS:
            # Membership test: comparing the tag to the set with == was
            # always False, so headings never received their '#' prefix.
            level = int(tag[1])
            self._emit("\n" + "#" * level + " ")

        self.current_tag_stack.append(tag)

    def handle_data(self, data: str) -> None:
        """Buffer text that sits inside an anchor; emit all other text."""
        if self.anchor_stack:
            self.anchor_data.append(data)
        else:
            self._emit(data)

    def _close_anchor(self) -> None:
        """Turn the buffered anchor text into a tag, mention or link token."""
        if not self.anchor_stack:
            # Stray </a> with no matching <a>: nothing was buffered.
            return

        href = self.anchor_stack.pop()
        anchor_data = ''.join(self.anchor_data)
        self.anchor_data = []

        if anchor_data.startswith('#'):
            # NOTE(review): a '#...' anchor that is not listed in self.tags
            # is dropped entirely (no token) -- confirm this is intended.
            if anchor_data[1:].lower() in self.tags:
                self.tokens.append(cross.TagToken(anchor_data[1:]))
        elif anchor_data.startswith('@'):
            # NOTE(review): an '@...' anchor matching no entry of
            # self.mentions is dropped as well -- confirm this is intended.
            match = next(
                (pair for pair in self.mentions if anchor_data in pair),
                None
            )
            if match:
                self.tokens.append(cross.MentionToken(match[1], ''))
        else:
            self.tokens.append(cross.LinkToken(href, anchor_data))

    def handle_endtag(self, tag: str) -> None:
        """Emit the closing marker for *tag*.

        End tags are ignored while no start tag is recorded as open.
        """
        if not self.current_tag_stack:
            return

        if tag in self.current_tag_stack:
            self.current_tag_stack.remove(tag)

        if tag == 'p':
            self._emit('\n\n')

        elif tag == 'a':
            self._close_anchor()

        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])

        elif tag == 'code':
            if not self.in_pre and self.in_code:
                self._emit('`')
                self.in_code = False

        elif tag == 'pre':
            self._emit('\n```\n')
            self.in_pre = False

        elif tag == 'blockquote':
            self._emit('\n')

        elif tag in self._LIST_TAGS:
            if self.list_stack:
                self.list_stack.pop()
            self._emit('\n')

        elif tag == 'li' or tag in self._HEADING_TAGS:
            self._emit('\n')

    def get_tokens(self) -> list[cross.Token]:
        """Return the collected tokens with adjacent text tokens merged.

        Consecutive ``cross.TextToken`` items are joined into one.  If the
        final token is text ending in two newlines, those two characters are
        removed.

        Returns:
            A new list; ``self.tokens`` itself is not modified.
        """
        if not self.tokens:
            return []

        combined: list[cross.Token] = []
        pending: list[str] = []

        def flush() -> None:
            if pending:
                combined.append(cross.TextToken(text=''.join(pending)))
                pending.clear()

        for token in self.tokens:
            if isinstance(token, cross.TextToken):
                pending.append(token.text)
            else:
                flush()
                combined.append(token)
        flush()

        last = combined[-1] if combined else None
        if isinstance(last, cross.TextToken) and last.text.endswith('\n\n'):
            combined[-1] = cross.TextToken(last.text[:-2])
        return combined

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.tokens = []

        self.mentions = []
        self.tags = []

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.anchor_stack = []
        # Also drop text buffered inside an anchor that was never closed,
        # so it cannot leak into the next parse.
        self.anchor_data = []
        self.list_stack = []