Social media crossposting tool. 3rd time's the charm.

Topics: mastodon, misskey, crossposting, bluesky

from html.parser import HTMLParser
import cross

class HTMLPostTokenizer(HTMLParser):
    def __init__(self) -> None:
        super().__init__()
        self.tokens: list[cross.Token] = []

        # Expected to be populated by the caller with the source post's
        # mention pairs and hashtag names before any HTML is fed in.
        self.mentions: list[tuple[str, str]]
        self.tags: list[str]

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.list_stack = []

        self.anchor_stack = []
        self.anchor_data = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)

        def append_newline():
            # Make sure block-level markup starts on its own line.
            if self.tokens:
                last_token = self.tokens[-1]
                if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
                    self.tokens.append(cross.TextToken('\n'))

        match tag:
            case 'br':
                self.tokens.append(cross.TextToken(' \n'))
            case 'a':
                href = attrs_dict.get('href', '')
                self.anchor_stack.append(href)
            case 'strong' | 'b':
                self.tokens.append(cross.TextToken('**'))
            case 'em' | 'i':
                self.tokens.append(cross.TextToken('*'))
            case 'del' | 's':
                self.tokens.append(cross.TextToken('~~'))
            case 'code':
                if not self.in_pre:
                    self.tokens.append(cross.TextToken('`'))
                    self.in_code = True
            case 'pre':
                append_newline()
                self.tokens.append(cross.TextToken('```\n'))
                self.in_pre = True
            case 'blockquote':
                append_newline()
                self.tokens.append(cross.TextToken('> '))
            case 'ul' | 'ol':
                self.list_stack.append(tag)
                append_newline()
            case 'li':
                indent = ' ' * (len(self.list_stack) - 1)
                if self.list_stack and self.list_stack[-1] == 'ul':
                    self.tokens.append(cross.TextToken(f'{indent}- '))
                elif self.list_stack and self.list_stack[-1] == 'ol':
                    self.tokens.append(cross.TextToken(f'{indent}1. '))
            case _:
                if tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                    level = int(tag[1])
                    self.tokens.append(cross.TextToken("\n" + "#" * level + " "))

        self.current_tag_stack.append(tag)

    def handle_data(self, data: str) -> None:
        # Text inside an <a> is buffered until the closing tag decides
        # whether it becomes a tag, mention, or plain link token.
        if self.anchor_stack:
            self.anchor_data.append(data)
        else:
            self.tokens.append(cross.TextToken(data))

    def handle_endtag(self, tag: str) -> None:
        if not self.current_tag_stack:
            return

        if tag in self.current_tag_stack:
            self.current_tag_stack.remove(tag)

        match tag:
            case 'p':
                self.tokens.append(cross.TextToken('\n\n'))
            case 'a':
                href = self.anchor_stack.pop()
                anchor_data = ''.join(self.anchor_data)
                self.anchor_data = []

                if anchor_data.startswith('#'):
                    # Only emit a tag token for hashtags the post actually declares.
                    as_tag = anchor_data[1:].lower()
                    if as_tag in self.tags:
                        self.tokens.append(cross.TagToken(anchor_data[1:]))
                elif anchor_data.startswith('@'):
                    match = next(
                        (pair for pair in self.mentions if anchor_data in pair),
                        None
                    )

                    if match:
                        self.tokens.append(cross.MentionToken(match[1], ''))
                else:
                    self.tokens.append(cross.LinkToken(href, anchor_data))
            case 'strong' | 'b':
                self.tokens.append(cross.TextToken('**'))
            case 'em' | 'i':
                self.tokens.append(cross.TextToken('*'))
            case 'del' | 's':
                self.tokens.append(cross.TextToken('~~'))
            case 'code':
                if not self.in_pre and self.in_code:
                    self.tokens.append(cross.TextToken('`'))
                    self.in_code = False
            case 'pre':
                self.tokens.append(cross.TextToken('\n```\n'))
                self.in_pre = False
            case 'blockquote':
                self.tokens.append(cross.TextToken('\n'))
            case 'ul' | 'ol':
                if self.list_stack:
                    self.list_stack.pop()
                self.tokens.append(cross.TextToken('\n'))
            case 'li':
                self.tokens.append(cross.TextToken('\n'))
            case _:
                if tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                    self.tokens.append(cross.TextToken('\n'))

    def get_tokens(self) -> list[cross.Token]:
        if not self.tokens:
            return []

        combined: list[cross.Token] = []
        buffer: list[str] = []

        def flush_buffer():
            if buffer:
                merged = ''.join(buffer)
                combined.append(cross.TextToken(text=merged))
                buffer.clear()

        # Merge runs of adjacent text tokens into single tokens.
        for token in self.tokens:
            if isinstance(token, cross.TextToken):
                buffer.append(token.text)
            else:
                flush_buffer()
                combined.append(token)

        flush_buffer()

        # Trim the trailing blank line left by the final closing tag.
        if combined and isinstance(combined[-1], cross.TextToken):
            if combined[-1].text.endswith('\n\n'):
                combined[-1] = cross.TextToken(combined[-1].text[:-2])
        return combined

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.tokens = []

        self.mentions = []
        self.tags = []

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.anchor_stack = []
        self.anchor_data = []
        self.list_stack = []
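
Roughly how the tokenizer is meant to be driven, as a minimal sketch. It assumes `cross` exposes the token classes referenced above; the handle pair in `mentions`, the hashtag, and the example.social URL are made-up placeholders standing in for the source post's real metadata.

parser = HTMLPostTokenizer()

# Assumed shape of the metadata: (anchor text, full handle) pairs and lowercase tag names.
parser.mentions = [('@alice', 'alice@example.social')]
parser.tags = ['crossposting']

parser.feed(
    '<p>Hello <strong>world</strong> '
    '<a href="https://example.social/tags/crossposting">#crossposting</a></p>'
)
parser.close()

tokens = parser.get_tokens()
# -> roughly [TextToken('Hello **world** '), TagToken('crossposting'), TextToken('')]

Since reset() clears mentions and tags along with the rest of the state, they have to be re-assigned after every reset and before the next feed() when the parser is reused across posts.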