···
1
+
from html.parser import HTMLParser
4
+
class HTMLPostTokenizer(HTMLParser):
5
+
# Constructor: creates the mutable state the handle_* callbacks work on.
# NOTE(review): this listing has lost its indentation and skips some source
# lines, so only the statements visible here are described.
# NOTE(review): handle_endtag reads self.in_pre and self.status, but no
# initialisation of either is visible here -- confirm both are set (possibly
# on a line that is not shown).
def __init__(self) -> None:
7
+
# Output list; the handler methods append cross.* tokens to it.
self.tokens: list[cross.Token] = []
11
+
# Flag checked and cleared by handle_endtag when a 'code' end tag arrives.
self.in_code = False
13
+
# Tag names appended by handle_starttag and removed again by handle_endtag.
self.current_tag_stack = []
14
+
# 'ul' / 'ol' markers for the lists currently open; read when a list item
# prefix is emitted and popped when a list closes.
self.list_stack = []
16
+
# href values appended by handle_starttag and popped by handle_endtag.
self.anchor_stack = []
17
+
# Text chunks collected by handle_data while anchor_stack is non-empty.
self.anchor_data = []
19
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
20
+
attrs_dict = dict(attrs)
23
+
self.tokens.append(cross.TextToken(' \n'))
26
+
href = attrs_dict.get('href', '')
27
+
self.anchor_stack.append(href)
29
+
elif tag == 'strong' or tag == 'b':
30
+
self.tokens.append(cross.TextToken('**'))
32
+
elif tag == 'em' or tag == 'i':
33
+
self.tokens.append(cross.TextToken('*'))
35
+
elif tag == 'del' or tag == 's':
36
+
self.tokens.append(cross.TextToken('~~'))
40
+
self.tokens.append(cross.TextToken('`'))
45
+
last_token = self.tokens[-1]
46
+
if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
47
+
self.tokens.append(cross.TextToken('\n'))
49
+
self.tokens.append(cross.TextToken('```\n'))
52
+
elif tag == 'blockquote':
54
+
last_token = self.tokens[-1]
55
+
if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
56
+
self.tokens.append(cross.TextToken('\n'))
58
+
self.tokens.append(cross.TextToken('\n> '))
61
+
self.list_stack.append('ul')
62
+
self.tokens.append(cross.TextToken('\n'))
65
+
self.list_stack.append('ol')
66
+
self.tokens.append(cross.TextToken('\n'))
69
+
indent = ' ' * (len(self.list_stack) - 1)
70
+
if self.list_stack and self.list_stack[-1] == 'ul':
71
+
self.tokens.append(cross.TextToken(f'{indent}- '))
72
+
elif self.list_stack and self.list_stack[-1] == 'ol':
73
+
self.tokens.append(cross.TextToken(f'{indent}1. '))
75
+
elif tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
77
+
self.tokens.append(cross.TextToken("\n" + "#" * level + " "))
79
+
self.current_tag_stack.append(tag)
81
+
# HTMLParser callback for text content.
# While anchor_stack is non-empty the text is collected in self.anchor_data;
# handle_endtag later joins those chunks into the anchor's text.
# NOTE(review): the line between the two appends is not shown in this
# listing, so it is unclear whether anchor text is also emitted as a
# TextToken below or the method returns/branches first -- confirm.
def handle_data(self, data: str) -> None:
82
+
if self.anchor_stack:
83
+
self.anchor_data.append(data)
85
+
self.tokens.append(cross.TextToken(data))
87
+
# HTMLParser callback for a closing tag: emits the closing Markdown marker
# for the tag and resolves anchors into tag / mention / link tokens.
# NOTE(review): this listing has lost its indentation and skips some source
# lines (several branch headers and the bodies of some guards are not
# shown), so comments describe only the visible statements.
def handle_endtag(self, tag: str) -> None:
88
+
# Guard for an end tag arriving with no open tags; its body is not shown.
if not self.current_tag_stack:
91
+
# list.remove() drops the first matching entry, i.e. the outermost open tag
# of that name when the same tag is nested.
if tag in self.current_tag_stack:
92
+
self.current_tag_stack.remove(tag)
95
+
# Blank line after the element (branch header not shown; presumably </p>).
self.tokens.append(cross.TextToken('\n\n'))
98
+
# Anchor handling: take the href pushed by handle_starttag and the text
# gathered by handle_data (branch header not shown; presumably </a>).
# NOTE(review): pop() raises IndexError on an empty list -- confirm a guard
# exists on a line that is not shown.
href = self.anchor_stack.pop()
99
+
anchor_data = ''.join(self.anchor_data)
101
+
# '#...' anchor text: emit a TagToken when the lower-cased name matches the
# 'name' of an entry in self.status['tags'].
if anchor_data.startswith('#'):
102
+
tags: list[dict] = self.status.get('tags', [])
104
+
as_tag = anchor_data[1:].lower()
105
+
if any(as_tag == block.get('name') for block in tags):
106
+
self.tokens.append(cross.TagToken(as_tag))
107
+
# '@...' anchor text: look for a matching entry in self.status['mentions'],
# first by url == href, otherwise by acct / username == the text after '@'.
elif anchor_data.startswith('@'):
108
+
mentions: list[dict] = self.status.get('mentions', [])
110
+
as_mention = anchor_data[1:]
111
+
for block in mentions:
112
+
if href == block.get('url'):
113
+
self.tokens.append(cross.MentionToken(block['acct'], block['url']))
115
+
# NOTE(review): the lines following each MentionToken append are not shown;
# confirm the loop stops after the first match, otherwise several tokens
# (plus the LinkToken below) could be emitted for one anchor.
elif as_mention == block.get('acct') or as_mention == block.get('username'):
116
+
self.tokens.append(cross.MentionToken(block['acct'], block['url']))
119
+
# Ordinary link: target plus collected anchor text.
self.tokens.append(cross.LinkToken(href, anchor_data))
121
+
elif tag == 'strong' or tag == 'b':
122
+
self.tokens.append(cross.TextToken('**'))
124
+
elif tag == 'em' or tag == 'i':
125
+
self.tokens.append(cross.TextToken('*'))
127
+
elif tag == 'del' or tag == 's':
128
+
self.tokens.append(cross.TextToken('~~'))
130
+
elif tag == 'code':
131
+
# Closing backtick only when in_code is set and in_pre is not.
if not self.in_pre and self.in_code:
132
+
self.tokens.append(cross.TextToken('`'))
133
+
self.in_code = False
136
+
# Closing code fence (branch header not shown; presumably </pre>).
self.tokens.append(cross.TextToken('\n```\n'))
137
+
self.in_pre = False
139
+
elif tag == 'blockquote':
140
+
self.tokens.append(cross.TextToken('\n'))
142
+
elif tag == 'ul' or tag == 'ol':
143
+
# Close the innermost list, if one is open.
if self.list_stack:
144
+
self.list_stack.pop()
145
+
self.tokens.append(cross.TextToken('\n'))
148
+
# Newline for a branch whose header is not shown in this listing.
self.tokens.append(cross.TextToken('\n'))
150
+
elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
151
+
self.tokens.append(cross.TextToken('\n'))
153
+
# Builds the `combined` token list from self.tokens: TextToken texts are
# gathered in `buffer` and joined into a single TextToken by flush_buffer,
# other tokens are appended as they are, and a trailing '\n\n' on the last
# text token is trimmed.
# NOTE(review): this listing has lost its indentation and skips some source
# lines (the early-return body, the calls to flush_buffer and the final
# return are not shown), so comments describe only the visible statements.
def get_tokens(self) -> list[cross.Token]:
154
+
# Guard for the case where nothing was tokenized; its body is not shown.
if not self.tokens:
157
+
combined: list[cross.Token] = []
158
+
# Text pieces waiting to be joined into one TextToken.
buffer: list[str] = []
160
+
# Local helper: joins the buffered pieces into one TextToken on `combined`.
def flush_buffer():
162
+
merged = ''.join(buffer)
163
+
combined.append(cross.TextToken(text=merged))
166
+
for token in self.tokens:
167
+
if isinstance(token, cross.TextToken):
168
+
buffer.append(token.text)
171
+
# Non-text tokens go to the output as they are.
combined.append(token)
175
+
# Drop one trailing blank line ('\n\n') from the final text token.
if combined and isinstance(combined[-1], cross.TextToken):
176
+
if combined[-1].text.endswith('\n\n'):
177
+
combined[-1] = cross.TextToken(combined[-1].text[:-2])
181
+
"""Reset the parser state for reuse."""
186
+
self.in_pre = False
187
+
self.in_code = False
189
+
self.current_tag_stack = []
190
+
self.anchor_stack = []
191
+
self.list_stack = []