from html.parser import HTMLParser
import cross
class HTMLPostTokenizer(HTMLParser):
    """Tokenize an HTML post body into a flat list of ``cross.Token``.

    Inline markup (bold, italic, strikethrough, code) is rendered as
    Markdown-flavoured ``TextToken``s; anchors are resolved against
    ``self.tags`` / ``self.mentions`` into ``TagToken`` / ``MentionToken``,
    falling back to ``LinkToken``.  Callers populate ``mentions`` and
    ``tags`` before feeding HTML, then collect output via ``get_tokens()``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.tokens: list[cross.Token] = []
        # BUG FIX: these three were bare annotations (no assignment), so the
        # attributes did not exist until a caller set them and handle_endtag
        # raised AttributeError on the first anchor.  Initialise them.
        self.status: dict = {}  # not read by this class; presumably set by the caller — TODO confirm
        self.mentions: list[tuple[str, str]] = []
        self.tags: list[str] = []
        self.in_pre = False   # inside a <pre> fenced code block
        self.in_code = False  # inside an inline <code> span
        self.current_tag_stack: list[str] = []  # all opened tags, for endtag matching
        self.list_stack: list[str] = []         # 'ul' / 'ol' nesting
        self.anchor_stack: list[str] = []       # hrefs of currently-open <a> tags
        self.anchor_data: list[str] = []        # text buffered inside the current <a>

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Emit the Markdown prefix for an opening HTML tag."""
        attrs_dict = dict(attrs)

        def append_newline() -> None:
            # Ensure block-level elements start on a fresh line.
            if self.tokens:
                last_token = self.tokens[-1]
                if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
                    self.tokens.append(cross.TextToken('\n'))

        if tag == 'br':
            self.tokens.append(cross.TextToken(' \n'))
        elif tag == 'a':
            # BUG FIX: `href` attribute values may be None; normalise to ''.
            self.anchor_stack.append(attrs_dict.get('href') or '')
        elif tag in ('strong', 'b'):
            self.tokens.append(cross.TextToken('**'))
        elif tag in ('em', 'i'):
            self.tokens.append(cross.TextToken('*'))
        elif tag in ('del', 's'):
            self.tokens.append(cross.TextToken('~~'))
        elif tag == 'code':
            # Inside <pre> the fence already covers the code; no backtick.
            if not self.in_pre:
                self.tokens.append(cross.TextToken('`'))
                self.in_code = True
        elif tag == 'pre':
            append_newline()
            self.tokens.append(cross.TextToken('```\n'))
            self.in_pre = True
        elif tag == 'blockquote':
            append_newline()
            self.tokens.append(cross.TextToken('\n> '))
        elif tag in ('ul', 'ol'):
            self.list_stack.append(tag)
            append_newline()
        elif tag == 'li':
            # Two spaces of indent per nesting level below the outermost list.
            indent = '  ' * (len(self.list_stack) - 1) if self.list_stack else ''
            if self.list_stack and self.list_stack[-1] == 'ol':
                self.tokens.append(cross.TextToken(f'{indent}1. '))
            else:
                self.tokens.append(cross.TextToken(f'{indent}- '))
        elif tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
            # BUG FIX: was `tag == {...}` (str compared to a set, always
            # False), so headings were never rendered.
            level = int(tag[1])
            self.tokens.append(cross.TextToken('\n' + '#' * level + ' '))
        self.current_tag_stack.append(tag)

    def handle_data(self, data: str) -> None:
        """Buffer text inside an open <a>; otherwise emit it directly."""
        if self.anchor_stack:
            self.anchor_data.append(data)
        else:
            self.tokens.append(cross.TextToken(data))

    def handle_endtag(self, tag: str) -> None:
        """Emit the Markdown suffix for a closing HTML tag."""
        if not self.current_tag_stack:
            # Closing tag with no recorded opener (malformed HTML): ignore.
            return
        if tag in self.current_tag_stack:
            self.current_tag_stack.remove(tag)

        if tag == 'p':
            self.tokens.append(cross.TextToken('\n\n'))
        elif tag == 'a':
            href = self.anchor_stack.pop()
            anchor_data = ''.join(self.anchor_data)
            self.anchor_data = []
            if anchor_data.startswith('#') and any(
                anchor_data[1:].lower() == known for known in self.tags
            ):
                # Known hashtag (case-insensitive match, original case kept).
                self.tokens.append(cross.TagToken(anchor_data[1:]))
            elif anchor_data.startswith('@') and (
                match := next(
                    (pair for pair in self.mentions if anchor_data in pair),
                    None,
                )
            ):
                self.tokens.append(cross.MentionToken(match[1], ''))
            else:
                # BUG FIX: unmatched #hashtags / @mentions previously fell
                # through without emitting anything, silently dropping the
                # anchor text.  Preserve it as a plain link instead.
                self.tokens.append(cross.LinkToken(href, anchor_data))
        elif tag in ('strong', 'b'):
            self.tokens.append(cross.TextToken('**'))
        elif tag in ('em', 'i'):
            self.tokens.append(cross.TextToken('*'))
        elif tag in ('del', 's'):
            self.tokens.append(cross.TextToken('~~'))
        elif tag == 'code':
            if not self.in_pre and self.in_code:
                self.tokens.append(cross.TextToken('`'))
            self.in_code = False
        elif tag == 'pre':
            self.tokens.append(cross.TextToken('\n```\n'))
            self.in_pre = False
        elif tag == 'blockquote':
            self.tokens.append(cross.TextToken('\n'))
        elif tag in ('ul', 'ol'):
            if self.list_stack:
                self.list_stack.pop()
            self.tokens.append(cross.TextToken('\n'))
        elif tag == 'li':
            self.tokens.append(cross.TextToken('\n'))
        elif tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
            self.tokens.append(cross.TextToken('\n'))

    def get_tokens(self) -> list[cross.Token]:
        """Return the token stream with adjacent TextTokens merged.

        A trailing paragraph break ('\\n\\n') left by the final </p> is
        stripped from the last text token.
        """
        if not self.tokens:
            return []
        combined: list[cross.Token] = []
        buffer: list[str] = []

        def flush_buffer() -> None:
            # Collapse buffered text fragments into one TextToken.
            if buffer:
                combined.append(cross.TextToken(text=''.join(buffer)))
                buffer.clear()

        for token in self.tokens:
            if isinstance(token, cross.TextToken):
                buffer.append(token.text)
            else:
                flush_buffer()
                combined.append(token)
        flush_buffer()

        if combined and isinstance(combined[-1], cross.TextToken):
            if combined[-1].text.endswith('\n\n'):
                combined[-1] = cross.TextToken(combined[-1].text[:-2])
        return combined

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.tokens = []
        self.status = {}
        self.mentions = []
        self.tags = []
        self.in_pre = False
        self.in_code = False
        self.current_tag_stack = []
        self.anchor_stack = []
        # BUG FIX: anchor_data was not cleared, leaking buffered anchor
        # text from a previous document into the next parse.
        self.anchor_data = []
        self.list_stack = []