from html.parser import HTMLParser import cross class HTMLPostTokenizer(HTMLParser): def __init__(self) -> None: super().__init__() self.tokens: list[cross.Token] = [] self.mentions: list[tuple[str, str]] self.tags: list[str] self.in_pre = False self.in_code = False self.current_tag_stack = [] self.list_stack = [] self.anchor_stack = [] self.anchor_data = [] def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: attrs_dict = dict(attrs) def append_newline(): if self.tokens: last_token = self.tokens[-1] if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'): self.tokens.append(cross.TextToken('\n')) match tag: case 'br': self.tokens.append(cross.TextToken(' \n')) case 'a': href = attrs_dict.get('href', '') self.anchor_stack.append(href) case 'strong', 'b': self.tokens.append(cross.TextToken('**')) case 'em', 'i': self.tokens.append(cross.TextToken('*')) case 'del', 's': self.tokens.append(cross.TextToken('~~')) case 'code': if not self.in_pre: self.tokens.append(cross.TextToken('`')) self.in_code = True case 'pre': append_newline() self.tokens.append(cross.TextToken('```\n')) self.in_pre = True case 'blockquote': append_newline() self.tokens.append(cross.TextToken('> ')) case 'ul', 'ol': self.list_stack.append(tag) append_newline() case 'li': indent = ' ' * (len(self.list_stack) - 1) if self.list_stack and self.list_stack[-1] == 'ul': self.tokens.append(cross.TextToken(f'{indent}- ')) elif self.list_stack and self.list_stack[-1] == 'ol': self.tokens.append(cross.TextToken(f'{indent}1. ')) case _: if tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}: level = int(tag[1]) self.tokens.append(cross.TextToken("\n" + "#" * level + " ")) self.current_tag_stack.append(tag) def handle_data(self, data: str) -> None: if self.anchor_stack: self.anchor_data.append(data) else: self.tokens.append(cross.TextToken(data)) def handle_endtag(self, tag: str) -> None: if not self.current_tag_stack: return if tag in self.current_tag_stack: self.current_tag_stack.remove(tag) match tag: case 'p': self.tokens.append(cross.TextToken('\n\n')) case 'a': href = self.anchor_stack.pop() anchor_data = ''.join(self.anchor_data) self.anchor_data = [] if anchor_data.startswith('#'): as_tag = anchor_data[1:].lower() if any(as_tag == block for block in self.tags): self.tokens.append(cross.TagToken(anchor_data[1:])) elif anchor_data.startswith('@'): match = next( (pair for pair in self.mentions if anchor_data in pair), None ) if match: self.tokens.append(cross.MentionToken(match[1], '')) else: self.tokens.append(cross.LinkToken(href, anchor_data)) case 'strong', 'b': self.tokens.append(cross.TextToken('**')) case 'em', 'i': self.tokens.append(cross.TextToken('*')) case 'del', 's': self.tokens.append(cross.TextToken('~~')) case 'code': if not self.in_pre and self.in_code: self.tokens.append(cross.TextToken('`')) self.in_code = False case 'pre': self.tokens.append(cross.TextToken('\n```\n')) self.in_pre = False case 'blockquote': self.tokens.append(cross.TextToken('\n')) case 'ul', 'ol': if self.list_stack: self.list_stack.pop() self.tokens.append(cross.TextToken('\n')) case 'li': self.tokens.append(cross.TextToken('\n')) case _: if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: self.tokens.append(cross.TextToken('\n')) def get_tokens(self) -> list[cross.Token]: if not self.tokens: return [] combined: list[cross.Token] = [] buffer: list[str] = [] def flush_buffer(): if buffer: merged = ''.join(buffer) combined.append(cross.TextToken(text=merged)) buffer.clear() for token in self.tokens: if isinstance(token, cross.TextToken): buffer.append(token.text) else: flush_buffer() combined.append(token) flush_buffer() if combined and isinstance(combined[-1], cross.TextToken): if combined[-1].text.endswith('\n\n'): combined[-1] = cross.TextToken(combined[-1].text[:-2]) return combined def reset(self): """Reset the parser state for reuse.""" super().reset() self.tokens = [] self.mentions = [] self.tags = [] self.in_pre = False self.in_code = False self.current_tag_stack = [] self.anchor_stack = [] self.list_stack = []