from html.parser import HTMLParser import cross class HTMLPostTokenizer(HTMLParser): def __init__(self) -> None: super().__init__() self.tokens: list[cross.Token] = [] self.mentions: list[tuple[str, str]] self.tags: list[str] self.in_pre = False self.in_code = False self.current_tag_stack = [] self.list_stack = [] self.anchor_stack = [] self.anchor_data = [] def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: attrs_dict = dict(attrs) def append_newline(): if self.tokens: last_token = self.tokens[-1] if isinstance( last_token, cross.TextToken ) and not last_token.text.endswith("\n"): self.tokens.append(cross.TextToken("\n")) match tag: case "br": self.tokens.append(cross.TextToken(" \n")) case "a": href = attrs_dict.get("href", "") self.anchor_stack.append(href) case "strong", "b": self.tokens.append(cross.TextToken("**")) case "em", "i": self.tokens.append(cross.TextToken("*")) case "del", "s": self.tokens.append(cross.TextToken("~~")) case "code": if not self.in_pre: self.tokens.append(cross.TextToken("`")) self.in_code = True case "pre": append_newline() self.tokens.append(cross.TextToken("```\n")) self.in_pre = True case "blockquote": append_newline() self.tokens.append(cross.TextToken("> ")) case "ul", "ol": self.list_stack.append(tag) append_newline() case "li": indent = " " * (len(self.list_stack) - 1) if self.list_stack and self.list_stack[-1] == "ul": self.tokens.append(cross.TextToken(f"{indent}- ")) elif self.list_stack and self.list_stack[-1] == "ol": self.tokens.append(cross.TextToken(f"{indent}1. ")) case _: if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}: level = int(tag[1]) self.tokens.append(cross.TextToken("\n" + "#" * level + " ")) self.current_tag_stack.append(tag) def handle_data(self, data: str) -> None: if self.anchor_stack: self.anchor_data.append(data) else: self.tokens.append(cross.TextToken(data)) def handle_endtag(self, tag: str) -> None: if not self.current_tag_stack: return if tag in self.current_tag_stack: self.current_tag_stack.remove(tag) match tag: case "p": self.tokens.append(cross.TextToken("\n\n")) case "a": href = self.anchor_stack.pop() anchor_data = "".join(self.anchor_data) self.anchor_data = [] if anchor_data.startswith("#"): as_tag = anchor_data[1:].lower() if any(as_tag == block for block in self.tags): self.tokens.append(cross.TagToken(anchor_data[1:])) elif anchor_data.startswith("@"): match = next( (pair for pair in self.mentions if anchor_data in pair), None ) if match: self.tokens.append(cross.MentionToken(match[1], "")) else: self.tokens.append(cross.LinkToken(href, anchor_data)) case "strong", "b": self.tokens.append(cross.TextToken("**")) case "em", "i": self.tokens.append(cross.TextToken("*")) case "del", "s": self.tokens.append(cross.TextToken("~~")) case "code": if not self.in_pre and self.in_code: self.tokens.append(cross.TextToken("`")) self.in_code = False case "pre": self.tokens.append(cross.TextToken("\n```\n")) self.in_pre = False case "blockquote": self.tokens.append(cross.TextToken("\n")) case "ul", "ol": if self.list_stack: self.list_stack.pop() self.tokens.append(cross.TextToken("\n")) case "li": self.tokens.append(cross.TextToken("\n")) case _: if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: self.tokens.append(cross.TextToken("\n")) def get_tokens(self) -> list[cross.Token]: if not self.tokens: return [] combined: list[cross.Token] = [] buffer: list[str] = [] def flush_buffer(): if buffer: merged = "".join(buffer) combined.append(cross.TextToken(text=merged)) buffer.clear() for token in self.tokens: if isinstance(token, cross.TextToken): buffer.append(token.text) else: flush_buffer() combined.append(token) flush_buffer() if combined and isinstance(combined[-1], cross.TextToken): if combined[-1].text.endswith("\n\n"): combined[-1] = cross.TextToken(combined[-1].text[:-2]) return combined def reset(self): """Reset the parser state for reuse.""" super().reset() self.tokens = [] self.mentions = [] self.tags = [] self.in_pre = False self.in_code = False self.current_tag_stack = [] self.anchor_stack = [] self.list_stack = []