social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
at next 4.8 kB view raw
1from html.parser import HTMLParser 2from typing import override 3 4from cross.tokens import LinkToken, TextToken, Token 5from util.splitter import canonical_label 6 7 8class HTMLToTokensParser(HTMLParser): 9 def __init__(self) -> None: 10 super().__init__() 11 self.tokens: list[Token] = [] 12 13 self._tag_stack: dict[str, tuple[str, dict[str, str | None]]] = {} 14 self.in_pre: bool = False 15 self.in_code: bool = False 16 self.invisible: bool = False 17 18 def handle_a_endtag(self): 19 label, _attr = self._tag_stack.pop("a") 20 21 href = _attr.get("href") 22 if href: 23 if canonical_label(label, href): 24 self.tokens.append(LinkToken(href=href)) 25 else: 26 self.tokens.append(LinkToken(href=href, label=label)) 27 28 def append_text(self, text: str): 29 self.tokens.append(TextToken(text=text)) 30 31 def append_newline(self): 32 if self.tokens: 33 last_token = self.tokens[-1] 34 if isinstance(last_token, TextToken) and not last_token.text.endswith("\n"): 35 self.tokens.append(TextToken(text="\n")) 36 37 @override 38 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: 39 _attr = dict(attrs) 40 41 if self.invisible: 42 return 43 44 match tag: 45 case "p": 46 cls = _attr.get("class", "") 47 if cls and "quote-inline" in cls: 48 self.invisible = True 49 case "a": 50 self._tag_stack["a"] = ("", _attr) 51 case "code": 52 if not self.in_pre: 53 self.append_text("`") 54 self.in_code = True 55 case "pre": 56 self.append_newline() 57 self.append_text("```\n") 58 self.in_pre = True 59 case "blockquote": 60 self.append_newline() 61 self.append_text("> ") 62 case "strong" | "b": 63 self.append_text("**") 64 case "em" | "i": 65 self.append_text("*") 66 case "del" | "s": 67 self.append_text("~~") 68 case "br": 69 self.append_text("\n") 70 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6": 71 level = int(tag[1]) 72 self.append_text("\n" + "#" * level + " ") 73 case _: 74 # self.builder.extend(f"<{tag}>".encode("utf-8")) 75 pass 76 77 @override 78 def handle_endtag(self, tag: str) -> None: 79 if self.invisible: 80 if tag == "p": 81 self.invisible = False 82 return 83 84 match tag: 85 case "a": 86 if "a" in self._tag_stack: 87 self.handle_a_endtag() 88 case "code": 89 if not self.in_pre and self.in_code: 90 self.append_text("`") 91 self.in_code = False 92 case "pre": 93 self.append_newline() 94 self.append_text("```\n") 95 self.in_pre = False 96 case "blockquote": 97 self.append_text("\n") 98 case "strong" | "b": 99 self.append_text("**") 100 case "em" | "i": 101 self.append_text("*") 102 case "del" | "s": 103 self.append_text("~~") 104 case "p": 105 self.append_text("\n\n") 106 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6": 107 self.append_text("\n") 108 case _: 109 # self.builder.extend(f"</{tag}>".encode("utf-8")) 110 pass 111 112 @override 113 def handle_data(self, data: str) -> None: 114 if self.invisible: 115 return 116 117 if self._tag_stack.get('a'): 118 label, _attr = self._tag_stack.pop("a") 119 self._tag_stack["a"] = (label + data, _attr) 120 return 121 122 def get_result(self) -> list[Token]: 123 if not self.tokens: 124 return [] 125 126 combined: list[Token] = [] 127 buffer: list[str] = [] 128 129 def flush_buffer(): 130 if buffer: 131 merged = "".join(buffer) 132 combined.append(TextToken(text=merged)) 133 buffer.clear() 134 135 for token in self.tokens: 136 if isinstance(token, TextToken): 137 buffer.append(token.text) 138 else: 139 flush_buffer() 140 combined.append(token) 141 142 flush_buffer() 143 144 if combined and isinstance(combined[-1], TextToken): 145 if combined[-1].text.endswith("\n\n"): 146 combined[-1] = TextToken(text=combined[-1].text[:-2]) 147 148 if combined[-1].text.endswith("\n"): 149 combined[-1] = TextToken(text=combined[-1].text[:-1]) 150 return combined