social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
from html.parser import HTMLParser

import cross


class HTMLPostTokenizer(HTMLParser):
    """Tokenize an HTML post body into cross.Token objects, rendering inline
    formatting (bold, italics, code, lists, headings, ...) as Markdown-style
    text tokens."""

    def __init__(self) -> None:
        super().__init__()
        self.tokens: list[cross.Token] = []

        # Filled in by the caller before feed(); reset() clears them.
        self.mentions: list[tuple[str, str]]
        self.tags: list[str]

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.list_stack = []

        self.anchor_stack = []
        self.anchor_data = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)

        def append_newline():
            # Make sure block-level elements start on a fresh line.
            if self.tokens:
                last_token = self.tokens[-1]
                if isinstance(
                    last_token, cross.TextToken
                ) and not last_token.text.endswith("\n"):
                    self.tokens.append(cross.TextToken("\n"))

        match tag:
            case "br":
                self.tokens.append(cross.TextToken(" \n"))
            case "a":
                href = attrs_dict.get("href", "")
                self.anchor_stack.append(href)
            case "strong" | "b":
                self.tokens.append(cross.TextToken("**"))
            case "em" | "i":
                self.tokens.append(cross.TextToken("*"))
            case "del" | "s":
                self.tokens.append(cross.TextToken("~~"))
            case "code":
                if not self.in_pre:
                    self.tokens.append(cross.TextToken("`"))
                    self.in_code = True
            case "pre":
                append_newline()
                self.tokens.append(cross.TextToken("```\n"))
                self.in_pre = True
            case "blockquote":
                append_newline()
                self.tokens.append(cross.TextToken("> "))
            case "ul" | "ol":
                self.list_stack.append(tag)
                append_newline()
            case "li":
                indent = " " * (len(self.list_stack) - 1)
                if self.list_stack and self.list_stack[-1] == "ul":
                    self.tokens.append(cross.TextToken(f"{indent}- "))
                elif self.list_stack and self.list_stack[-1] == "ol":
                    self.tokens.append(cross.TextToken(f"{indent}1. "))
            case _:
                if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
                    level = int(tag[1])
                    self.tokens.append(cross.TextToken("\n" + "#" * level + " "))

        self.current_tag_stack.append(tag)

    def handle_data(self, data: str) -> None:
        # Text inside an anchor is buffered until the closing </a> so it can
        # be classified as a hashtag, mention, or plain link.
        if self.anchor_stack:
            self.anchor_data.append(data)
        else:
            self.tokens.append(cross.TextToken(data))

    def handle_endtag(self, tag: str) -> None:
        if not self.current_tag_stack:
            return

        if tag in self.current_tag_stack:
            self.current_tag_stack.remove(tag)

        match tag:
            case "p":
                self.tokens.append(cross.TextToken("\n\n"))
            case "a":
                href = self.anchor_stack.pop()
                anchor_data = "".join(self.anchor_data)
                self.anchor_data = []

                # Known hashtags and mentions become dedicated tokens,
                # unknown ones are dropped, everything else becomes a link.
                if anchor_data.startswith("#"):
                    as_tag = anchor_data[1:].lower()
                    if any(as_tag == block for block in self.tags):
                        self.tokens.append(cross.TagToken(anchor_data[1:]))
                elif anchor_data.startswith("@"):
                    match = next(
                        (pair for pair in self.mentions if anchor_data in pair), None
                    )

                    if match:
                        self.tokens.append(cross.MentionToken(match[1], ""))
                else:
                    self.tokens.append(cross.LinkToken(href, anchor_data))
            case "strong" | "b":
                self.tokens.append(cross.TextToken("**"))
            case "em" | "i":
                self.tokens.append(cross.TextToken("*"))
            case "del" | "s":
                self.tokens.append(cross.TextToken("~~"))
            case "code":
                if not self.in_pre and self.in_code:
                    self.tokens.append(cross.TextToken("`"))
                    self.in_code = False
            case "pre":
                self.tokens.append(cross.TextToken("\n```\n"))
                self.in_pre = False
            case "blockquote":
                self.tokens.append(cross.TextToken("\n"))
            case "ul" | "ol":
                if self.list_stack:
                    self.list_stack.pop()
                self.tokens.append(cross.TextToken("\n"))
            case "li":
                self.tokens.append(cross.TextToken("\n"))
            case _:
                if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
                    self.tokens.append(cross.TextToken("\n"))

    def get_tokens(self) -> list[cross.Token]:
        """Merge adjacent text tokens and trim the trailing paragraph break."""
        if not self.tokens:
            return []

        combined: list[cross.Token] = []
        buffer: list[str] = []

        def flush_buffer():
            if buffer:
                merged = "".join(buffer)
                combined.append(cross.TextToken(text=merged))
                buffer.clear()

        for token in self.tokens:
            if isinstance(token, cross.TextToken):
                buffer.append(token.text)
            else:
                flush_buffer()
                combined.append(token)

        flush_buffer()

        if combined and isinstance(combined[-1], cross.TextToken):
            if combined[-1].text.endswith("\n\n"):
                combined[-1] = cross.TextToken(combined[-1].text[:-2])
        return combined

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.tokens = []

        self.mentions = []
        self.tags = []

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.anchor_stack = []
        self.anchor_data = []
        self.list_stack = []
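A rough usage sketch, assuming the `cross` module in this repo provides the `Token` classes referenced above. The handle, tag, and HTML below are made-up examples, and the `(anchor text, full handle)` layout of each mentions pair is only inferred from how `handle_endtag()` reads `match[1]`:

# Illustrative sketch only: the sample handle, tag, and HTML are hypothetical.
parser = HTMLPostTokenizer()
parser.mentions = [("@alice", "@alice@example.social")]  # assumed pair layout
parser.tags = ["fediverse"]
parser.feed(
    '<p>Hi <a href="https://example.social/@alice">@alice</a>, '
    'tagged <a href="https://example.social/tags/fediverse">#fediverse</a></p>'
)
parser.close()
tokens = parser.get_tokens()
# tokens is now a flat list mixing TextToken, MentionToken, and TagToken
# objects, with adjacent text runs merged by get_tokens().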