social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
from html.parser import HTMLParser

import cross


class HTMLPostTokenizer(HTMLParser):
    """Turn a post's HTML body into a flat list of ``cross`` tokens.

    Formatting elements are rendered as Markdown-style markers inside
    ``cross.TextToken`` objects, while anchors become ``cross.TagToken``,
    ``cross.MentionToken`` or ``cross.LinkToken`` objects.

    Typical use: assign ``mentions`` and ``tags`` for the post, call
    ``feed(html)``, read the result with ``get_tokens()``, and call
    ``reset()`` before reusing the instance (``reset()`` also clears
    ``mentions`` and ``tags``).
    """

    # Inline formatting elements and the marker emitted on both open and close.
    _INLINE_MARKERS = {
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
        'del': '~~',
        's': '~~',
    }

    # Heading elements; the digit in the tag name is the heading level.
    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})

    def __init__(self) -> None:
        # HTMLParser.__init__ calls self.reset(), so the overridden reset()
        # below has already run once by the time super().__init__() returns
        # (that is what gives ``mentions`` and ``tags`` their initial lists).
        super().__init__()
        self.tokens: list[cross.Token] = []
        # Declared for callers; never assigned by this class.
        self.status: dict

        # Pairs searched when resolving '@...' anchors; element [1] of the
        # matching pair is what gets passed to cross.MentionToken.
        self.mentions: list[tuple[str, str]]
        # Known hashtag names, compared against the lower-cased anchor text.
        self.tags: list[str]

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.list_stack = []

        self.anchor_stack = []
        self.anchor_data = []

    def _append_newline(self) -> None:
        """Emit a newline unless the previous text token already ends with one.

        Nothing is emitted when there are no tokens yet or when the previous
        token is not a ``cross.TextToken``.
        """
        if not self.tokens:
            return
        last_token = self.tokens[-1]
        if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
            self.tokens.append(cross.TextToken('\n'))

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Emit the opening marker for *tag* and record it as open.

        Args:
            tag: Element name as reported by ``HTMLParser``.
            attrs: ``(name, value)`` attribute pairs; only ``href`` on ``<a>``
                is used.
        """
        attrs_dict = dict(attrs)

        if tag == 'br':
            # NOTE(review): a Markdown hard break needs two trailing spaces;
            # confirm the single space here is intentional.
            self.tokens.append(cross.TextToken(' \n'))

        elif tag == 'a':
            # A valueless ``href`` attribute is reported as None; normalise it
            # so the link token always receives a string.
            self.anchor_stack.append(attrs_dict.get('href') or '')

        elif tag in self._INLINE_MARKERS:
            self.tokens.append(cross.TextToken(self._INLINE_MARKERS[tag]))

        elif tag == 'code':
            # Inside <pre> the fence already marks the code, so no backtick.
            if not self.in_pre:
                self.tokens.append(cross.TextToken('`'))
                self.in_code = True

        elif tag == 'pre':
            self._append_newline()
            self.tokens.append(cross.TextToken('```\n'))
            self.in_pre = True

        elif tag == 'blockquote':
            self._append_newline()
            self.tokens.append(cross.TextToken('\n> '))

        elif tag == 'ul' or tag == 'ol':
            self.list_stack.append(tag)
            self._append_newline()

        elif tag == 'li':
            # One indent unit per nesting level below the outermost list.
            # NOTE(review): confirm the indent unit is meant to be one space.
            indent = ' ' * (len(self.list_stack) - 1)
            if self.list_stack and self.list_stack[-1] == 'ul':
                self.tokens.append(cross.TextToken(f'{indent}- '))
            elif self.list_stack and self.list_stack[-1] == 'ol':
                self.tokens.append(cross.TextToken(f'{indent}1. '))

        elif tag in self._HEADING_TAGS:
            # Membership test: comparing the tag string to the whole set with
            # ``==`` is always False, which silently dropped every heading.
            level = int(tag[1])
            self.tokens.append(cross.TextToken("\n" + "#" * level + " "))

        self.current_tag_stack.append(tag)

    def handle_data(self, data: str) -> None:
        """Buffer text found inside an anchor; emit any other text directly."""
        if self.anchor_stack:
            self.anchor_data.append(data)
        else:
            self.tokens.append(cross.TextToken(data))

    def _close_anchor(self) -> None:
        """Turn the buffered anchor text into a tag, mention or link token."""
        if not self.anchor_stack:
            # Stray </a> without a matching <a>: nothing to close.
            return

        href = self.anchor_stack.pop()
        label = ''.join(self.anchor_data)
        self.anchor_data = []

        if label.startswith('#'):
            name = label[1:]
            if name.lower() in self.tags:
                self.tokens.append(cross.TagToken(name))
                return
        elif label.startswith('@'):
            match = next(
                (pair for pair in self.mentions if label in pair),
                None
            )
            if match:
                self.tokens.append(cross.MentionToken(match[1], ''))
                return

        # Ordinary links, plus '#'/'@' anchors that match no known tag or
        # mention: keep them as plain links instead of dropping their text.
        self.tokens.append(cross.LinkToken(href, label))

    def handle_endtag(self, tag: str) -> None:
        """Emit the closing marker for *tag*.

        End tags are ignored entirely while no start tag is recorded as open.
        """
        if not self.current_tag_stack:
            return

        if tag in self.current_tag_stack:
            self.current_tag_stack.remove(tag)

        if tag == 'p':
            self.tokens.append(cross.TextToken('\n\n'))

        elif tag == 'a':
            self._close_anchor()

        elif tag in self._INLINE_MARKERS:
            self.tokens.append(cross.TextToken(self._INLINE_MARKERS[tag]))

        elif tag == 'code':
            # Only close a backtick that handle_starttag actually opened.
            if not self.in_pre and self.in_code:
                self.tokens.append(cross.TextToken('`'))
                self.in_code = False

        elif tag == 'pre':
            self.tokens.append(cross.TextToken('\n```\n'))
            self.in_pre = False

        elif tag == 'blockquote':
            self.tokens.append(cross.TextToken('\n'))

        elif tag == 'ul' or tag == 'ol':
            if self.list_stack:
                self.list_stack.pop()
            self.tokens.append(cross.TextToken('\n'))

        elif tag == 'li':
            self.tokens.append(cross.TextToken('\n'))

        elif tag in self._HEADING_TAGS:
            self.tokens.append(cross.TextToken('\n'))

    def get_tokens(self) -> list[cross.Token]:
        """Return the collected tokens with adjacent text tokens merged.

        A trailing ``'\\n\\n'`` on the final text token (as produced by a
        closing ``</p>``) is removed.

        Returns:
            A new list; the parser's own ``tokens`` list is left untouched.
        """
        if not self.tokens:
            return []

        combined: list[cross.Token] = []
        buffer: list[str] = []

        def flush_buffer() -> None:
            # Collapse the pending run of text fragments into one TextToken.
            if buffer:
                combined.append(cross.TextToken(text=''.join(buffer)))
                buffer.clear()

        for token in self.tokens:
            if isinstance(token, cross.TextToken):
                buffer.append(token.text)
            else:
                flush_buffer()
                combined.append(token)

        flush_buffer()

        if combined and isinstance(combined[-1], cross.TextToken):
            if combined[-1].text.endswith('\n\n'):
                combined[-1] = cross.TextToken(combined[-1].text[:-2])
        return combined

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.tokens = []

        self.mentions = []
        self.tags = []

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.anchor_stack = []
        # Also drop text buffered inside an unclosed <a>, so it cannot leak
        # into the next document parsed with this instance.
        self.anchor_data = []
        self.list_stack = []