Social media crossposting tool. 3rd time's the charm.
Crossposts between Mastodon, Misskey, and Bluesky.
from html.parser import HTMLParser
import cross


class HTMLPostTokenizer(HTMLParser):
    """Tokenizes an HTML post body into cross.Token objects: Markdown-style
    text runs plus mention, tag, and link tokens."""

    def __init__(self) -> None:
        super().__init__()
        self.tokens: list[cross.Token] = []

        # Post metadata used to resolve anchors: (handle, full address) pairs
        # and known hashtag names. Populated by the caller before feeding HTML.
        self.mentions: list[tuple[str, str]] = []
        self.tags: list[str] = []

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.list_stack = []

        self.anchor_stack = []
        self.anchor_data = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)

        def append_newline():
            # Only add a newline if the previous token is text that doesn't
            # already end with one.
            if self.tokens:
                last_token = self.tokens[-1]
                if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
                    self.tokens.append(cross.TextToken('\n'))

        match tag:
            case 'br':
                self.tokens.append(cross.TextToken(' \n'))
            case 'a':
                href = attrs_dict.get('href', '')
                self.anchor_stack.append(href)
            case 'strong' | 'b':
                self.tokens.append(cross.TextToken('**'))
            case 'em' | 'i':
                self.tokens.append(cross.TextToken('*'))
            case 'del' | 's':
                self.tokens.append(cross.TextToken('~~'))
            case 'code':
                if not self.in_pre:
                    self.tokens.append(cross.TextToken('`'))
                    self.in_code = True
            case 'pre':
                append_newline()
                self.tokens.append(cross.TextToken('```\n'))
                self.in_pre = True
            case 'blockquote':
                append_newline()
                self.tokens.append(cross.TextToken('> '))
            case 'ul' | 'ol':
                self.list_stack.append(tag)
                append_newline()
            case 'li':
                indent = ' ' * (len(self.list_stack) - 1)
                if self.list_stack and self.list_stack[-1] == 'ul':
                    self.tokens.append(cross.TextToken(f'{indent}- '))
                elif self.list_stack and self.list_stack[-1] == 'ol':
                    self.tokens.append(cross.TextToken(f'{indent}1. '))
            case _:
                if tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                    level = int(tag[1])
                    self.tokens.append(cross.TextToken('\n' + '#' * level + ' '))

        self.current_tag_stack.append(tag)

    def handle_data(self, data: str) -> None:
        # Text inside an <a> is buffered until the end tag so the whole anchor
        # can be classified as a tag, mention, or plain link.
        if self.anchor_stack:
            self.anchor_data.append(data)
        else:
            self.tokens.append(cross.TextToken(data))

    def handle_endtag(self, tag: str) -> None:
        if not self.current_tag_stack:
            return

        if tag in self.current_tag_stack:
            self.current_tag_stack.remove(tag)

        match tag:
            case 'p':
                self.tokens.append(cross.TextToken('\n\n'))
            case 'a':
                href = self.anchor_stack.pop()
                anchor_data = ''.join(self.anchor_data)
                self.anchor_data = []

                if anchor_data.startswith('#'):
                    # Only anchors matching a known hashtag become TagTokens.
                    as_tag = anchor_data[1:].lower()
                    if any(as_tag == block for block in self.tags):
                        self.tokens.append(cross.TagToken(anchor_data[1:]))
                elif anchor_data.startswith('@'):
                    # Resolve the handle against the known mention pairs.
                    match = next(
                        (pair for pair in self.mentions if anchor_data in pair),
                        None,
                    )

                    if match:
                        self.tokens.append(cross.MentionToken(match[1], ''))
                else:
                    self.tokens.append(cross.LinkToken(href, anchor_data))
            case 'strong' | 'b':
                self.tokens.append(cross.TextToken('**'))
            case 'em' | 'i':
                self.tokens.append(cross.TextToken('*'))
            case 'del' | 's':
                self.tokens.append(cross.TextToken('~~'))
            case 'code':
                if not self.in_pre and self.in_code:
                    self.tokens.append(cross.TextToken('`'))
                    self.in_code = False
            case 'pre':
                self.tokens.append(cross.TextToken('\n```\n'))
                self.in_pre = False
            case 'blockquote':
                self.tokens.append(cross.TextToken('\n'))
            case 'ul' | 'ol':
                if self.list_stack:
                    self.list_stack.pop()
                self.tokens.append(cross.TextToken('\n'))
            case 'li':
                self.tokens.append(cross.TextToken('\n'))
            case _:
                if tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                    self.tokens.append(cross.TextToken('\n'))

    def get_tokens(self) -> list[cross.Token]:
        if not self.tokens:
            return []

        # Merge consecutive TextTokens into single tokens.
        combined: list[cross.Token] = []
        buffer: list[str] = []

        def flush_buffer():
            if buffer:
                merged = ''.join(buffer)
                combined.append(cross.TextToken(text=merged))
                buffer.clear()

        for token in self.tokens:
            if isinstance(token, cross.TextToken):
                buffer.append(token.text)
            else:
                flush_buffer()
                combined.append(token)

        flush_buffer()

        # Trim the trailing paragraph break left by the final </p>.
        if combined and isinstance(combined[-1], cross.TextToken):
            if combined[-1].text.endswith('\n\n'):
                combined[-1] = cross.TextToken(combined[-1].text[:-2])
        return combined

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.tokens = []

        self.mentions = []
        self.tags = []

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.anchor_stack = []
        self.anchor_data = []
        self.list_stack = []
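
A minimal usage sketch, not part of the module above: the handle, address, hashtag, and HTML snippet are made-up examples, and the `cross` token classes are assumed to behave as they are used in HTMLPostTokenizer.

# Usage sketch (hypothetical values throughout).
tokenizer = HTMLPostTokenizer()
tokenizer.mentions = [('@alice', 'alice@example.social')]  # (handle, full address)
tokenizer.tags = ['fediverse']                             # lowercase hashtag names

tokenizer.feed(
    '<p>Hi <a href="https://example.social/@alice">@alice</a>, '
    'this went out via <a href="https://example.social/tags/fediverse">#fediverse</a>.</p>'
)
tokenizer.close()

tokens = tokenizer.get_tokens()
# Roughly: [TextToken('Hi '), MentionToken('alice@example.social', ''),
#           TextToken(', this went out via '), TagToken('fediverse'), TextToken('.')]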