util/html_util.py at 31a048eee2609ca6de43733088c24d2b2da90fe9 · zenfyr.dev/xpost

zenfyr.dev / xpost
social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
xpost / util / html_util.py
at 31a048eee2609ca6de43733088c24d2b2da90fe9 6.3 kB view raw
  1from html.parser import HTMLParser
  2import cross
  3
  4class HTMLPostTokenizer(HTMLParser):
  5    def __init__(self) -> None:
  6        super().__init__()
  7        self.tokens: list[cross.Token] = []
  8        self.status: dict
  9        
 10        self.mentions: list[tuple[str, str]]
 11        self.tags: list[str]
 12        
 13        self.in_pre = False
 14        self.in_code = False
 15        
 16        self.current_tag_stack = []
 17        self.list_stack = []
 18        
 19        self.anchor_stack = []
 20        self.anchor_data = []
 21    
 22    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
 23        attrs_dict = dict(attrs)
 24        
 25        def append_newline():
 26            if self.tokens:
 27                last_token = self.tokens[-1]
 28                if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
 29                    self.tokens.append(cross.TextToken('\n'))
 30        
 31        if tag == 'br':
 32            self.tokens.append(cross.TextToken('  \n'))
 33        
 34        elif tag == 'a':
 35            href = attrs_dict.get('href', '')
 36            self.anchor_stack.append(href)
 37        
 38        elif tag == 'strong' or tag == 'b':
 39            self.tokens.append(cross.TextToken('**'))
 40        
 41        elif tag == 'em' or tag == 'i':
 42            self.tokens.append(cross.TextToken('*'))
 43            
 44        elif tag == 'del' or tag == 's':
 45            self.tokens.append(cross.TextToken('~~'))
 46            
 47        elif tag == 'code':
 48            if not self.in_pre:
 49                self.tokens.append(cross.TextToken('`'))
 50                self.in_code = True
 51        
 52        elif tag == 'pre':
 53            append_newline()
 54            self.tokens.append(cross.TextToken('```\n'))
 55            self.in_pre = True
 56                
 57        elif tag == 'blockquote':
 58            append_newline()
 59            self.tokens.append(cross.TextToken('\n> '))
 60        
 61        elif tag == 'ul':
 62            self.list_stack.append('ul')
 63            append_newline()
 64            
 65        elif tag == 'ol':
 66            self.list_stack.append('ol')
 67            append_newline()
 68        
 69        elif tag == 'li':
 70            indent = '  ' * (len(self.list_stack) - 1)
 71            if self.list_stack and self.list_stack[-1] == 'ul':
 72                self.tokens.append(cross.TextToken(f'{indent}- '))
 73            elif self.list_stack and self.list_stack[-1] == 'ol':
 74                self.tokens.append(cross.TextToken(f'{indent}1. '))
 75        
 76        elif tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
 77            level = int(tag[1])
 78            self.tokens.append(cross.TextToken("\n" + "#" * level + " "))
 79        
 80        self.current_tag_stack.append(tag)
 81    
 82    def handle_data(self, data: str) -> None:
 83        if self.anchor_stack:
 84            self.anchor_data.append(data)
 85        else:
 86            self.tokens.append(cross.TextToken(data))
 87    
 88    def handle_endtag(self, tag: str) -> None:
 89        if not self.current_tag_stack:
 90            return
 91        
 92        if tag in self.current_tag_stack:
 93            self.current_tag_stack.remove(tag)
 94        
 95        if tag == 'p':
 96            self.tokens.append(cross.TextToken('\n\n'))
 97            
 98        elif tag == 'a':
 99            href = self.anchor_stack.pop()
100            anchor_data = ''.join(self.anchor_data)
101            self.anchor_data = []
102            
103            if anchor_data.startswith('#'):
104                as_tag = anchor_data[1:].lower()
105                if any(as_tag == block for block in self.tags):
106                    self.tokens.append(cross.TagToken(anchor_data[1:]))
107            elif anchor_data.startswith('@'):
108                match = next(
109                    (pair for pair in self.mentions if anchor_data in pair),
110                    None
111                )
112                
113                if match:
114                    self.tokens.append(cross.MentionToken(match[1], ''))
115            else:
116                self.tokens.append(cross.LinkToken(href, anchor_data))
117        
118        elif tag == 'strong' or tag == 'b':
119            self.tokens.append(cross.TextToken('**'))
120        
121        elif tag == 'em' or tag == 'i':
122            self.tokens.append(cross.TextToken('*'))
123        
124        elif tag == 'del' or tag == 's':
125            self.tokens.append(cross.TextToken('~~'))
126        
127        elif tag == 'code':
128            if not self.in_pre and self.in_code:
129                self.tokens.append(cross.TextToken('`'))
130                self.in_code = False
131        
132        elif tag == 'pre':
133            self.tokens.append(cross.TextToken('\n```\n'))
134            self.in_pre = False
135
136        elif tag == 'blockquote':
137            self.tokens.append(cross.TextToken('\n'))
138        
139        elif tag == 'ul' or tag == 'ol':
140            if self.list_stack:
141                self.list_stack.pop()
142            self.tokens.append(cross.TextToken('\n'))
143        
144        elif tag == 'li':
145            self.tokens.append(cross.TextToken('\n'))
146        
147        elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
148            self.tokens.append(cross.TextToken('\n'))
149    
150    def get_tokens(self) -> list[cross.Token]:
151        if not self.tokens:
152            return []
153        
154        combined: list[cross.Token] = []
155        buffer: list[str] = []
156        
157        def flush_buffer():
158            if buffer:
159                merged = ''.join(buffer)
160                combined.append(cross.TextToken(text=merged))
161                buffer.clear()
162
163        for token in self.tokens:
164            if isinstance(token, cross.TextToken):
165                buffer.append(token.text)
166            else:
167                flush_buffer()
168                combined.append(token)
169                
170        flush_buffer()
171        
172        if combined and isinstance(combined[-1], cross.TextToken):
173            if combined[-1].text.endswith('\n\n'):
174                combined[-1] = cross.TextToken(combined[-1].text[:-2])
175        return combined
176    
177    def reset(self):
178        """Reset the parser state for reuse."""
179        super().reset()
180        self.tokens = []
181        
182        self.mentions = []
183        self.tags = []
184        
185        self.in_pre = False
186        self.in_code = False
187        
188        self.current_tag_stack = []
189        self.anchor_stack = []
190        self.list_stack = []