Social media crossposting tool. 3rd time's the charm.

Topics: mastodon, misskey, crossposting, bluesky

from html.parser import HTMLParser
import cross

class HTMLPostTokenizer(HTMLParser):
    def __init__(self) -> None:
        super().__init__()
        self.tokens: list[cross.Token] = []

        # Expected to be populated by the caller with the source post's
        # mention pairs and hashtag names before any HTML is fed in.
        self.mentions: list[tuple[str, str]]
        self.tags: list[str]

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.list_stack = []

        self.anchor_stack = []
        self.anchor_data = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)

        def append_newline():
            # Make sure block-level markup starts on its own line.
            if self.tokens:
                last_token = self.tokens[-1]
                if isinstance(last_token, cross.TextToken) and not last_token.text.endswith('\n'):
                    self.tokens.append(cross.TextToken('\n'))

        match tag:
            case 'br':
                self.tokens.append(cross.TextToken(' \n'))
            case 'a':
                href = attrs_dict.get('href', '')
                self.anchor_stack.append(href)
            case 'strong' | 'b':
                self.tokens.append(cross.TextToken('**'))
            case 'em' | 'i':
                self.tokens.append(cross.TextToken('*'))
            case 'del' | 's':
                self.tokens.append(cross.TextToken('~~'))
            case 'code':
                if not self.in_pre:
                    self.tokens.append(cross.TextToken('`'))
                    self.in_code = True
            case 'pre':
                append_newline()
                self.tokens.append(cross.TextToken('```\n'))
                self.in_pre = True
            case 'blockquote':
                append_newline()
                self.tokens.append(cross.TextToken('> '))
            case 'ul' | 'ol':
                self.list_stack.append(tag)
                append_newline()
            case 'li':
                indent = ' ' * (len(self.list_stack) - 1)
                if self.list_stack and self.list_stack[-1] == 'ul':
                    self.tokens.append(cross.TextToken(f'{indent}- '))
                elif self.list_stack and self.list_stack[-1] == 'ol':
                    self.tokens.append(cross.TextToken(f'{indent}1. '))
            case _:
                if tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                    level = int(tag[1])
                    self.tokens.append(cross.TextToken("\n" + "#" * level + " "))

        self.current_tag_stack.append(tag)

    def handle_data(self, data: str) -> None:
        # Text inside an <a> is buffered until the closing tag decides
        # whether it becomes a tag, mention, or plain link token.
        if self.anchor_stack:
            self.anchor_data.append(data)
        else:
            self.tokens.append(cross.TextToken(data))

    def handle_endtag(self, tag: str) -> None:
        if not self.current_tag_stack:
            return

        if tag in self.current_tag_stack:
            self.current_tag_stack.remove(tag)

        match tag:
            case 'p':
                self.tokens.append(cross.TextToken('\n\n'))
            case 'a':
                href = self.anchor_stack.pop()
                anchor_data = ''.join(self.anchor_data)
                self.anchor_data = []

                if anchor_data.startswith('#'):
                    # Only emit a tag token for hashtags the post actually declares.
                    as_tag = anchor_data[1:].lower()
                    if as_tag in self.tags:
                        self.tokens.append(cross.TagToken(anchor_data[1:]))
                elif anchor_data.startswith('@'):
                    match = next(
                        (pair for pair in self.mentions if anchor_data in pair),
                        None
                    )

                    if match:
                        self.tokens.append(cross.MentionToken(match[1], ''))
                else:
                    self.tokens.append(cross.LinkToken(href, anchor_data))
            case 'strong' | 'b':
                self.tokens.append(cross.TextToken('**'))
            case 'em' | 'i':
                self.tokens.append(cross.TextToken('*'))
            case 'del' | 's':
                self.tokens.append(cross.TextToken('~~'))
            case 'code':
                if not self.in_pre and self.in_code:
                    self.tokens.append(cross.TextToken('`'))
                    self.in_code = False
            case 'pre':
                self.tokens.append(cross.TextToken('\n```\n'))
                self.in_pre = False
            case 'blockquote':
                self.tokens.append(cross.TextToken('\n'))
            case 'ul' | 'ol':
                if self.list_stack:
                    self.list_stack.pop()
                self.tokens.append(cross.TextToken('\n'))
            case 'li':
                self.tokens.append(cross.TextToken('\n'))
            case _:
                if tag in {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                    self.tokens.append(cross.TextToken('\n'))

    def get_tokens(self) -> list[cross.Token]:
        if not self.tokens:
            return []

        combined: list[cross.Token] = []
        buffer: list[str] = []

        def flush_buffer():
            if buffer:
                merged = ''.join(buffer)
                combined.append(cross.TextToken(text=merged))
                buffer.clear()

        # Merge runs of adjacent text tokens into single tokens.
        for token in self.tokens:
            if isinstance(token, cross.TextToken):
                buffer.append(token.text)
            else:
                flush_buffer()
                combined.append(token)

        flush_buffer()

        # Trim the trailing blank line left by the final closing tag.
        if combined and isinstance(combined[-1], cross.TextToken):
            if combined[-1].text.endswith('\n\n'):
                combined[-1] = cross.TextToken(combined[-1].text[:-2])
        return combined

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.tokens = []

        self.mentions = []
        self.tags = []

        self.in_pre = False
        self.in_code = False

        self.current_tag_stack = []
        self.anchor_stack = []
        self.anchor_data = []
        self.list_stack = []
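
Roughly how the tokenizer is meant to be driven, as a minimal sketch. It assumes `cross` exposes the token classes referenced above; the handle pair in `mentions`, the hashtag, and the example.social URL are made-up placeholders standing in for the source post's real metadata.

parser = HTMLPostTokenizer()

# Assumed shape of the metadata: (anchor text, full handle) pairs and lowercase tag names.
parser.mentions = [('@alice', 'alice@example.social')]
parser.tags = ['crossposting']

parser.feed(
    '<p>Hello <strong>world</strong> '
    '<a href="https://example.social/tags/crossposting">#crossposting</a></p>'
)
parser.close()

tokens = parser.get_tokens()
# -> roughly [TextToken('Hello **world** '), TagToken('crossposting'), TextToken('')]

Since reset() clears mentions and tags along with the rest of the state, they have to be re-assigned after every reset and before the next feed() when the parser is reused across posts.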