markeddown.py at ab51a66636c6087228076e8df71a5caec2a08d8f · zenfyr.dev/xpost

zenfyr.dev / xpost
social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
xpost / markeddown.py
at ab51a66636c6087228076e8df71a5caec2a08d8f 6.3 kB view raw
  1import re
  2from html.parser import HTMLParser
  3from html import unescape
  4
  5### VIBECODED CODE ALERT!!! ###
  6
  7class HTMLToMarkdownParser(HTMLParser):
  8    def __init__(self):
  9        super().__init__()
 10        self.markdown = []
 11        
 12        self.in_pre = False
 13        self.in_code = False
 14        
 15        self.current_tag_stack = []
 16        self.list_stack = []
 17        
 18        self.table_data = []
 19        self.current_row = []
 20        self.in_table = False
 21        
 22        self.link_stack = []
 23        self.preserve_spaces = False
 24        
 25    def handle_starttag(self, tag, attrs):
 26        attrs_dict = dict(attrs)
 27        
 28        if tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
 29            level = int(tag[1])
 30            self.markdown.append("\n" + "#" * level + " ")
 31        elif tag == 'p':
 32            #self.markdown.append('\n\n')
 33            pass
 34        elif tag == 'br':
 35            self.markdown.append('  \n')
 36        elif tag == 'strong' or tag == 'b':
 37            self.markdown.append('**')
 38        elif tag == 'em' or tag == 'i':
 39            self.markdown.append('*')
 40        elif tag == 'code':
 41            if not self.in_pre:
 42                self.markdown.append('`')
 43                self.in_code = True
 44        elif tag == 'pre':
 45            self.markdown.append('\n```\n')
 46            self.in_pre = True
 47        elif tag == 'blockquote':
 48            self.markdown.append('\n> ')
 49        elif tag == 'ul':
 50            self.list_stack.append('ul')
 51            self.markdown.append('\n')
 52        elif tag == 'ol':
 53            self.list_stack.append('ol')
 54            self.markdown.append('\n')
 55        elif tag == 'li':
 56            indent = '  ' * (len(self.list_stack) - 1)
 57            if self.list_stack and self.list_stack[-1] == 'ul':
 58                self.markdown.append(f'{indent}- ')
 59            elif self.list_stack and self.list_stack[-1] == 'ol':
 60                self.markdown.append(f'{indent}1. ')
 61        elif tag == 'a':
 62            href = attrs_dict.get('href', '')
 63            self.link_stack.append(href)
 64            self.markdown.append('[')
 65        elif tag == 'img':
 66            src = attrs_dict.get('src', '')
 67            alt = attrs_dict.get('alt', '')
 68            title = attrs_dict.get('title', '')
 69            if title:
 70                self.markdown.append(f'![{alt}]({src} "{title}")')
 71            else:
 72                self.markdown.append(f'![{alt}]({src})')
 73        elif tag == 'hr':
 74            self.markdown.append('\n---\n')
 75        elif tag == 'table':
 76            self.in_table = True
 77            self.table_data = []
 78        elif tag == 'tr':
 79            self.current_row = []
 80        elif tag == 'th' or tag == 'td':
 81            pass  # Handle in handle_data
 82        elif tag == 'del' or tag == 's':
 83            self.markdown.append('~~')
 84            
 85        self.current_tag_stack.append(tag)
 86    
 87    def handle_endtag(self, tag):
 88        if not self.current_tag_stack:
 89            return
 90            
 91        # Remove the tag from stack
 92        if tag in self.current_tag_stack:
 93            self.current_tag_stack.remove(tag)
 94        
 95        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
 96            self.markdown.append('\n')
 97        elif tag == 'p':
 98            self.markdown.append('\n\n')
 99        elif tag == 'strong' or tag == 'b':
100            self.markdown.append('**')
101        elif tag == 'em' or tag == 'i':
102            self.markdown.append('*')
103        elif tag == 'code':
104            if not self.in_pre and self.in_code:
105                self.markdown.append('`')
106                self.in_code = False
107        elif tag == 'pre':
108            self.markdown.append('\n```\n')
109            self.in_pre = False
110        elif tag == 'blockquote':
111            self.markdown.append('\n')
112        elif tag == 'ul' or tag == 'ol':
113            if self.list_stack:
114                self.list_stack.pop()
115            self.markdown.append('\n')
116        elif tag == 'li':
117            self.markdown.append('\n')
118        elif tag == 'a':
119            if self.link_stack:
120                href = self.link_stack.pop()
121                self.markdown.append(f']({href})')
122        elif tag == 'table':
123            self.in_table = False
124            self._process_table()
125        elif tag == 'tr':
126            if self.in_table:
127                self.table_data.append(self.current_row[:])
128                self.current_row = []
129        elif tag == 'del' or tag == 's':
130            self.markdown.append('~~')
131    
132    def handle_data(self, data):
133        # Clean up whitespace, but preserve intentional spacing
134        if self.in_pre:
135            self.markdown.append(data)
136        else:
137            # Check if we're in a table cell
138            if self.in_table and (not self.current_tag_stack or 
139                                self.current_tag_stack[-1] in ['td', 'th']):
140                self.current_row.append(data.strip())
141            else:
142                cleaned_data = re.sub(r'[\r\n\t]+', ' ', data)
143                # Remove leading/trailing whitespace only from the entire content
144                if cleaned_data.strip():
145                    self.markdown.append(cleaned_data)
146    
147    def _process_table(self):
148        if not self.table_data:
149            return
150            
151        self.markdown.append('\n')
152        
153        # Process header row if exists
154        if self.table_data:
155            header = self.table_data[0]
156            self.markdown.append('| ' + ' | '.join(header) + ' |\n')
157            self.markdown.append('| ' + ' | '.join(['---'] * len(header)) + ' |\n')
158            
159            # Process data rows
160            for row in self.table_data[1:]:
161                # Pad row to match header length
162                while len(row) < len(header):
163                    row.append('')
164                self.markdown.append('| ' + ' | '.join(row) + ' |\n')
165        
166        self.markdown.append('\n')
167    
168    def get_markdown(self):
169        return ''.join(self.markdown)
170    
171    def reset(self):
172        """Reset the parser state for reuse."""
173        super().reset()
174        self.markdown = []
175        self.current_tag_stack = []
176        self.list_stack = []
177        self.in_pre = False
178        self.in_code = False
179        self.table_data = []
180        self.current_row = []
181        self.in_table = False
182        self.link_stack = []