social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1import re
2from html.parser import HTMLParser
3from html import unescape
4
5### VIBECODED CODE ALERT!!! ###
6
class HTMLToMarkdownParser(HTMLParser):
    """Convert an HTML fragment into Markdown text.

    Feed HTML with ``feed()`` and read the result with ``get_markdown()``.
    Call ``reset()`` before reusing an instance for another document.

    Supported elements: headings, paragraphs, line breaks, bold, italic,
    strikethrough, inline code, ``pre`` blocks, block quotes, (nested)
    lists, links, images, horizontal rules and simple tables. Any other
    tag is ignored and its text content is passed through.

    Known limitation: only the first line of a ``blockquote`` receives the
    ``> `` prefix.
    """

    _HEADING_TAGS = frozenset(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))

    # Elements that never have an end tag; they must not be pushed onto
    # current_tag_stack or they would stay there forever.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    ))

    # Inline elements whose Markdown form is the same marker on both sides.
    _INLINE_MARKERS = {
        'strong': '**', 'b': '**',
        'em': '*', 'i': '*',
        'del': '~~', 's': '~~',
    }

    _LIST_MARKERS = {'ul': '- ', 'ol': '1. '}

    def __init__(self):
        # HTMLParser.__init__ calls self.reset(), which creates all of the
        # converter state below, so nothing else is needed here.
        super().__init__()

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.markdown = []            # output chunks, joined by get_markdown()
        self.in_pre = False
        self.in_code = False
        self.current_tag_stack = []   # currently open non-void tags
        self.list_stack = []          # 'ul' / 'ol' for each open list
        self.table_data = []          # finished rows of the current table
        self.current_row = []         # finished cells of the current row
        self.in_table = False
        self.link_stack = []          # hrefs of the currently open <a> tags
        self.preserve_spaces = False  # kept for backward compatibility
        self._cell_buffer = None      # chunks of the open table cell, if any

    # ------------------------------------------------------------------
    # Output helpers
    # ------------------------------------------------------------------
    def _output_target(self):
        """Return the list that currently receives output chunks."""
        if self._cell_buffer is not None:
            return self._cell_buffer
        return self.markdown

    def _emit(self, text):
        """Append *text* to the open table cell, or to the document."""
        self._output_target().append(text)

    def _needs_separator(self):
        """Return True if a space is needed to keep two inline runs apart."""
        for chunk in reversed(self._output_target()):
            if chunk:
                return not chunk[-1].isspace()
        return False

    def _close_cell(self):
        """Finish the open table cell (if any) and add it to the row."""
        if self._cell_buffer is None:
            return
        # A table cell must stay on one line, so collapse all whitespace.
        text = ' '.join(''.join(self._cell_buffer).split())
        # A literal pipe would otherwise be read as a column separator.
        self.current_row.append(text.replace('|', '\\|'))
        self._cell_buffer = None

    def _flush_row(self):
        """Finish the open row (if it has cells) and add it to the table."""
        self._close_cell()
        if self.current_row:
            self.table_data.append(self.current_row)
        self.current_row = []

    # ------------------------------------------------------------------
    # HTMLParser callbacks
    # ------------------------------------------------------------------
    def handle_starttag(self, tag, attrs):
        """Emit the opening Markdown for *tag* and record it as open."""
        attrs_dict = dict(attrs)

        if tag in self._HEADING_TAGS:
            self._emit('\n' + '#' * int(tag[1]) + ' ')
        elif tag == 'br':
            # Two trailing spaces are Markdown's hard line break.
            self._emit('  \n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'code':
            # Inside <pre> the fence already marks the text as code.
            if not self.in_pre:
                self._emit('`')
            self.in_code = True
        elif tag == 'pre':
            self._emit('\n```\n')
            self.in_pre = True
        elif tag == 'blockquote':
            self._emit('\n> ')
        elif tag in self._LIST_MARKERS:
            self.list_stack.append(tag)
            self._emit('\n')
        elif tag == 'li':
            if self.list_stack:
                # A nested item must line up with its parent's content,
                # i.e. be indented by the width of every ancestor marker.
                indent = ''.join(
                    ' ' * len(self._LIST_MARKERS[kind])
                    for kind in self.list_stack[:-1]
                )
                self._emit(indent + self._LIST_MARKERS[self.list_stack[-1]])
        elif tag == 'a':
            # Valueless attributes arrive as None; treat them as empty.
            self.link_stack.append(attrs_dict.get('href') or '')
            self._emit('[')
        elif tag == 'img':
            src = attrs_dict.get('src') or ''
            alt = attrs_dict.get('alt') or ''
            title = attrs_dict.get('title') or ''
            if title:
                self._emit(f'![{alt}]({src} "{title}")')
            else:
                self._emit(f'![{alt}]({src})')
        elif tag == 'hr':
            self._emit('\n---\n')
        elif tag == 'table':
            self.in_table = True
            self.table_data = []
            self.current_row = []
        elif tag == 'tr':
            if self.in_table:
                # Covers a previous row whose </tr> was omitted.
                self._flush_row()
            self.current_row = []
        elif tag in ('td', 'th'):
            if self.in_table:
                # Covers a previous cell whose end tag was omitted.
                self._close_cell()
                self._cell_buffer = []

        if tag not in self._VOID_TAGS:
            self.current_tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Emit the closing Markdown for *tag*; ignore unmatched end tags."""
        stack = self.current_tag_stack
        # Close the innermost open element with this name.
        for index in range(len(stack) - 1, -1, -1):
            if stack[index] == tag:
                del stack[index]
                break
        else:
            # No matching start tag (or a void element): nothing to close.
            return

        if tag in self._HEADING_TAGS:
            self._emit('\n')
        elif tag == 'p':
            self._emit('\n\n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'code':
            if not self.in_pre and self.in_code:
                self._emit('`')
            self.in_code = False
        elif tag == 'pre':
            self._emit('\n```\n')
            self.in_pre = False
        elif tag == 'blockquote':
            self._emit('\n')
        elif tag in self._LIST_MARKERS:
            if self.list_stack:
                self.list_stack.pop()
            self._emit('\n')
        elif tag == 'li':
            self._emit('\n')
        elif tag == 'a':
            if self.link_stack:
                href = self.link_stack.pop()
                self._emit(f']({href})')
        elif tag in ('td', 'th'):
            self._close_cell()
        elif tag == 'tr':
            if self.in_table:
                self._flush_row()
            self.current_row = []
        elif tag == 'table':
            self._flush_row()
            self.in_table = False
            self._process_table()

    def handle_data(self, data):
        """Emit text content, normalising whitespace outside ``<pre>``."""
        if self.in_pre:
            self._emit(data)
            return

        cleaned_data = re.sub(r'[\r\n\t]+', ' ', data)
        if cleaned_data.strip():
            self._emit(cleaned_data)
        elif cleaned_data and self._needs_separator():
            # Whitespace between inline elements (e.g. two adjacent links)
            # is significant; keep exactly one space of it.
            self._emit(' ')

    def _process_table(self):
        """Render the collected table rows as a Markdown pipe table."""
        rows = [row for row in self.table_data if row]
        if not rows:
            return

        # Every row must have the same number of cells as the header.
        width = max(len(row) for row in rows)
        padded = [row + [''] * (width - len(row)) for row in rows]
        header, body = padded[0], padded[1:]

        self.markdown.append('\n')
        self.markdown.append('| ' + ' | '.join(header) + ' |\n')
        self.markdown.append('| ' + ' | '.join(['---'] * width) + ' |\n')
        for row in body:
            self.markdown.append('| ' + ' | '.join(row) + ' |\n')
        self.markdown.append('\n')

    def get_markdown(self):
        """Return the Markdown produced so far as a single string."""
        return ''.join(self.markdown)