···
from html.parser import HTMLParser
3
-
from html import unescape
5
-
### VIBECODED CODE ALERT!!! ###
7
-
class HTMLToMarkdownParser(HTMLParser):
3
+
class MastoHTMLToMarkdownParser(HTMLParser):
4
+
def __init__(self) -> None:
7
+
self.current_tag_stack = []
15
-
self.current_tag_stack = []
18
-
self.table_data = []
19
-
self.current_row = []
20
-
self.in_table = False
23
-
self.preserve_spaces = False
25
-
def handle_starttag(self, tag, attrs):
15
+
def get_markdown(self):
16
+
return ''.join(self.markdown)
18
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
28
-
if tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
30
-
self.markdown.append("\n" + "#" * level + " ")
32
-
#self.markdown.append('\n\n')
self.markdown.append(' \n')
25
+
href = attrs_dict.get('href', '')
26
+
self.link_stack.append(href)
27
+
self.markdown.append('[')
elif tag == 'strong' or tag == 'b':
self.markdown.append('**')
elif tag == 'em' or tag == 'i':
self.markdown.append('*')
35
+
elif tag == 'del' or tag == 's':
36
+
self.markdown.append('~~')
self.markdown.append('`')
45
-
self.markdown.append('\n```\n')
44
+
if self.markdown and not str(self.markdown[-1]).endswith('\n'):
45
+
self.markdown.append('\n')
47
+
self.markdown.append('```\n')
elif tag == 'blockquote':
51
+
if self.markdown and not str(self.markdown[-1]).endswith('\n'):
52
+
self.markdown.append('\n')
self.markdown.append('\n> ')
self.list_stack.append('ul')
self.markdown.append('\n')
self.list_stack.append('ol')
self.markdown.append('\n')
indent = ' ' * (len(self.list_stack) - 1)
if self.list_stack and self.list_stack[-1] == 'ul':
self.markdown.append(f'{indent}- ')
elif self.list_stack and self.list_stack[-1] == 'ol':
self.markdown.append(f'{indent}1. ')
62
-
href = attrs_dict.get('href', '')
63
-
self.link_stack.append(href)
64
-
self.markdown.append('[')
66
-
src = attrs_dict.get('src', '')
67
-
alt = attrs_dict.get('alt', '')
68
-
title = attrs_dict.get('title', '')
70
-
self.markdown.append(f'')
72
-
self.markdown.append(f'')
74
-
self.markdown.append('\n---\n')
75
-
elif tag == 'table':
76
-
self.in_table = True
77
-
self.table_data = []
79
-
self.current_row = []
80
-
elif tag == 'th' or tag == 'td':
81
-
pass # Handle in handle_data
82
-
elif tag == 'del' or tag == 's':
83
-
self.markdown.append('~~')
71
+
elif tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
73
+
self.markdown.append("\n" + "#" * level + " ")
self.current_tag_stack.append(tag)
87
-
def handle_endtag(self, tag):
77
+
def handle_endtag(self, tag: str) -> None:
if not self.current_tag_stack:
91
-
# Remove the tag from stack
if tag in self.current_tag_stack:
self.current_tag_stack.remove(tag)
95
-
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
96
-
self.markdown.append('\n')
self.markdown.append('\n\n')
89
+
href = self.link_stack.pop()
90
+
self.markdown.append(f']({href})')
elif tag == 'strong' or tag == 'b':
self.markdown.append('**')
elif tag == 'em' or tag == 'i':
self.markdown.append('*')
98
+
elif tag == 'del' or tag == 's':
99
+
self.markdown.append('~~')
if not self.in_pre and self.in_code:
self.markdown.append('`')
self.markdown.append('\n```\n')
elif tag == 'blockquote':
self.markdown.append('\n')
elif tag == 'ul' or tag == 'ol':
self.markdown.append('\n')
self.markdown.append('\n')
119
-
if self.link_stack:
120
-
href = self.link_stack.pop()
121
-
self.markdown.append(f']({href})')
122
-
elif tag == 'table':
123
-
self.in_table = False
124
-
self._process_table()
127
-
self.table_data.append(self.current_row[:])
128
-
self.current_row = []
129
-
elif tag == 'del' or tag == 's':
130
-
self.markdown.append('~~')
132
-
def handle_data(self, data):
133
-
# Clean up whitespace, but preserve intentional spacing
135
-
self.markdown.append(data)
137
-
# Check if we're in a table cell
138
-
if self.in_table and (not self.current_tag_stack or
139
-
self.current_tag_stack[-1] in ['td', 'th']):
140
-
self.current_row.append(data.strip())
142
-
cleaned_data = re.sub(r'[\r\n\t]+', ' ', data)
143
-
# Remove leading/trailing whitespace only from the entire content
144
-
if cleaned_data.strip():
145
-
self.markdown.append(cleaned_data)
147
-
def _process_table(self):
148
-
if not self.table_data:
151
-
self.markdown.append('\n')
153
-
# Process header row if exists
154
-
if self.table_data:
155
-
header = self.table_data[0]
156
-
self.markdown.append('| ' + ' | '.join(header) + ' |\n')
157
-
self.markdown.append('| ' + ' | '.join(['---'] * len(header)) + ' |\n')
159
-
# Process data rows
160
-
for row in self.table_data[1:]:
161
-
# Pad row to match header length
162
-
while len(row) < len(header):
164
-
self.markdown.append('| ' + ' | '.join(row) + ' |\n')
166
-
self.markdown.append('\n')
168
-
def get_markdown(self):
169
-
return ''.join(self.markdown)
121
+
elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
122
+
self.markdown.append('\n')
"""Reset the parser state for reuse."""
self.current_tag_stack = []
176
-
self.list_stack = []
179
-
self.table_data = []
180
-
self.current_row = []
181
-
self.in_table = False
182
-
self.link_stack = []
133
+
self.link_stack = []
134
+
self.list_stack = []