···
from html.parser import HTMLParser
-
from html import unescape
-
# WARNING: this module is "vibe-coded" (AI-generated with little manual review); verify its behavior before relying on it.
-
class HTMLToMarkdownParser(HTMLParser):
-
self.current_tag_stack = []
-
self.preserve_spaces = False
-
def handle_starttag(self, tag, attrs):
# Handle an opening HTML tag by appending the matching Markdown prefix to
# self.markdown and recording the tag on self.current_tag_stack.
#
# NOTE(review): this copy of the method has lost its indentation and several
# lines (a number of `elif` headers are absent, and `level` and `attrs_dict`
# are used without a visible assignment). Restore the method from the
# upstream file before changing any logic here.
-
# NOTE(review): `tag == {...}` compares a str with a set and can never be
# true; handle_endtag tests the same heading tags with `tag in [...]`, so a
# membership test was presumably intended here as well -- confirm.
if tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
-
# Heading prefix: newline, one '#' per heading level, then a space.
# `level` is not assigned anywhere in the visible code.
self.markdown.append("\n" + "#" * level + " ")
-
# The branch header for the next statements is not visible.
#self.markdown.append('\n\n')
self.markdown.append(' \n')
# Bold and italic open with the same marker that handle_endtag closes with.
elif tag == 'strong' or tag == 'b':
self.markdown.append('**')
elif tag == 'em' or tag == 'i':
self.markdown.append('*')
# Inline-code backtick and fenced-block opener; their branch headers are not
# visible in this copy.
self.markdown.append('`')
-
self.markdown.append('\n```\n')
elif tag == 'blockquote':
self.markdown.append('\n> ')
# List containers push their kind onto self.list_stack so list items can
# look up the innermost list type and the nesting depth.
self.list_stack.append('ul')
self.markdown.append('\n')
self.list_stack.append('ol')
self.markdown.append('\n')
# List item: the indent string is repeated once per nesting level beyond the
# first; the bullet depends on the innermost list kind. Ordered items always
# get the literal '1. ' marker.
indent = ' ' * (len(self.list_stack) - 1)
if self.list_stack and self.list_stack[-1] == 'ul':
self.markdown.append(f'{indent}- ')
elif self.list_stack and self.list_stack[-1] == 'ol':
self.markdown.append(f'{indent}1. ')
-
# Link: the href is pushed on self.link_stack so handle_endtag can emit it
# after the link text; only the opening '[' is written here.
href = attrs_dict.get('href', '')
-
self.link_stack.append(href)
-
self.markdown.append('[')
-
# Image: src, alt and title are read from the attributes.
src = attrs_dict.get('src', '')
-
alt = attrs_dict.get('alt', '')
-
title = attrs_dict.get('title', '')
-
# NOTE(review): both f-strings below are empty, so nothing is emitted for
# images and src/alt/title go unused -- presumably the image templates were
# lost from this copy; confirm against upstream.
self.markdown.append(f'')
-
self.markdown.append(f'')
-
# Horizontal rule (branch header not visible).
self.markdown.append('\n---\n')
-
elif tag == 'th' or tag == 'td':
-
pass # Handle in handle_data
-
# Strikethrough opens with '~~'.
elif tag == 'del' or tag == 's':
-
self.markdown.append('~~')
# Record the tag as open; handle_endtag removes it again.
self.current_tag_stack.append(tag)
-
def handle_endtag(self, tag):
# Handle a closing HTML tag: drop it from self.current_tag_stack and append
# the matching Markdown suffix to self.markdown.
#
# NOTE(review): this copy has lost its indentation and several lines (some
# branch headers and the body of the first guard are not visible).
# The body of this empty-stack guard is not visible in this copy.
if not self.current_tag_stack:
-
# Remove the tag from stack
# NOTE(review): list.remove() deletes the FIRST matching entry, which for
# nested tags of the same name is the outermost one rather than the one
# being closed -- confirm whether that matters to callers.
if tag in self.current_tag_stack:
self.current_tag_stack.remove(tag)
-
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-
self.markdown.append('\n')
# Branch header for the following double newline is not visible.
self.markdown.append('\n\n')
elif tag == 'strong' or tag == 'b':
self.markdown.append('**')
elif tag == 'em' or tag == 'i':
self.markdown.append('*')
# The closing backtick is written only for inline code, i.e. when in_code is
# set and in_pre is not.
if not self.in_pre and self.in_code:
self.markdown.append('`')
# Closing code fence (branch header not visible).
self.markdown.append('\n```\n')
elif tag == 'blockquote':
self.markdown.append('\n')
elif tag == 'ul' or tag == 'ol':
self.markdown.append('\n')
self.markdown.append('\n')
-
# Link close: pop the href pushed by handle_starttag and finish the
# '[text](href)' form.
href = self.link_stack.pop()
-
self.markdown.append(f']({href})')
-
# Store a shallow copy of the collected cells as one table row.
self.table_data.append(self.current_row[:])
-
elif tag == 'del' or tag == 's':
-
self.markdown.append('~~')
-
def handle_data(self, data):
# Handle a run of text between tags: route it to the current table row or
# append it to self.markdown.
#
# NOTE(review): this copy has lost its indentation and some lines; the
# conditions guarding the first append and any early returns are not visible.
-
# Clean up whitespace, but preserve intentional spacing
-
# Appends the text unmodified. The guarding condition is not visible here;
# presumably this is the whitespace-preserving path -- confirm.
self.markdown.append(data)
-
# Check if we're in a table cell
-
# Inside a table, text is collected as a cell when the tag stack is empty or
# its innermost entry is 'td' or 'th'.
if self.in_table and (not self.current_tag_stack or
-
self.current_tag_stack[-1] in ['td', 'th']):
-
self.current_row.append(data.strip())
-
# Collapse each run of CR, LF and tab characters into a single space.
# NOTE(review): `re` is used here but no `import re` is visible in this view
# of the file -- confirm it is imported.
cleaned_data = re.sub(r'[\r\n\t]+', ' ', data)
-
# Remove leading/trailing whitespace only from the entire content
-
# Whitespace-only text is dropped; otherwise the cleaned (unstripped) text is
# appended.
if cleaned_data.strip():
-
self.markdown.append(cleaned_data)
-
def _process_table(self):
# Render the rows collected in self.table_data as a pipe-delimited Markdown
# table appended to self.markdown.
#
# NOTE(review): this copy has lost its indentation and some lines; the body
# of the empty-table guard and the body of the padding loop are not visible.
-
# Guard for an empty table; its body is not visible in this copy.
if not self.table_data:
-
self.markdown.append('\n')
-
# Process header row if exists
-
# The first collected row is treated as the header.
header = self.table_data[0]
-
self.markdown.append('| ' + ' | '.join(header) + ' |\n')
-
# Separator row: one '---' per header column.
self.markdown.append('| ' + ' | '.join(['---'] * len(header)) + ' |\n')
-
for row in self.table_data[1:]:
-
# Pad row to match header length
-
# NOTE(review): the statement that extends `row` is not visible here; as
# shown, nothing in the loop changes len(row) -- confirm against upstream.
while len(row) < len(header):
-
self.markdown.append('| ' + ' | '.join(row) + ' |\n')
-
self.markdown.append('\n')
-
def get_markdown(self):
# Return the accumulated Markdown: every fragment in self.markdown
# concatenated in append order, with no separator.
-
return ''.join(self.markdown)
"""Reset the parser state for reuse."""
self.current_tag_stack = []