social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky

try this for line breaks?

zenfyr.dev 3fb03120 30c2c81a

verified
Changed files
+61 -109
mastodon
+61 -109
mastodon/markeddown.py
···
-
import re
from html.parser import HTMLParser
-
from html import unescape
-
### VIBECODED CODE ALERT!!! ###
-
-
class HTMLToMarkdownParser(HTMLParser):
    """Convert an HTML fragment to Markdown while it is being parsed.

    Feed HTML with ``feed()`` (and ``close()`` to flush trailing text), then
    read the result with ``get_markdown()``.  Supported markup: headings,
    paragraphs, line breaks, bold/italic/strikethrough, inline code and
    ``<pre>`` blocks, blockquotes, (nested) lists, links, images, horizontal
    rules and simple tables.
    """

    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})
    # Void elements never receive an end tag, so they must not be tracked on
    # the open-tag stack (they would stay there forever).
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })
    # Tags whose opening and closing markers are the same Markdown token.
    _INLINE_MARKERS = {
        'strong': '**', 'b': '**',
        'em': '*', 'i': '*',
        'del': '~~', 's': '~~',
    }

    def __init__(self):
        # HTMLParser.__init__ calls reset(), which already initialises the
        # conversion state; it is repeated here so the attributes are
        # visibly defined in the constructor.
        super().__init__()
        self._init_state()
        self.preserve_spaces = False

    def _init_state(self):
        """Set every piece of conversion state to its empty value."""
        self.markdown = []            # output fragments, joined by get_markdown()
        self.in_pre = False
        self.in_code = False
        self.current_tag_stack = []   # currently open (non-void) tags
        self.list_stack = []          # 'ul' / 'ol' for each open list
        self.table_data = []          # finished rows of the current table
        self.current_row = []         # finished cells of the current row
        self.in_table = False
        self.link_stack = []          # hrefs of currently open <a> tags
        self._cell_parts = None       # fragments of the open table cell, if any
        self._row_open = False        # True between <tr> and the end of that row

    def _emit(self, text):
        """Append *text* to the open table cell, or to the main output."""
        target = self.markdown if self._cell_parts is None else self._cell_parts
        target.append(text)

    def _close_cell(self):
        """Finish the open table cell (if any) and add it to the current row."""
        if self._cell_parts is None:
            return
        text = re.sub(r'\s+', ' ', ''.join(self._cell_parts)).strip()
        # A literal pipe would otherwise be read as a column separator.
        self.current_row.append(text.replace('|', '\\|'))
        self._cell_parts = None

    def _close_row(self):
        """Finish the open table row (if any) and store it in table_data."""
        self._close_cell()
        if self._row_open:
            self.table_data.append(self.current_row)
            self._row_open = False
        self.current_row = []

    def _pop_tag(self, tag):
        """Remove the innermost open occurrence of *tag* from the tag stack."""
        stack = self.current_tag_stack
        for index in range(len(stack) - 1, -1, -1):
            if stack[index] == tag:
                del stack[index]
                return

    def handle_starttag(self, tag, attrs):
        """Emit the Markdown that opens *tag* and record it as open."""
        attrs_dict = dict(attrs)

        if tag in self._HEADING_TAGS:
            self._emit('\n' + '#' * int(tag[1]) + ' ')
        elif tag == 'br':
            # NOTE(review): a CommonMark hard break needs two trailing
            # spaces; confirm this literal was not collapsed somewhere.
            self._emit(' \n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'code':
            # Inside <pre> the fence already marks the text as code.
            if not self.in_pre:
                self._emit('`')
                self.in_code = True
        elif tag == 'pre':
            self._emit('\n```\n')
            self.in_pre = True
        elif tag == 'blockquote':
            self._emit('\n> ')
        elif tag in ('ul', 'ol'):
            self.list_stack.append(tag)
            self._emit('\n')
        elif tag == 'li':
            # An <li> outside any list produces no marker.
            if self.list_stack:
                indent = ' ' * (len(self.list_stack) - 1)
                bullet = '- ' if self.list_stack[-1] == 'ul' else '1. '
                self._emit(indent + bullet)
        elif tag == 'a':
            # A valueless attribute is reported as None; treat it as empty.
            self.link_stack.append(attrs_dict.get('href') or '')
            self._emit('[')
        elif tag == 'img':
            src = attrs_dict.get('src') or ''
            alt = attrs_dict.get('alt') or ''
            title = attrs_dict.get('title') or ''
            if title:
                self._emit(f'![{alt}]({src} "{title}")')
            else:
                self._emit(f'![{alt}]({src})')
        elif tag == 'hr':
            self._emit('\n---\n')
        elif tag == 'table':
            self.in_table = True
            self.table_data = []
        elif tag == 'tr':
            # </tr> is optional in HTML, so a new row also ends the previous one.
            self._close_row()
            self._row_open = self.in_table
        elif tag in ('td', 'th'):
            # </td> / </th> are optional too.
            self._close_cell()
            if self.in_table:
                self._cell_parts = []

        if tag not in self._VOID_TAGS:
            self.current_tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Emit the Markdown that closes *tag*.

        End tags that arrive while no tag is open are ignored.
        """
        if not self.current_tag_stack:
            return
        self._pop_tag(tag)

        if tag in self._HEADING_TAGS:
            self._emit('\n')
        elif tag == 'p':
            self._emit('\n\n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'code':
            if not self.in_pre and self.in_code:
                self._emit('`')
                self.in_code = False
        elif tag == 'pre':
            self._emit('\n```\n')
            self.in_pre = False
        elif tag == 'blockquote':
            self._emit('\n')
        elif tag in ('ul', 'ol'):
            if self.list_stack:
                self.list_stack.pop()
            self._emit('\n')
        elif tag == 'li':
            self._emit('\n')
        elif tag == 'a':
            if self.link_stack:
                href = self.link_stack.pop()
                self._emit(f']({href})')
        elif tag == 'table':
            self._close_row()
            self.in_table = False
            self._process_table()
        elif tag == 'tr':
            self._close_row()
        elif tag in ('td', 'th'):
            self._close_cell()

    def handle_data(self, data):
        """Route character data to the open table cell or the main output."""
        if self.in_pre:
            # Preformatted text is kept verbatim.
            self._emit(data)
            return
        if self._cell_parts is not None:
            # Whitespace is normalised once the whole cell is known.
            self._cell_parts.append(data)
            return
        cleaned_data = re.sub(r'[\r\n\t]+', ' ', data)
        # Whitespace-only runs (e.g. indentation between tags) are dropped.
        if cleaned_data.strip():
            self.markdown.append(cleaned_data)

    def _process_table(self):
        """Render the collected rows as a pipe table; the first row is the header."""
        if not self.table_data:
            return
        width = max(len(row) for row in self.table_data)
        if width == 0:
            return

        def render(row):
            # Pad on a copy so the stored rows are left untouched.
            padded = row + [''] * (width - len(row))
            return '| ' + ' | '.join(padded) + ' |\n'

        header, *body = self.table_data
        self.markdown.append('\n')
        self.markdown.append(render(header))
        self.markdown.append('| ' + ' | '.join(['---'] * width) + ' |\n')
        self.markdown.extend(render(row) for row in body)
        self.markdown.append('\n')

    def get_markdown(self):
        """Return everything converted so far as a single string."""
        return ''.join(self.markdown)

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self._init_state()
···
from html.parser import HTMLParser
+
class MastoHTMLToMarkdownParser(HTMLParser):
+
def __init__(self) -> None:
super().__init__()
self.markdown = []
+
self.current_tag_stack = []
self.in_pre = False
self.in_code = False
self.list_stack = []
self.link_stack = []
+
+
def get_markdown(self):
+
return ''.join(self.markdown)
+
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
attrs_dict = dict(attrs)
+
if tag == 'br':
self.markdown.append(' \n')
+
+
elif tag == 'a':
+
href = attrs_dict.get('href', '')
+
self.link_stack.append(href)
+
self.markdown.append('[')
+
elif tag == 'strong' or tag == 'b':
self.markdown.append('**')
+
elif tag == 'em' or tag == 'i':
self.markdown.append('*')
+
+
elif tag == 'del' or tag == 's':
+
self.markdown.append('~~')
+
elif tag == 'code':
if not self.in_pre:
self.markdown.append('`')
self.in_code = True
+
elif tag == 'pre':
+
if self.markdown and not str(self.markdown[-1]).endswith('\n'):
+
self.markdown.append('\n')
+
+
self.markdown.append('```\n')
self.in_pre = True
+
elif tag == 'blockquote':
+
if self.markdown and not str(self.markdown[-1]).endswith('\n'):
+
self.markdown.append('\n')
+
self.markdown.append('\n> ')
+
elif tag == 'ul':
self.list_stack.append('ul')
self.markdown.append('\n')
+
elif tag == 'ol':
self.list_stack.append('ol')
self.markdown.append('\n')
+
elif tag == 'li':
indent = ' ' * (len(self.list_stack) - 1)
if self.list_stack and self.list_stack[-1] == 'ul':
self.markdown.append(f'{indent}- ')
elif self.list_stack and self.list_stack[-1] == 'ol':
self.markdown.append(f'{indent}1. ')
+
+
elif tag == {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
+
level = int(tag[1])
+
self.markdown.append("\n" + "#" * level + " ")
+
self.current_tag_stack.append(tag)
+
def handle_endtag(self, tag: str) -> None:
if not self.current_tag_stack:
return
+
if tag in self.current_tag_stack:
self.current_tag_stack.remove(tag)
+
if tag == 'p':
self.markdown.append('\n\n')
+
+
elif tag == 'a':
+
if self.link_stack:
+
href = self.link_stack.pop()
+
self.markdown.append(f']({href})')
+
elif tag == 'strong' or tag == 'b':
self.markdown.append('**')
+
elif tag == 'em' or tag == 'i':
self.markdown.append('*')
+
+
elif tag == 'del' or tag == 's':
+
self.markdown.append('~~')
+
elif tag == 'code':
if not self.in_pre and self.in_code:
self.markdown.append('`')
self.in_code = False
+
elif tag == 'pre':
self.markdown.append('\n```\n')
self.in_pre = False
+
elif tag == 'blockquote':
self.markdown.append('\n')
+
elif tag == 'ul' or tag == 'ol':
if self.list_stack:
self.list_stack.pop()
self.markdown.append('\n')
+
elif tag == 'li':
self.markdown.append('\n')
+
elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+
self.markdown.append('\n')
def reset(self):
"""Reset the parser state for reuse."""
super().reset()
self.markdown = []
self.current_tag_stack = []
+
self.in_pre = False
self.in_code = False
+
+
self.link_stack = []
+
self.list_stack = []