social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
import re
from html.parser import HTMLParser
from html import unescape

### VIBECODED CODE ALERT!!! ###


class HTMLToMarkdownParser(HTMLParser):
    """Convert an HTML fragment to Markdown while it is being parsed.

    Usage: call ``feed(html)`` (and ``close()``), then ``get_markdown()``.
    ``reset()`` clears all state so one instance can convert many fragments.

    Output is collected as string chunks in ``self.markdown``.  While a table
    cell is open, chunks are diverted into a per-cell buffer so that inline
    markup inside ``<td>``/``<th>`` ends up in the cell rather than in front
    of the table.  Nested tables are not supported: an inner ``<table>``
    restarts table collection.
    """

    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})

    # Elements that never have an end tag; they must not be tracked as open.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })

    # Inline elements whose Markdown opener and closer are the same marker.
    _INLINE_MARKERS = {
        'strong': '**', 'b': '**',
        'em': '*', 'i': '*',
        'del': '~~', 's': '~~',
    }

    def __init__(self):
        # HTMLParser.__init__ calls self.reset(), which creates every piece
        # of conversion state below, so only the flag that reset() leaves
        # alone is set here.
        super().__init__()
        # Not read anywhere in this class; kept because it is a public
        # attribute that callers may set.
        self.preserve_spaces = False

    # ------------------------------------------------------------------
    # Output helpers
    # ------------------------------------------------------------------

    def _sink(self):
        """Return the chunk list currently receiving output."""
        if self._cell_parts is None:
            return self.markdown
        return self._cell_parts

    def _emit(self, text: str) -> None:
        """Append a non-empty chunk to the open table cell or the document."""
        if text:
            self._sink().append(text)

    def _emit_separator(self) -> None:
        """Emit one space for a whitespace-only text chunk between elements.

        Nothing is written at the very start of the output or when the
        previous chunk already ends in whitespace, so block-level spacing
        ('\\n', '\\n\\n', '> ', ...) is not polluted.
        """
        sink = self._sink()
        if sink and sink[-1] and not sink[-1][-1].isspace():
            sink.append(' ')

    def _pop_open_tag(self, tag: str) -> bool:
        """Remove the innermost open ``tag``; return False if none is open."""
        stack = self.current_tag_stack
        for index in range(len(stack) - 1, -1, -1):
            if stack[index] == tag:
                del stack[index]
                return True
        return False

    def _close_cell(self) -> None:
        """Finish the open table cell, if any, and add it to the current row."""
        if self._cell_parts is None:
            return
        text = ''.join(self._cell_parts)
        # A Markdown table row must stay on one line.
        text = re.sub(r'\s*\n\s*', ' ', text).strip()
        self.current_row.append(text)
        self._cell_parts = None

    # ------------------------------------------------------------------
    # HTMLParser callbacks
    # ------------------------------------------------------------------

    def handle_starttag(self, tag, attrs):
        """Emit the Markdown opener for ``tag`` and record it as open.

        Args:
            tag: Lower-cased element name supplied by HTMLParser.
            attrs: List of ``(name, value)`` pairs; ``value`` is None for
                attributes written without a value.
        """
        attrs_dict = dict(attrs)

        if tag in self._HEADING_TAGS:
            self._emit('\n' + '#' * int(tag[1]) + ' ')
        elif tag == 'br':
            # NOTE(review): CommonMark hard line breaks need two trailing
            # spaces; this literal has one - confirm which is intended.
            self._emit(' \n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'code':
            # Inside <pre> the fence already marks the text as code.
            if not self.in_pre:
                self._emit('`')
                self.in_code = True
        elif tag == 'pre':
            self._emit('\n```\n')
            self.in_pre = True
        elif tag == 'blockquote':
            self._emit('\n> ')
        elif tag in ('ul', 'ol'):
            self.list_stack.append(tag)
            self._emit('\n')
        elif tag == 'li':
            # An <li> outside any list produces no marker.
            if self.list_stack:
                indent = ' ' * (len(self.list_stack) - 1)
                marker = '- ' if self.list_stack[-1] == 'ul' else '1. '
                self._emit(indent + marker)
        elif tag == 'a':
            # `or ''` covers a valueless attribute, which parses as None.
            self.link_stack.append(attrs_dict.get('href') or '')
            self._emit('[')
        elif tag == 'img':
            src = attrs_dict.get('src') or ''
            alt = attrs_dict.get('alt') or ''
            title = attrs_dict.get('title') or ''
            if title:
                self._emit(f'![{alt}]({src} "{title}")')
            else:
                self._emit(f'![{alt}]({src})')
        elif tag == 'hr':
            self._emit('\n---\n')
        elif tag == 'table':
            self.in_table = True
            self.table_data = []
            self.current_row = []
            self._cell_parts = None
        elif tag == 'tr':
            self._close_cell()
            self.current_row = []
        elif tag in ('td', 'th'):
            if self.in_table:
                # A new cell implicitly ends one left open by a missing
                # </td> or </th>.
                self._close_cell()
                self._cell_parts = []

        if tag not in self._VOID_TAGS:
            self.current_tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Emit the Markdown closer for ``tag``.

        End tags with no matching open tag are ignored, so stray closers
        cannot produce unbalanced markers or re-emit a finished table.
        """
        if not self._pop_open_tag(tag):
            return

        if tag in self._HEADING_TAGS:
            self._emit('\n')
        elif tag == 'p':
            self._emit('\n\n')
        elif tag in self._INLINE_MARKERS:
            self._emit(self._INLINE_MARKERS[tag])
        elif tag == 'code':
            if not self.in_pre and self.in_code:
                self._emit('`')
                self.in_code = False
        elif tag == 'pre':
            self._emit('\n```\n')
            self.in_pre = False
        elif tag == 'blockquote':
            self._emit('\n')
        elif tag in ('ul', 'ol'):
            if self.list_stack:
                self.list_stack.pop()
            self._emit('\n')
        elif tag == 'li':
            self._emit('\n')
        elif tag == 'a':
            if self.link_stack:
                href = self.link_stack.pop()
                self._emit(f']({href})')
        elif tag in ('td', 'th'):
            self._close_cell()
        elif tag == 'tr':
            self._close_cell()
            if self.in_table:
                self.table_data.append(self.current_row[:])
            self.current_row = []
        elif tag == 'table':
            # Flush a cell/row left open by a missing </td> or </tr>.
            self._close_cell()
            if self.current_row:
                self.table_data.append(self.current_row[:])
                self.current_row = []
            self.in_table = False
            self._process_table()

    def handle_data(self, data):
        """Emit text content.

        Text inside ``<pre>`` is passed through untouched.  Elsewhere, runs
        of CR/LF/TAB become a single space, and a whitespace-only chunk
        collapses to at most one separating space.
        """
        if self.in_pre:
            self._emit(data)
            return

        text = re.sub(r'[\r\n\t]+', ' ', data)
        if text.strip():
            self._emit(text)
            return

        # Whitespace-only chunk.  Formatting whitespace between table
        # structure tags (<table>/<tr>/<td>) carries no meaning.
        if not text or (self.in_table and self._cell_parts is None):
            return
        self._emit_separator()

    # ------------------------------------------------------------------
    # Table rendering
    # ------------------------------------------------------------------

    @staticmethod
    def _format_row(cells) -> str:
        """Render one table row, escaping '|' so it cannot split a cell."""
        escaped = [cell.replace('|', '\\|') for cell in cells]
        return '| ' + ' | '.join(escaped) + ' |\n'

    def _process_table(self) -> None:
        """Append the collected table to the document as a pipe table.

        The first collected row is used as the header.  Shorter rows are
        padded in place with empty cells to the header's width.
        """
        if not self.table_data:
            return

        header = self.table_data[0]
        out = self.markdown

        out.append('\n')
        out.append(self._format_row(header))
        out.append('| ' + ' | '.join(['---'] * len(header)) + ' |\n')

        for row in self.table_data[1:]:
            row.extend([''] * (len(header) - len(row)))
            out.append(self._format_row(row))

        out.append('\n')

    # ------------------------------------------------------------------
    # Public helpers
    # ------------------------------------------------------------------

    def get_markdown(self) -> str:
        """Return the Markdown produced so far as a single string."""
        return ''.join(self.markdown)

    def reset(self):
        """Reset the parser state for reuse."""
        super().reset()
        self.markdown = []
        self.current_tag_stack = []
        self.list_stack = []
        self.link_stack = []
        self.in_pre = False
        self.in_code = False
        self.in_table = False
        self.table_data = []
        self.current_row = []
        # Chunk buffer of the open table cell; None when no cell is open.
        self._cell_parts = None