social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
1from html.parser import HTMLParser 2from typing import override 3import cross.fragments as f 4 5class HTMLToFragmentsParser(HTMLParser): 6 def __init__(self) -> None: 7 super().__init__() 8 self.builder: bytearray = bytearray() 9 self.fragments: list[f.Fragment] = [] 10 11 self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {} 12 self.in_pre: bool = False 13 self.in_code: bool = False 14 self.invisible: bool = False 15 16 def handle_a_endtag(self): 17 current_end = len(self.builder) 18 start, _attr = self._tag_stack.pop("a") 19 20 href = _attr.get('href') 21 if href and current_end > start: 22 self.fragments.append( 23 f.LinkFragment(start=start, end=current_end, url=href) 24 ) 25 26 @override 27 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: 28 _attr = dict(attrs) 29 30 def append_newline(): 31 if self.builder and not self.builder.endswith(b"\n"): 32 self.builder.extend(b"\n") 33 34 if self.invisible: 35 return 36 37 match tag: 38 case "p": 39 cls = _attr.get('class', '') 40 if cls and 'quote-inline' in cls: 41 self.invisible = True 42 case "a": 43 self._tag_stack["a"] = (len(self.builder), _attr) 44 case "code": 45 if not self.in_pre: 46 self.builder.extend(b"`") 47 self.in_code = True 48 case "pre": 49 append_newline() 50 self.builder.extend(b"```\n") 51 self.in_pre = True 52 case "blockquote": 53 append_newline() 54 self.builder.extend(b"> ") 55 case "strong" | "b": 56 self.builder.extend(b"**") 57 case "em" | "i": 58 self.builder.extend(b"*") 59 case "del" | "s": 60 self.builder.extend(b"~~") 61 case "br": 62 self.builder.extend(b"\n") 63 case _: 64 if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}: 65 level = int(tag[1]) 66 self.builder.extend(("\n" + "#" * level + " ").encode('utf-8')) 67 68 @override 69 def handle_endtag(self, tag: str) -> None: 70 if self.invisible: 71 if tag == "p": 72 self.invisible = False 73 return 74 75 match tag: 76 case "a": 77 if "a" in self._tag_stack: 78 self.handle_a_endtag() 79 case "code": 80 if not self.in_pre and self.in_code: 81 self.builder.extend(b"`") 82 self.in_code = False 83 case "pre": 84 self.builder.extend(b"\n```\n") 85 self.in_pre = False 86 case "blockquote": 87 self.builder.extend(b"\n") 88 case "strong" | "b": 89 self.builder.extend(b"**") 90 case "em" | "i": 91 self.builder.extend(b"*") 92 case "del" | "s": 93 self.builder.extend(b"~~") 94 case "p": 95 self.builder.extend(b"\n\n") 96 case _: 97 if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: 98 self.builder.extend(b'\n') 99 100 @override 101 def handle_data(self, data: str) -> None: 102 if not self.invisible: 103 self.builder.extend(data.encode('utf-8')) 104 105 def get_result(self) -> tuple[str, list[f.Fragment]]: 106 if self.builder.endswith(b'\n\n'): 107 return self.builder[:-2].decode('utf-8'), self.fragments 108 return self.builder.decode('utf-8'), self.fragments