social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
1from html.parser import HTMLParser 2from typing import override 3import cross.fragments as f 4 5class HTMLToFragmentsParser(HTMLParser): 6 def __init__(self) -> None: 7 super().__init__() 8 self.builder: bytearray = bytearray() 9 self.fragments: list[f.Fragment] = [] 10 11 self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {} 12 self.in_pre: bool = False 13 self.in_code: bool = False 14 self.invisible: bool = False 15 16 def handle_a_endtag(self): 17 current_end = len(self.builder) 18 start, _attr = self._tag_stack.pop("a") 19 20 href = _attr.get('href') 21 if href and current_end > start: 22 self.fragments.append( 23 f.LinkFragment(start=start, end=current_end, url=href) 24 ) 25 26 def append_newline(self): 27 if self.builder and not self.builder.endswith(b"\n"): 28 self.builder.extend(b"\n") 29 30 @override 31 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: 32 _attr = dict(attrs) 33 34 if self.invisible: 35 return 36 37 match tag: 38 case "p": 39 cls = _attr.get('class', '') 40 if cls and 'quote-inline' in cls: 41 self.invisible = True 42 case "a": 43 self._tag_stack["a"] = (len(self.builder), _attr) 44 case "code": 45 if not self.in_pre: 46 self.builder.extend(b"`") 47 self.in_code = True 48 case "pre": 49 self.append_newline() 50 self.builder.extend(b"```\n") 51 self.in_pre = True 52 case "blockquote": 53 self.append_newline() 54 self.builder.extend(b"> ") 55 case "strong" | "b": 56 self.builder.extend(b"**") 57 case "em" | "i": 58 self.builder.extend(b"*") 59 case "del" | "s": 60 self.builder.extend(b"~~") 61 case "br": 62 self.builder.extend(b"\n") 63 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6": 64 level = int(tag[1]) 65 self.builder.extend(("\n" + "#" * level + " ").encode('utf-8')) 66 case _: 67 #self.builder.extend(f"<{tag}>".encode("utf-8")) 68 pass 69 70 71 @override 72 def handle_endtag(self, tag: str) -> None: 73 if self.invisible: 74 if tag == "p": 75 self.invisible = False 76 return 77 78 match tag: 79 case "a": 80 if "a" in self._tag_stack: 81 self.handle_a_endtag() 82 case "code": 83 if not self.in_pre and self.in_code: 84 self.builder.extend(b"`") 85 self.in_code = False 86 case "pre": 87 self.append_newline() 88 self.builder.extend(b"```\n") 89 self.in_pre = False 90 case "blockquote": 91 self.builder.extend(b"\n") 92 case "strong" | "b": 93 self.builder.extend(b"**") 94 case "em" | "i": 95 self.builder.extend(b"*") 96 case "del" | "s": 97 self.builder.extend(b"~~") 98 case "p": 99 self.builder.extend(b"\n\n") 100 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6": 101 self.builder.extend(b'\n') 102 case _: 103 #self.builder.extend(f"</{tag}>".encode("utf-8")) 104 pass 105 106 @override 107 def handle_data(self, data: str) -> None: 108 if not self.invisible: 109 self.builder.extend(data.encode('utf-8')) 110 111 def get_result(self) -> tuple[str, list[f.Fragment]]: 112 if self.builder.endswith(b'\n\n'): 113 return self.builder[:-2].decode('utf-8'), self.fragments 114 if self.builder.endswith(b'\n'): 115 return self.builder[:-1].decode('utf-8'), self.fragments 116 return self.builder.decode('utf-8'), self.fragments