social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
from html.parser import HTMLParser
from typing import override

import cross.fragments as f

5class HTMLToFragmentsParser(HTMLParser):
6 def __init__(self) -> None:
7 super().__init__()
8 self.builder: bytearray = bytearray()
9 self.fragments: list[f.Fragment] = []
10
11 self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
12 self.in_pre: bool = False
13 self.in_code: bool = False
14 self.invisible: bool = False
15
16 def handle_a_endtag(self):
17 current_end = len(self.builder)
18 start, _attr = self._tag_stack.pop("a")
19
20 href = _attr.get('href')
21 if href and current_end > start:
22 self.fragments.append(
23 f.LinkFragment(start=start, end=current_end, url=href)
24 )
25
26 @override
27 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
28 _attr = dict(attrs)
29
30 def append_newline():
31 if self.builder and not self.builder.endswith(b"\n"):
32 self.builder.extend(b"\n")
33
34 if self.invisible:
35 return
36
37 match tag:
38 case "p":
39 cls = _attr.get('class', '')
40 if cls and 'quote-inline' in cls:
41 self.invisible = True
42 case "a":
43 self._tag_stack["a"] = (len(self.builder), _attr)
44 case "code":
45 if not self.in_pre:
46 self.builder.extend(b"`")
47 self.in_code = True
48 case "pre":
49 append_newline()
50 self.builder.extend(b"```\n")
51 self.in_pre = True
52 case "blockquote":
53 append_newline()
54 self.builder.extend(b"> ")
55 case "strong" | "b":
56 self.builder.extend(b"**")
57 case "em" | "i":
58 self.builder.extend(b"*")
59 case "del" | "s":
60 self.builder.extend(b"~~")
61 case "br":
62 self.builder.extend(b"\n")
63 case _:
64 if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
65 level = int(tag[1])
66 self.builder.extend(("\n" + "#" * level + " ").encode('utf-8'))
67
68 @override
69 def handle_endtag(self, tag: str) -> None:
70 if self.invisible:
71 if tag == "p":
72 self.invisible = False
73 return
74
75 match tag:
76 case "a":
77 if "a" in self._tag_stack:
78 self.handle_a_endtag()
79 case "code":
80 if not self.in_pre and self.in_code:
81 self.builder.extend(b"`")
82 self.in_code = False
83 case "pre":
84 self.builder.extend(b"\n```\n")
85 self.in_pre = False
86 case "blockquote":
87 self.builder.extend(b"\n")
88 case "strong" | "b":
89 self.builder.extend(b"**")
90 case "em" | "i":
91 self.builder.extend(b"*")
92 case "del" | "s":
93 self.builder.extend(b"~~")
94 case "p":
95 self.builder.extend(b"\n\n")
96 case _:
97 if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
98 self.builder.extend(b'\n')
99
100 @override
101 def handle_data(self, data: str) -> None:
102 if not self.invisible:
103 self.builder.extend(data.encode('utf-8'))
104
105 def get_result(self) -> tuple[str, list[f.Fragment]]:
106 if self.builder.endswith(b'\n\n'):
107 return self.builder[:-2].decode('utf-8'), self.fragments
108 return self.builder.decode('utf-8'), self.fragments