social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1from html.parser import HTMLParser
2from typing import override
3import cross.fragments as f
4
5class HTMLToFragmentsParser(HTMLParser):
6 def __init__(self) -> None:
7 super().__init__()
8 self.builder: bytearray = bytearray()
9 self.fragments: list[f.Fragment] = []
10
11 self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
12 self.in_pre: bool = False
13 self.in_code: bool = False
14 self.invisible: bool = False
15
16 def handle_a_endtag(self):
17 current_end = len(self.builder)
18 start, _attr = self._tag_stack.pop("a")
19
20 href = _attr.get('href')
21 if href and current_end > start:
22 self.fragments.append(
23 f.LinkFragment(start=start, end=current_end, url=href)
24 )
25
26 def append_newline(self):
27 if self.builder and not self.builder.endswith(b"\n"):
28 self.builder.extend(b"\n")
29
30 @override
31 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
32 _attr = dict(attrs)
33
34 if self.invisible:
35 return
36
37 match tag:
38 case "p":
39 cls = _attr.get('class', '')
40 if cls and 'quote-inline' in cls:
41 self.invisible = True
42 case "a":
43 self._tag_stack["a"] = (len(self.builder), _attr)
44 case "code":
45 if not self.in_pre:
46 self.builder.extend(b"`")
47 self.in_code = True
48 case "pre":
49 self.append_newline()
50 self.builder.extend(b"```\n")
51 self.in_pre = True
52 case "blockquote":
53 self.append_newline()
54 self.builder.extend(b"> ")
55 case "strong" | "b":
56 self.builder.extend(b"**")
57 case "em" | "i":
58 self.builder.extend(b"*")
59 case "del" | "s":
60 self.builder.extend(b"~~")
61 case "br":
62 self.builder.extend(b"\n")
63 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6":
64 level = int(tag[1])
65 self.builder.extend(("\n" + "#" * level + " ").encode('utf-8'))
66 case _:
67 #self.builder.extend(f"<{tag}>".encode("utf-8"))
68 pass
69
70
71 @override
72 def handle_endtag(self, tag: str) -> None:
73 if self.invisible:
74 if tag == "p":
75 self.invisible = False
76 return
77
78 match tag:
79 case "a":
80 if "a" in self._tag_stack:
81 self.handle_a_endtag()
82 case "code":
83 if not self.in_pre and self.in_code:
84 self.builder.extend(b"`")
85 self.in_code = False
86 case "pre":
87 self.append_newline()
88 self.builder.extend(b"```\n")
89 self.in_pre = False
90 case "blockquote":
91 self.builder.extend(b"\n")
92 case "strong" | "b":
93 self.builder.extend(b"**")
94 case "em" | "i":
95 self.builder.extend(b"*")
96 case "del" | "s":
97 self.builder.extend(b"~~")
98 case "p":
99 self.builder.extend(b"\n\n")
100 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6":
101 self.builder.extend(b'\n')
102 case _:
103 #self.builder.extend(f"</{tag}>".encode("utf-8"))
104 pass
105
106 @override
107 def handle_data(self, data: str) -> None:
108 if not self.invisible:
109 self.builder.extend(data.encode('utf-8'))
110
111 def get_result(self) -> tuple[str, list[f.Fragment]]:
112 if self.builder.endswith(b'\n\n'):
113 return self.builder[:-2].decode('utf-8'), self.fragments
114 if self.builder.endswith(b'\n'):
115 return self.builder[:-1].decode('utf-8'), self.fragments
116 return self.builder.decode('utf-8'), self.fragments