social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1from html.parser import HTMLParser
2from typing import override
3import cross.fragments as f
4
5
6class HTMLToFragmentsParser(HTMLParser):
7 def __init__(self) -> None:
8 super().__init__()
9 self.text: str = ""
10 self.fragments: list[f.Fragment] = []
11
12 self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
13 self.in_pre: bool = False
14 self.in_code: bool = False
15
16 self.invisible: bool = False
17
18 def handle_a_endtag(self):
19 current_end = len(self.text)
20 start, _attr = self._tag_stack.pop("a")
21
22 href = _attr.get('href')
23 if href and current_end > start:
24 self.fragments.append(
25 f.LinkFragment(start=start, end=current_end, url=href)
26 )
27
28 @override
29 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
30 _attr = dict(attrs)
31
32 def append_newline():
33 if self.text and not self.text.endswith("\n"):
34 self.text += "\n"
35
36 if self.invisible:
37 return
38
39 match tag:
40 case "p":
41 cls = _attr.get('class', '')
42 if cls and 'quote-inline' in cls:
43 self.invisible = True
44 case "a":
45 self._tag_stack["a"] = (len(self.text), _attr)
46 case "code":
47 if not self.in_pre:
48 self.text += "`"
49 self.in_code = True
50 case "pre":
51 append_newline()
52 self.text += "```\n"
53 self.in_pre = True
54 case "blockquote":
55 append_newline()
56 self.text += "> "
57 case "strong" | "b":
58 self.text += "**"
59 case "em" | "i":
60 self.text += "*"
61 case "del" | "s":
62 self.text += "~~"
63 case "br":
64 self.text += "\n"
65 case _:
66 if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
67 level = int(tag[1])
68 self.text += "\n" + "#" * level + " "
69
70 @override
71 def handle_endtag(self, tag: str) -> None:
72 if self.invisible:
73 if tag == "p":
74 self.invisible = False
75 return
76
77 match tag:
78 case "a":
79 if "a" in self._tag_stack:
80 self.handle_a_endtag()
81 case "code":
82 if not self.in_pre and self.in_code:
83 self.text += "`"
84 self.in_code = False
85 case "pre":
86 self.text += "\n```\n"
87 self.in_pre = False
88 case "blockquote":
89 self.text += "\n"
90 case "strong" | "b":
91 self.text += "**"
92 case "em" | "i":
93 self.text += "*"
94 case "del" | "s":
95 self.text += "~~"
96 case "p":
97 self.text += "\n\n"
98 case _:
99 if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
100 self.text += '\n'
101
102 @override
103 def handle_data(self, data: str) -> None:
104 if not self.invisible:
105 self.text += data
106
107 def get_result(self) -> tuple[str, list[f.Fragment]]:
108 if self.text.endswith('\n\n'):
109 return self.text[:-2], self.fragments
110 return self.text, self.fragments