# Social media crossposting tool — third time's the charm.
# Crossposting targets: Mastodon, Misskey, Bluesky.
1from dataclasses import replace
2import re
3import cross.fragments as f
4from util.html import HTMLToFragmentsParser
5
# Bare absolute URL (any scheme://) or mailto: link.
URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link: [label](url). Group 1 = label, group 2 = url.
MD_INLINE_LINK = re.compile(
    rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink: <url>. Group 1 = url. (Currently unused in the scan loop.)
MD_AUTOLINK = re.compile(
    rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# Hashtag not preceded by a word character. Group 1 = tag text without '#'.
HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)")
# Fediverse handle: @user or @user@instance.tld.
# Group 1 = user part, group 2 = optional instance domain.
FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")

REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
18
19
20# TODO autolinks are broken by the html parser
21class MarkdownParser:
22 def parse(self, text: str) -> tuple[str, list[f.Fragment]]:
23 if not text:
24 return "", []
25
26 html_parser = HTMLToFragmentsParser()
27 html_parser.feed(text)
28 markdown, fragments = html_parser.get_result()
29
30 markdown_bytes: bytes = markdown.encode("utf-8")
31
32 index: int = 0
33 total: int = len(markdown_bytes)
34
35 events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = []
36 events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
37
38 while index < total:
39 ch: int = markdown_bytes[index]
40 rmatch: re.Match[bytes] | None = None
41 kind = None
42
43 if ch == b"["[0]:
44 rmatch = MD_INLINE_LINK.match(markdown_bytes, index)
45 kind = "inline_link"
46 # elif ch == b"<"[0]:
47 # rmatch = MD_AUTOLINK.match(markdown_bytes, index)
48 # kind = "autolink"
49 elif ch == b"#"[0]:
50 rmatch = HASHTAG.match(markdown_bytes, index)
51 kind = "hashtag"
52 elif ch == b"@"[0]:
53 rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index)
54 kind = "mention"
55 else:
56 rmatch = URL.match(markdown_bytes, index)
57 kind = "url"
58
59 if rmatch:
60 start, end = rmatch.start(), rmatch.end()
61 if end == index:
62 index += 1
63 continue
64 events.append((start, end, rmatch, kind))
65 index = end
66 continue
67
68 index += 1
69
70 events.sort(key=lambda x: x[0])
71
72 last_end: int = 0
73 for start, end, _, _ in events:
74 if start > end:
75 raise Exception(f"Invalid fragment position start={start}, end={end}")
76 if last_end > start:
77 raise Exception(
78 f"Overlapping text fragments at position end={last_end}, start={start}"
79 )
80 last_end = end
81
82 ntext: bytearray = bytearray()
83 nfragments: list[f.Fragment] = []
84
85 offset: int = 0
86 last_index: int = 0
87
88 for start, end, rmatch, event in events:
89 ntext.extend(markdown_bytes[last_index:start])
90
91 if isinstance(rmatch, f.Fragment):
92 ntext.extend(markdown_bytes[start:end])
93 nfg = replace(rmatch, start=start + offset, end=end + offset)
94 nfragments.append(nfg)
95 last_index = end
96 continue
97
98 nstart = start + offset
99 match event:
100 case "inline_link":
101 label_bytes: bytes = rmatch.group(1)
102 href_bytes: bytes = rmatch.group(2)
103
104 ntext.extend(label_bytes)
105
106 delta = len(label_bytes) - (end - start)
107 offset += delta
108
109 nend = nstart + len(label_bytes)
110 nfragments.append(
111 f.LinkFragment(
112 start=nstart, end=nend, url=href_bytes.decode("utf-8")
113 )
114 )
115
116 case "hashtag":
117 tag_bytes: bytes = rmatch.group(1)
118 ntext.extend(markdown_bytes[start:end])
119 nend = end + offset
120 nfragments.append(
121 f.TagFragment(
122 start=nstart, end=nend, tag=tag_bytes.decode("utf-8")
123 )
124 )
125
126 case "mention":
127 mention_bytes: bytes = rmatch.group(0)
128 ntext.extend(markdown_bytes[start:end])
129
130 mention_str = mention_bytes.decode("utf-8")
131 mention_str = (
132 mention_str[1:] if mention_str.startswith("@") else mention_str
133 )
134
135 nend = end + offset
136 nfragments.append(
137 f.MentionFragment(start=nstart, end=nend, uri=mention_str)
138 )
139
140 case "url":
141 url_bytes: bytes = rmatch.group(0)
142 ntext.extend(markdown_bytes[start:end])
143 nend = end + offset
144 nfragments.append(
145 f.LinkFragment(
146 start=nstart, end=nend, url=url_bytes.decode("utf-8")
147 )
148 )
149
150 case _:
151 pass
152 last_index = end
153
154 ntext.extend(markdown_bytes[last_index:])
155
156 return ntext.decode("utf-8"), nfragments