social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1import re
2import cross.fragments as f
3from util.html import HTMLToFragmentsParser
4
# Bare URL: a "scheme://" or "mailto:" prefix followed by a run of
# non-whitespace characters.
URL = re.compile(
    r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+",
    flags=re.IGNORECASE,
)

# Markdown inline link "[label](url)".
# Group 1 is the label, group 2 is the URL.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]"  # [label]
    r"\(\s*"  # opening parenthesis, optional padding
    r"((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)"  # url
    r"\s*\)",  # optional padding, closing parenthesis
    flags=re.IGNORECASE,
)

# Markdown autolink "<url>". Group 1 is the URL without the angle brackets.
MD_AUTOLINK = re.compile(
    r"<"
    r"((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)"
    r">",
    flags=re.IGNORECASE,
)

# "#tag" that is not glued to a preceding word character.
# Group 1 is the tag without the leading "#".
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")

# "@user" or "@user@host.tld" that is not preceded by a word character or
# another "@". Group 1 is the user part, group 2 the optional host.
FEDIVERSE_HANDLE = re.compile(
    r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?"
)

# Every pattern defined above, in declaration order.
REGEXES = [
    URL,
    MD_INLINE_LINK,
    MD_AUTOLINK,
    HASHTAG,
    FEDIVERSE_HANDLE,
]
17
18
19# TODO autolinks are broken by the html parser
20class MarkdownParser:
21 def parse(self, text: str) -> tuple[str, list[f.Fragment]]:
22 if not text:
23 return "", []
24
25 html_parser = HTMLToFragmentsParser()
26 html_parser.feed(text)
27 markdown, fragments = html_parser.get_result()
28
29 index: int = 0
30 total: int = len(markdown)
31
32 # no match == processed fragments
33 events: list[tuple[int, int, re.Match[str] | None, str]] = []
34 events.extend([(fg.start, fg.end, None, "html") for fg in fragments])
35 while index < total:
36 ch = markdown[index]
37 rmatch = None
38 kind = None
39
40 if ch == "[":
41 rmatch = MD_INLINE_LINK.match(markdown, index)
42 kind = "inline_link"
43 # elif ch == '<':
44 # rmatch = MD_AUTOLINK.match(markdown, index)
45 # kind = "autolink"
46 elif ch == "#":
47 rmatch = HASHTAG.match(markdown, index)
48 kind = "hashtag"
49 elif ch == "@":
50 rmatch = FEDIVERSE_HANDLE.match(markdown, index)
51 kind = "mention"
52 else:
53 rmatch = URL.match(markdown, index)
54 kind = "url"
55
56 if rmatch:
57 start, end = rmatch.start(), rmatch.end()
58 if end == index:
59 index += 1
60 continue
61 events.append((start, end, rmatch, kind))
62 index = end
63 continue
64
65 index += 1
66
67 events.sort(key=lambda x: x[0])
68
69 # validate fragment positions
70 last_end: int = 0
71 for start, end, _, _ in events:
72 if start > end:
73 raise Exception(f"Invalid fragment position start={start}, end={end}")
74 if last_end > start:
75 raise Exception(
76 f"Overlapping text fragments at position end={last_end}, start={start}"
77 )
78 last_end = end
79
80 def update_fragments(start: int, s, offset: int):
81 nonlocal fragments
82
83 for fg in fragments:
84 if fg != s and fg.start >= start:
85 fg.start += offset
86 fg.end += offset
87
88 new_text = ""
89 last_pos = 0
90 for start, end, rmatch, event in events:
91 if start > last_pos:
92 new_text += markdown[last_pos:start]
93
94 if not rmatch:
95 new_text += markdown[start:end]
96 last_pos = end
97 continue
98
99 match event:
100 case "inline_link":
101 label = rmatch.group(1)
102 href = rmatch.group(2)
103 fg = f.LinkFragment(start=start, end=start + len(label), url=href)
104 fragments.append(fg)
105 update_fragments(start, fg, -(end - (start + len(label))))
106 new_text += label
107 # case "autolink":
108 # url = rmatch.group(0)
109 # fg = f.LinkFragment(start=start, end=end - 2, url=url)
110 # fragments.append(fg)
111 # update_fragments(start, fg, -2)
112 # new_text += url
113 case "hashtag":
114 tag = rmatch.group(0)
115 fragments.append(
116 f.TagFragment(
117 start=start,
118 end=end,
119 tag=tag[1:] if tag.startswith("#") else tag,
120 )
121 )
122 new_text += markdown[start:end]
123 case "mention":
124 mention = rmatch.group(0)
125 fragments.append(
126 f.MentionFragment(
127 start=start,
128 end=end,
129 uri=mention[1:] if mention.startswith("@") else mention,
130 )
131 )
132 new_text += markdown[start:end]
133 case "url":
134 url = rmatch.group(0)
135 fragments.append(f.LinkFragment(start=start, end=end, url=url))
136 new_text += markdown[start:end]
137 case _:
138 pass
139 last_pos = end
140 if last_pos < len(markdown):
141 new_text += markdown[last_pos:]
142
143 return new_text, fragments