social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1from dataclasses import replace
2import re
3import cross.fragments as f
4from util.html import HTMLToFragmentsParser
5
# Absolute URL: a "scheme://" prefix (letter followed by letters, digits, "+",
# "." or "-") or a bare "mailto:", then everything up to the next whitespace.
# NOTE(review): because the tail is `[^\s]+`, sentence punctuation directly
# after a URL ("see https://example.com.") is captured as part of the URL —
# confirm whether that is intended before tightening it.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)

# Markdown inline link `[label](href)`.
# group(1) = label, group(2) = href (same scheme rule as URL; the href stops at
# whitespace or ")").
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)

# Markdown autolink `<href>`. group(1) = href.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)

# Hashtag: "#" not preceded by a word character, then one or more word
# characters. group(1) = tag without the "#".
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")

# Handle of the form "@user" or "@user@host.tld"; the leading "@" must not be
# preceded by a word character or another "@".
# group(1) = user, group(2) = host (None when absent; a host needs at least one
# dot).
# Both parts must END on a word character so that punctuation following a
# mention in running text ("thanks @alice." / "cc @bob@example.com.") is left
# in the surrounding text instead of being swallowed into the handle.
FEDIVERSE_HANDLE = re.compile(
    r"(?<![\w@])@([\w\.-]*\w)(?:@([\w\.-]+\.[\w\.-]*\w))?"
)

# Every pattern defined by this module, in declaration order.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
18
19
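# TODO: Markdown autolinks (`<https://...>`) are broken by the HTML parser, so
# MarkdownParser.parse() currently leaves the MD_AUTOLINK branch disabled.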
21class MarkdownParser:
22 def parse(self, text: str) -> tuple[str, list[f.Fragment]]:
23 if not text:
24 return "", []
25
26 html_parser = HTMLToFragmentsParser()
27 html_parser.feed(text)
28 markdown, fragments = html_parser.get_result()
29
30 index: int = 0
31 total: int = len(markdown)
32
33 # no match == processed fragments
34 events: list[tuple[int, int, re.Match[str] | f.Fragment, str]] = []
35 events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
36 while index < total:
37 ch = markdown[index]
38 rmatch = None
39 kind = None
40
41 if ch == "[":
42 rmatch = MD_INLINE_LINK.match(markdown, index)
43 kind = "inline_link"
44 # elif ch == '<':
45 # rmatch = MD_AUTOLINK.match(markdown, index)
46 # kind = "autolink"
47 elif ch == "#":
48 rmatch = HASHTAG.match(markdown, index)
49 kind = "hashtag"
50 elif ch == "@":
51 rmatch = FEDIVERSE_HANDLE.match(markdown, index)
52 kind = "mention"
53 else:
54 rmatch = URL.match(markdown, index)
55 kind = "url"
56
57 if rmatch:
58 start, end = rmatch.start(), rmatch.end()
59 if end == index:
60 index += 1
61 continue
62 events.append((start, end, rmatch, kind))
63 index = end
64 continue
65
66 index += 1
67
68 events.sort(key=lambda x: x[0])
69
70 # validate fragment positions
71 last_end: int = 0
72 for start, end, _, _ in events:
73 if start > end:
74 raise Exception(f"Invalid fragment position start={start}, end={end}")
75 if last_end > start:
76 raise Exception(
77 f"Overlapping text fragments at position end={last_end}, start={start}"
78 )
79 last_end = end
80
81 ntext: list[str] = []
82 nfragments: list[f.Fragment] = []
83
84 offset: int = 0
85 last_index: int = 0
86
87 events.sort(key=lambda x: x[0])
88 for start, end, rmatch, event in events:
89 ntext.append(markdown[last_index:start])
90
91 if isinstance(rmatch, f.Fragment):
92 ntext.append(markdown[start:end])
93 nfg = replace(rmatch, start=start + offset, end=end + offset)
94 nfragments.append(nfg)
95 last_index = end
96 continue
97
98 nstart = start + offset
99 nend = end + offset
100 match event:
101 case "inline_link":
102 label = rmatch.group(1)
103 href = rmatch.group(2)
104 ntext.append(label)
105
106 delta = len(label) - (end - start)
107 offset += delta
108
109 nfragments.append(f.LinkFragment(start=nstart, end=nstart + len(label), url=href))
110 case "hashtag":
111 tag = rmatch.group(1)
112 ntext.append(markdown[start:end])
113 nfragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))
114 case "mention":
115 mention = rmatch.group(0)
116 ntext.append(markdown[start:end])
117 mention = mention[1:] if mention.startswith("@") else mention
118 nfragments.append(f.MentionFragment(start=nstart, end=nend, uri=mention))
119 case "url":
120 url = rmatch.group(0)
121 ntext.append(markdown[start:end])
122 nfragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
123 case _:
124 pass
125 last_index = end
126 ntext.append(markdown[last_index:])
127
128 return ''.join(ntext), nfragments