Social media crossposting tool. Third time's the charm.
mastodon
misskey
crossposting
bluesky
1import re
2
3from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
4from util.html import HTMLToTokensParser
5from util.splitter import canonical_label
6
# Absolute URL with an explicit scheme ("scheme://...") or a mailto: link,
# greedily consumed up to the next whitespace character.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link: [label](href). The href must carry an explicit scheme
# (or mailto:) and may be padded with whitespace inside the parentheses.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink: <href> with an explicit scheme (or mailto:).
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# "#word", not preceded by a word character (so "a#b" is not a hashtag).
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
# Fediverse mention: "@user" or "@user@host.tld", not preceded by a word
# character or another "@". The host part, when present, must contain a dot.
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")

# All token-level patterns recognized by MarkdownParser.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
19
20
# TODO autolinks are broken by the html parser
class MarkdownParser:
    """Tokenize HTML-with-markdown-remnants post text into a flat token list.

    The input is first run through ``HTMLToTokensParser``; plain-text tokens
    and non-canonical link tokens are then re-scanned character by character
    for markdown inline links, autolinks, bare URLs, hashtags and fediverse
    mentions.
    """

    def parse(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Parse *text* into tokens.

        Args:
            text: Raw post body (HTML possibly containing markdown).
            tags: Known hashtags. The matched tag text is lowercased before
                the membership test, so entries are assumed to already be
                lowercase — TODO confirm against callers.
            handles: Known mention handles as pairs; a mention is emitted
                when the matched "@user[@host]" string equals either element
                of a pair (first matching pair wins).

        Returns:
            Flat list of Text/Link/Tag/Mention tokens; ``[]`` for empty input.
        """
        if not text:
            return []

        tokenizer = HTMLToTokensParser()
        tokenizer.feed(text)

        tokens: list[Token] = []
        for tk in tokenizer.get_result():
            if isinstance(tk, TextToken):
                tokens.extend(self.__tokenize_md(tk.text, tags, handles))
            elif isinstance(tk, LinkToken):
                # A label-less link, or one whose label is just the URL
                # itself, passes through untouched; any other label may hide
                # markdown, so rebuild the markdown form and re-scan it.
                # NOTE(review): a label containing "]" will not re-parse as
                # an inline link and degrades to text plus a bare URL token.
                if not tk.label or canonical_label(tk.label, tk.href):
                    tokens.append(tk)
                else:
                    tokens.extend(
                        self.__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles)
                    )
            else:
                tokens.append(tk)

        return tokens

    def __tokenize_md(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Single left-to-right pass over *text*, emitting tokens in order.

        At each position, the cheap first-character checks gate the regex
        probes; a bare URL may start anywhere. Characters claimed by no
        token accumulate in a buffer that is flushed as one TextToken.
        """
        index = 0
        total = len(text)
        buffer: list[str] = []
        tokens: list[Token] = []

        # Hoisted lookup structures: O(1) membership instead of rescanning
        # the lists at every candidate position. Semantics are unchanged:
        # set membership equals list membership, and setdefault keeps the
        # first pair containing a handle, exactly like the former next().
        tag_set = set(tags)
        handle_map: dict[str, tuple[str, str]] = {}
        for pair in handles:
            for h in pair:
                handle_map.setdefault(h, pair)

        def flush():
            # Emit any buffered plain text as a single TextToken.
            nonlocal buffer
            if buffer:
                tokens.append(TextToken(text="".join(buffer)))
                buffer = []

        while index < total:
            char = text[index]

            if char == "[":
                md_inline = MD_INLINE_LINK.match(text, index)
                if md_inline:
                    flush()
                    tokens.append(
                        LinkToken(href=md_inline.group(2), label=md_inline.group(1))
                    )
                    index = md_inline.end()
                    continue

            if char == "<":
                md_auto = MD_AUTOLINK.match(text, index)
                if md_auto:
                    flush()
                    tokens.append(LinkToken(href=md_auto.group(1), label=None))
                    index = md_auto.end()
                    continue

            if char == "#":
                tag = HASHTAG.match(text, index)
                # Unknown hashtags fall through and are kept as plain text.
                if tag and tag.group(1).lower() in tag_set:
                    flush()
                    tokens.append(TagToken(tag=tag.group(1)))
                    index = tag.end()
                    continue

            if char == "@":
                handle = FEDIVERSE_HANDLE.match(text, index)
                if handle:
                    match = handle_map.get(handle.group(0))
                    if match:
                        flush()
                        tokens.append(
                            MentionToken(username=match[1], uri=None)
                        )  # TODO: misskey doesn't provide a uri
                        index = handle.end()
                        continue

            # Fallback: a bare URL can start at any position.
            url = URL.match(text, index)
            if url:
                flush()
                tokens.append(LinkToken(href=url.group(0), label=None))
                index = url.end()
                continue

            # Nothing starts here; keep the character as plain text.
            buffer.append(char)
            index += 1

        flush()
        return tokens