social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1import re
2
3import cross
4import util.html_util as html_util
5import util.util as util
6
# Bare URL: a "scheme://" prefix or "mailto:", followed by everything up to the
# next whitespace character.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link "[label](href)": group 1 is the label, group 2 the href.
# The href follows the same scheme rule as URL and stops at whitespace or ")".
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink "<href>": group 1 is the href.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# "#tag" that is not directly preceded by a word character; group 1 is the tag
# name without the leading "#".
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
# "@user" or "@user@host.tld" that is not directly preceded by a word character
# or another "@". Group 1 is the username, group 2 the host (None when absent).
# Both parts may contain "." and "-" but must end on a word character, so
# sentence punctuation right after a mention ("thanks @alice.") is not swallowed
# into the handle; callers compare the whole match for equality against known
# handles, and a trailing "." would make that lookup fail.
FEDIVERSE_HANDLE = re.compile(
    r"(?<![\w@])@(\w+(?:[\w.-]*\w)?)(?:@(\w[\w.-]*\.[\w.-]*\w))?"
)
17
18
def tokenize_markdown(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Tokenize post text that may mix HTML markup with Markdown syntax.

    The text is first run through ``html_util.HTMLPostTokenizer`` (configured
    with the given ``handles`` and ``tags``). The resulting tokens are then
    post-processed:

    * text tokens are scanned for Markdown links, hashtags, mentions and
      bare URLs;
    * link tokens with an empty label, or whose label
      ``util.canonical_label`` accepts for the href, are kept unchanged;
    * any other link token is rendered as ``[label](href)`` and scanned
      like text;
    * every other token is passed through untouched.

    Args:
        text: Raw post content.
        tags: Known hashtag names; scanned tags are lower-cased before the
            membership test.
        handles: Known mention handles as pairs; the second element of the
            matching pair is used for the mention token.

    Returns:
        The token list, or an empty list when ``text`` is empty.
    """
    if not text:
        return []

    parser = html_util.HTMLPostTokenizer()
    parser.mentions = handles
    parser.tags = tags
    parser.feed(text)

    result: list[cross.Token] = []

    for token in parser.get_tokens():
        if isinstance(token, cross.TextToken):
            result += __tokenize_md(token.text, tags, handles)
            continue

        # Only links carrying a custom (non-canonical) label take the
        # Markdown round trip; everything else is kept as-is.
        if (
            isinstance(token, cross.LinkToken)
            and token.label
            and not util.canonical_label(token.label, token.href)
        ):
            markdown_link = f"[{token.label}]({token.href})"
            result += __tokenize_md(markdown_link, tags, handles)
            continue

        result.append(token)

    return result
46
47
def __tokenize_md(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Scan plain text left to right and split it into tokens.

    At each position the scanner tries, depending on the current character:

    * ``[`` - a Markdown inline link ``[label](href)``;
    * ``<`` - a Markdown autolink ``<href>``;
    * ``#`` - a hashtag, accepted only if its lower-cased name is in ``tags``;
    * ``@`` - a handle, accepted only if the matched text equals an element
      of one of the ``handles`` pairs.

    If none of those produced a token, a bare URL is tried at the same
    position. Characters that start no token are accumulated and emitted as
    a single text token in front of the next non-text token (or at the end).

    Args:
        text: The text to scan.
        tags: Known hashtag names.
        handles: Known handle pairs; the second element of the matching pair
            becomes the mention token's first argument.

    Returns:
        The tokens in source order.
    """
    out: list[cross.Token] = []
    pending: list[str] = []

    def emit_pending_text() -> None:
        # Turn the accumulated plain characters into one text token.
        if pending:
            out.append(cross.TextToken("".join(pending)))
            pending.clear()

    cursor = 0
    length = len(text)

    while cursor < length:
        char = text[cursor]
        hit = None
        token = None

        if char == "[":
            hit = MD_INLINE_LINK.match(text, cursor)
            if hit:
                token = cross.LinkToken(hit.group(2), hit.group(1))
        elif char == "<":
            hit = MD_AUTOLINK.match(text, cursor)
            if hit:
                target = hit.group(1)
                token = cross.LinkToken(target, target)
        elif char == "#":
            hit = HASHTAG.match(text, cursor)
            if hit:
                name = hit.group(1)
                if name.lower() in tags:
                    token = cross.TagToken(name)
        elif char == "@":
            hit = FEDIVERSE_HANDLE.match(text, cursor)
            if hit:
                wanted = hit.group(0).strip()
                for pair in handles:
                    if wanted in pair:
                        # TODO: misskey doesn't provide a uri
                        token = cross.MentionToken(pair[1], "")
                        break

        if token is None:
            # Nothing above applied (or it was rejected): try a bare URL.
            hit = URL.match(text, cursor)
            if hit:
                address = hit.group(0)
                token = cross.LinkToken(address, address)

        if token is None:
            pending.append(char)
            cursor += 1
            continue

        emit_pending_text()
        out.append(token)
        cursor = hit.end()

    emit_pending_text()
    return out