social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1import re
2
3import cross
4import util.html_util as html_util
5import util.util as util
6
# Bare URL: a scheme (a letter followed by letters, digits, '+', '.' or '-')
# plus "://", or the literal "mailto:", then everything up to the next
# whitespace character.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)

# Markdown inline link "[label](href)". Group 1 is the label, group 2 the
# href; the href must start with a scheme or "mailto:" and may be padded with
# whitespace inside the parentheses.
MD_INLINE_LINK = re.compile(r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", re.IGNORECASE)

# Markdown autolink "<href>". Group 1 is the href without the angle brackets.
MD_AUTOLINK = re.compile(r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE)

# "#tag" that is not directly preceded by a word character. Group 1 is the
# tag text without the '#'.
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")

# "@user" or "@user@host.tld" that is not directly preceded by a word
# character or another '@'. Group 1 is the user part, group 2 the optional
# host. Both parts accept '.' and '-', so a match can end in punctuation.
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
12
def tokenize_markdown(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
    """Tokenize a post that may mix HTML and markdown.

    The text is first fed through ``html_util.HTMLPostTokenizer`` (configured
    with the given mentions and tags). The resulting tokens are then refined:

    * text tokens are re-scanned with ``__tokenize_md``;
    * link tokens without a label, or for which ``util.canonical_label``
      returns a truthy value, are kept unchanged;
    * other link tokens are rendered as ``[label](href)`` and re-scanned with
      ``__tokenize_md``;
    * every other token type is passed through untouched.

    Args:
        text: Raw post body. An empty/falsy value yields an empty list.
        tags: Known hashtags, forwarded to both tokenizers.
        handles: Known mention pairs, forwarded to both tokenizers.

    Returns:
        The flat list of tokens in document order.
    """
    if not text:
        return []

    html_parser = html_util.HTMLPostTokenizer()
    html_parser.mentions = handles
    html_parser.tags = tags
    html_parser.feed(text)
    parsed = html_parser.get_tokens()

    result: list[cross.Token] = []

    for token in parsed:
        if isinstance(token, cross.TextToken):
            result.extend(__tokenize_md(token.text, tags, handles))
            continue

        if not isinstance(token, cross.LinkToken):
            result.append(token)
            continue

        # Only links with a non-canonical label are round-tripped through
        # markdown; canonical_label is not consulted for empty labels.
        has_custom_label = bool(token.label) and not util.canonical_label(token.label, token.href)
        if has_custom_label:
            markdown_link = f"[{token.label}]({token.href})"
            result.extend(__tokenize_md(markdown_link, tags, handles))
        else:
            result.append(token)

    return result
38
39
def __tokenize_md(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
    """Scan plain/markdown text and split it into typed tokens.

    The text is walked left to right. At each position the scanner tries, in
    order: a markdown inline link (``[label](href)``), a markdown autolink
    (``<href>``), a known hashtag, a known mention, and finally a bare URL.
    Anything that matches none of these is accumulated and emitted as
    ``cross.TextToken`` runs.

    Args:
        text: The text to scan.
        tags: Known hashtags. A ``#tag`` becomes a ``cross.TagToken`` only if
            its lowercased text is in this list (so entries are expected to
            be lowercase -- confirm against callers).
        handles: Known mention pairs. An ``@handle`` becomes a
            ``cross.MentionToken`` only if it equals one element of a pair;
            the token is built from the pair's second element.

    Returns:
        Tokens in the order they appear in ``text``.
    """
    index: int = 0
    total: int = len(text)
    buffer: list[str] = []

    tokens: list[cross.Token] = []

    def flush():
        # Emit the pending run of plain characters, if any, as one TextToken.
        nonlocal buffer
        if buffer:
            tokens.append(cross.TextToken(''.join(buffer)))
            buffer = []

    def find_handle(candidate: str):
        # First known pair containing the candidate verbatim, else None.
        return next((pair for pair in handles if candidate in pair), None)

    while index < total:
        char = text[index]

        if char == '[':
            md_inline = MD_INLINE_LINK.match(text, index)
            if md_inline:
                flush()
                label = md_inline.group(1)
                href = md_inline.group(2)
                tokens.append(cross.LinkToken(href, label))
                index = md_inline.end()
                continue

        if char == '<':
            md_auto = MD_AUTOLINK.match(text, index)
            if md_auto:
                flush()
                href = md_auto.group(1)
                tokens.append(cross.LinkToken(href, href))
                index = md_auto.end()
                continue

        if char == '#':
            tag = HASHTAG.match(text, index)
            if tag:
                tag_text = tag.group(1)
                if tag_text.lower() in tags:
                    flush()
                    tokens.append(cross.TagToken(tag_text))
                    index = tag.end()
                    continue

        if char == '@':
            handle = FEDIVERSE_HANDLE.match(text, index)
            if handle:
                candidate = handle.group(0)
                match = find_handle(candidate)

                if match is None:
                    # The handle pattern accepts '.' and '-', so a mention
                    # followed by sentence punctuation ("hi @user.") is
                    # captured as "@user." and misses the lookup. Retry
                    # without the trailing punctuation; the trimmed
                    # characters stay in the text.
                    trimmed = candidate.rstrip('.-')
                    if trimmed != candidate and len(trimmed) > 1:
                        match = find_handle(trimmed)
                        candidate = trimmed

                if match:
                    flush()
                    tokens.append(cross.MentionToken(match[1], ''))  # TODO: misskey doesn’t provide a uri
                    # The match starts at `index`, so this lands right after
                    # the (possibly trimmed) handle.
                    index += len(candidate)
                    continue

        # Fallback for every position, including failed special-character
        # matches above: a bare URL starting here.
        url = URL.match(text, index)
        if url:
            flush()
            href = url.group(0)
            tokens.append(cross.LinkToken(href, href))
            index = url.end()
            continue

        buffer.append(char)
        index += 1

    flush()
    return tokens