import re
import cross
import util.html_util as html_util
import util.util as util
# Bare URL: a "scheme://" prefix (or "mailto:") followed by a run of
# non-whitespace characters.  Matching is case-insensitive.
URL = re.compile(
    r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+",
    flags=re.IGNORECASE,
)

# Markdown inline link "[label](target)".  Group 1 captures the label and
# group 2 the target, which must start with "scheme://" or "mailto:".
# Whitespace between the parentheses and the target is tolerated.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    flags=re.IGNORECASE,
)

# Markdown autolink "<target>".  Group 1 captures the target without the
# surrounding angle brackets.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>",
    flags=re.IGNORECASE,
)
# NOTE(review): the next line looks damaged in this copy of the file -- the
# HASHTAG pattern literal is never closed and runs straight into what appears
# to be the tail of a function signature ("-> list[cross.Token]:").
# FEDIVERSE_HANDLE, which is used further down in this module, is not defined
# anywhere in the visible text, so its definition may have been lost at the
# same spot.  The code is left untouched here; restore the original line(s)
# from version control before relying on this module.
HASHTAG = re.compile(r"(? list[cross.Token]:
# Falsy input (e.g. an empty string) yields no tokens.
if not text:
return []
# First pass: let the HTML tokenizer split the post, after handing it the
# handles (as `mentions`) and the tags.
tokenizer = html_util.HTMLPostTokenizer()
tokenizer.mentions = handles
tokenizer.tags = tags
tokenizer.feed(text)
html_tokens = tokenizer.get_tokens()
# Second pass: run the markdown-style tokenizer over the parts of the HTML
# result that may still contain markup.
tokens: list[cross.Token] = []
for tk in html_tokens:
if isinstance(tk, cross.TextToken):
# Plain text is re-scanned by __tokenize_md and replaced by its tokens.
tokens.extend(__tokenize_md(tk.text, tags, handles))
elif isinstance(tk, cross.LinkToken):
# Links without a label, or for which util.canonical_label() returns a
# truthy value, are kept unchanged.
# NOTE(review): presumably canonical_label() reports that the label is
# just a rendering of the href -- confirm against util.util.
if not tk.label or util.canonical_label(tk.label, tk.href):
tokens.append(tk)
continue
# Any other link is rendered as "[label](href)" and re-scanned by
# __tokenize_md.
tokens.extend(__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles))
else:
# Every other token type passes through untouched.
tokens.append(tk)
return tokens
def __tokenize_md(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Split *text* into plain-text, link, hashtag and mention tokens.

    The input is scanned left to right.  At each position the syntax
    introduced by the current character is tried first (``[`` inline link,
    ``<`` autolink, ``#`` hashtag, ``@`` handle), then a bare URL.  When
    nothing matches, the character is kept as plain text.

    Args:
        text: Raw text to scan.
        tags: Known hashtags; a ``#tag`` becomes a token only when its
            lowercased name is found in this collection.
        handles: Known handle pairs; an ``@handle`` becomes a token only
            when the matched text (whitespace-stripped) is an element of
            one of the pairs.

    Returns:
        The tokens in input order.  Consecutive unmatched characters are
        merged into a single ``cross.TextToken``.
    """
    produced: list[cross.Token] = []
    pending: list[str] = []

    def close_text_run() -> None:
        # Emit the buffered plain characters as one TextToken so that the
        # token order follows the input order.
        if pending:
            produced.append(cross.TextToken("".join(pending)))
            pending.clear()

    # Each take_* helper tries one syntax at `start`.  On success it appends
    # the token and returns the position just past the match; otherwise it
    # returns None and leaves all state untouched.

    def take_inline_link(start: int):
        hit = MD_INLINE_LINK.match(text, start)
        if not hit:
            return None
        close_text_run()
        produced.append(cross.LinkToken(hit.group(2), hit.group(1)))
        return hit.end()

    def take_autolink(start: int):
        hit = MD_AUTOLINK.match(text, start)
        if not hit:
            return None
        close_text_run()
        target = hit.group(1)
        # An autolink has no separate label, so the target doubles as one.
        produced.append(cross.LinkToken(target, target))
        return hit.end()

    def take_hashtag(start: int):
        hit = HASHTAG.match(text, start)
        if not hit:
            return None
        name = hit.group(1)
        # Unknown hashtags stay part of the surrounding plain text.
        if name.lower() not in tags:
            return None
        close_text_run()
        produced.append(cross.TagToken(name))
        return hit.end()

    def take_mention(start: int):
        hit = FEDIVERSE_HANDLE.match(text, start)
        if not hit:
            return None
        wanted = hit.group(0).strip()
        known = next(filter(lambda pair: wanted in pair, handles), None)
        # Unknown handles stay part of the surrounding plain text.
        if not known:
            return None
        close_text_run()
        # TODO: misskey doesn't provide a uri
        produced.append(cross.MentionToken(known[1], ""))
        return hit.end()

    def take_bare_url(start: int):
        hit = URL.match(text, start)
        if not hit:
            return None
        close_text_run()
        address = hit.group(0)
        produced.append(cross.LinkToken(address, address))
        return hit.end()

    by_marker = {
        "[": take_inline_link,
        "<": take_autolink,
        "#": take_hashtag,
        "@": take_mention,
    }

    cursor = 0
    length = len(text)
    while cursor < length:
        char = text[cursor]
        marker_handler = by_marker.get(char)
        advanced = marker_handler(cursor) if marker_handler is not None else None
        if advanced is None:
            # No marker-specific syntax applied here; try a bare URL.
            advanced = take_bare_url(cursor)
        if advanced is None:
            # Still nothing: keep the character as plain text.
            pending.append(char)
            advanced = cursor + 1
        cursor = advanced

    close_text_run()
    return produced