import re

import cross
import util.html_util as html_util
import util.util as util

# Absolute URL ("scheme://...") or "mailto:" address, running up to the next
# whitespace character.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)

# Markdown inline link "[label](href)": group 1 is the label, group 2 the href.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)

# Markdown autolink "<href>": group 1 is the href.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)

# NOTE(review): only the leading `(?` of this pattern survived in the reviewed
# copy of the file (the text that followed was lost). The remainder is
# reconstructed from how the pattern is used below -- it is matched at a "#"
# and group 1 must be the tag text without the "#". Confirm against the
# pristine source.
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")

# NOTE(review): this definition was missing entirely from the reviewed copy.
# Reconstructed from usage -- it is matched at an "@" and group 0 must be the
# full handle text. Confirm against the pristine source.
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")


# NOTE(review): the `def` line of this function was lost in the reviewed copy;
# only the return annotation and the body survived. The parameter names are
# the ones the body reads (`text`, `tags`, `handles`), and their types are
# taken from `__tokenize_md`. The function name and parameter order are
# presumed -- confirm against the callers.
def tokenize_markdown(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Tokenize post text that may contain both HTML and Markdown.

    The text is first run through ``html_util.HTMLPostTokenizer`` (configured
    with ``handles`` as its mentions and ``tags`` as its tags). The resulting
    tokens are then post-processed:

    * text tokens are re-scanned for Markdown by ``__tokenize_md``;
    * link tokens with an empty label, or for which
      ``util.canonical_label(label, href)`` is truthy, are kept unchanged;
    * every other link token is rendered as ``[label](href)`` and re-scanned
      by ``__tokenize_md``;
    * all remaining token types are kept unchanged.

    Returns an empty list when ``text`` is empty.
    """
    if not text:
        return []

    tokenizer = html_util.HTMLPostTokenizer()
    tokenizer.mentions = handles
    tokenizer.tags = tags
    tokenizer.feed(text)
    html_tokens = tokenizer.get_tokens()

    tokens: list[cross.Token] = []
    for tk in html_tokens:
        if isinstance(tk, cross.TextToken):
            tokens.extend(__tokenize_md(tk.text, tags, handles))
        elif isinstance(tk, cross.LinkToken):
            if not tk.label or util.canonical_label(tk.label, tk.href):
                tokens.append(tk)
                continue
            # Round-trip the link through the Markdown scanner.
            tokens.extend(__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles))
        else:
            tokens.append(tk)
    return tokens


def __tokenize_md(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Split ``text`` into tokens with a single left-to-right scan.

    At each position the scanner tries, in order:

    1. a Markdown inline link at ``[``  -> ``LinkToken(href, label)``
    2. a Markdown autolink at ``<``     -> ``LinkToken(href, href)``
    3. a hashtag at ``#``, accepted only when its lowercased text is in
       ``tags``                         -> ``TagToken(tag_text)``
    4. a handle at ``@``, accepted only when the matched text (stripped)
       equals an element of some pair in ``handles``
                                        -> ``MentionToken(pair[1], "")``
    5. a bare URL at any position       -> ``LinkToken(href, href)``

    A character that starts none of these is buffered; consecutive buffered
    characters are emitted as one ``TextToken`` just before the next
    non-text token and at the end of the input.
    """
    index: int = 0
    total: int = len(text)
    buffer: list[str] = []
    tokens: list[cross.Token] = []

    def flush() -> None:
        # Emit the pending plain text, if any, as a single TextToken.
        if buffer:
            tokens.append(cross.TextToken("".join(buffer)))
            buffer.clear()

    while index < total:
        char = text[index]

        if char == "[":
            md_inline = MD_INLINE_LINK.match(text, index)
            if md_inline:
                flush()
                label = md_inline.group(1)
                href = md_inline.group(2)
                tokens.append(cross.LinkToken(href, label))
                index = md_inline.end()
                continue

        if char == "<":
            md_auto = MD_AUTOLINK.match(text, index)
            if md_auto:
                flush()
                href = md_auto.group(1)
                tokens.append(cross.LinkToken(href, href))
                index = md_auto.end()
                continue

        if char == "#":
            tag = HASHTAG.match(text, index)
            if tag:
                tag_text = tag.group(1)
                # Unknown tags fall through and end up as plain text.
                if tag_text.lower() in tags:
                    flush()
                    tokens.append(cross.TagToken(tag_text))
                    index = tag.end()
                    continue

        if char == "@":
            handle = FEDIVERSE_HANDLE.match(text, index)
            if handle:
                stripped_handle = handle.group(0).strip()
                # First pair that contains the handle text as one of its elements.
                match = next(
                    (pair for pair in handles if stripped_handle in pair), None
                )
                if match:
                    flush()
                    # TODO: misskey doesn’t provide a uri
                    tokens.append(cross.MentionToken(match[1], ""))
                    index = handle.end()
                    continue

        # Bare URLs are attempted at every position, whatever the character.
        url = URL.match(text, index)
        if url:
            flush()
            href = url.group(0)
            tokens.append(cross.LinkToken(href, href))
            index = url.end()
            continue

        buffer.append(char)
        index += 1

    flush()
    return tokens