import re

from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
from util.html import HTMLToTokensParser
from util.splitter import canonical_label

# Bare URL: any "scheme://" or "mailto:" followed by non-whitespace.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)

# Markdown inline link "[label](href)"; href restricted to scheme:// or mailto:.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)

# Markdown autolink "<href>"; href restricted to scheme:// or mailto:.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)

# NOTE(review): the two patterns below were corrupted in the copy under review
# (the span between "(?<" in HASHTAG and a later ">" was stripped as if it were
# an HTML tag, taking the rest of HASHTAG, the whole FEDIVERSE_HANDLE
# definition, and the class header with it). They are reconstructed here from
# how the matcher loop uses them — confirm both against the original source.
# Hashtag: "#word" not preceded by a word character.
HASHTAG = re.compile(r"(?<!\w)#(\w+)")
# Fediverse handle: "@user" or "@user@host.tld".
FEDIVERSE_HANDLE = re.compile(r"@[\w.-]+(?:@[\w-]+(?:\.[\w-]+)+)?")


# NOTE(review): the original class name was lost in the same corruption —
# restore it before merging; callers reference it by that name.
class MarkdownTokenizer:
    """Turn post text into a flat list of Tokens.

    The text is first split by HTMLToTokensParser into HTML-level tokens;
    plain-text runs (and link labels that are not merely the canonical
    rendering of their href) are then re-scanned for Markdown inline
    links, autolinks, bare URLs, known hashtags and known fediverse
    handles.
    """

    def tokenize(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Tokenize *text* into cross-posting Tokens.

        text    -- raw (possibly HTML-bearing) post text; empty/None yields [].
        tags    -- hashtag names (compared against the lowercased match)
                   that should become TagTokens.
        handles -- (handle, username) pairs; a matched handle equal to
                   either element becomes a MentionToken.
        """
        if not text:
            return []
        tokenizer = HTMLToTokensParser()
        tokenizer.feed(text)
        html_tokens = tokenizer.get_result()
        tokens: list[Token] = []
        for tk in html_tokens:
            if isinstance(tk, TextToken):
                tokens.extend(self.__tokenize_md(tk.text, tags, handles))
            elif isinstance(tk, LinkToken):
                # Keep the link as-is when it has no label, or when the label
                # is just the canonical form of the href; otherwise the label
                # may carry Markdown of its own, so round-trip it through the
                # Markdown scanner as "[label](href)".
                if not tk.label or canonical_label(tk.label, tk.href):
                    tokens.append(tk)
                    continue
                tokens.extend(
                    self.__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles)
                )
            else:
                tokens.append(tk)
        return tokens

    def __tokenize_md(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Scan plain text for Markdown links, autolinks, bare URLs,
        known hashtags and known fediverse handles.

        Characters that start no recognized construct accumulate in a
        buffer and are flushed as a single TextToken before each
        structured token (and once at the end).
        """
        index: int = 0
        total: int = len(text)
        buffer: list[str] = []
        tokens: list[Token] = []

        def flush():
            # Emit accumulated plain text (if any) before a structured token.
            nonlocal buffer
            if buffer:
                tokens.append(TextToken(text="".join(buffer)))
                buffer = []

        while index < total:
            if text[index] == "[":
                md_inline = MD_INLINE_LINK.match(text, index)
                if md_inline:
                    flush()
                    label = md_inline.group(1)
                    href = md_inline.group(2)
                    tokens.append(LinkToken(href=href, label=label))
                    index = md_inline.end()
                    continue
            if text[index] == "<":
                md_auto = MD_AUTOLINK.match(text, index)
                if md_auto:
                    flush()
                    href = md_auto.group(1)
                    tokens.append(LinkToken(href=href, label=None))
                    index = md_auto.end()
                    continue
            if text[index] == "#":
                tag = HASHTAG.match(text, index)
                if tag:
                    tag_text = tag.group(1)
                    # Only hashtags the caller declared become TagTokens;
                    # anything else falls through and is kept as plain text.
                    if tag_text.lower() in tags:
                        flush()
                        tokens.append(TagToken(tag=tag_text))
                        index = tag.end()
                        continue
            if text[index] == "@":
                handle = FEDIVERSE_HANDLE.match(text, index)
                if handle:
                    handle_text = handle.group(0)
                    stripped_handle = handle_text.strip()
                    # A pair matches when the handle text equals either of
                    # its two elements (handle or username).
                    match = next(
                        (pair for pair in handles if stripped_handle in pair),
                        None,
                    )
                    if match:
                        flush()
                        tokens.append(
                            MentionToken(username=match[1], uri=None)
                        )  # TODO: misskey doesn't provide a uri
                        index = handle.end()
                        continue
            url = URL.match(text, index)
            if url:
                flush()
                href = url.group(0)
                tokens.append(LinkToken(href=href, label=None))
                index = url.end()
                continue
            # No construct starts here: keep the character as plain text.
            buffer.append(text[index])
            index += 1
        flush()
        return tokens