Social media crossposting tool. Third time's the charm.
mastodon
misskey
crossposting
bluesky
1import re
2
3from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
4from util.html import HTMLToTokensParser
5from util.splitter import canonical_label
6
# Absolute URL with an explicit scheme ("scheme://...") or a mailto: link,
# greedily consumed up to the next whitespace character.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link: [label](href). The href must carry an explicit scheme
# (or mailto:) and may be padded with whitespace inside the parentheses.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink: <href> with an explicit scheme (or mailto:).
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# "#word", not preceded by a word character (so "a#b" is not a hashtag).
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
# Fediverse mention: "@user" or "@user@host.tld", not preceded by a word
# character or another "@". The host part, when present, must contain a dot.
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")

# All token-level patterns recognized by MarkdownParser.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
19
20
# TODO autolinks are broken by the html parser
class MarkdownParser:
    """Tokenize HTML-with-markdown-remnants post text into a flat token list.

    The input is first run through ``HTMLToTokensParser``; plain-text tokens
    and non-canonical link tokens are then re-scanned character by character
    for markdown inline links, autolinks, bare URLs, hashtags and fediverse
    mentions.
    """

    def parse(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Parse *text* into tokens.

        Args:
            text: Raw post body (HTML possibly containing markdown).
            tags: Known hashtags. The matched tag text is lowercased before
                the membership test, so entries are assumed to already be
                lowercase — TODO confirm against callers.
            handles: Known mention handles as pairs; a mention is emitted
                when the matched "@user[@host]" string equals either element
                of a pair (first matching pair wins).

        Returns:
            Flat list of Text/Link/Tag/Mention tokens; ``[]`` for empty input.
        """
        if not text:
            return []

        tokenizer = HTMLToTokensParser()
        tokenizer.feed(text)

        tokens: list[Token] = []
        for tk in tokenizer.get_result():
            if isinstance(tk, TextToken):
                tokens.extend(self.__tokenize_md(tk.text, tags, handles))
            elif isinstance(tk, LinkToken):
                # A label-less link, or one whose label is just the URL
                # itself, passes through untouched; any other label may hide
                # markdown, so rebuild the markdown form and re-scan it.
                # NOTE(review): a label containing "]" will not re-parse as
                # an inline link and degrades to text plus a bare URL token.
                if not tk.label or canonical_label(tk.label, tk.href):
                    tokens.append(tk)
                else:
                    tokens.extend(
                        self.__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles)
                    )
            else:
                tokens.append(tk)

        return tokens

    def __tokenize_md(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Single left-to-right pass over *text*, emitting tokens in order.

        At each position, the cheap first-character checks gate the regex
        probes; a bare URL may start anywhere. Characters claimed by no
        token accumulate in a buffer that is flushed as one TextToken.
        """
        index = 0
        total = len(text)
        buffer: list[str] = []
        tokens: list[Token] = []

        # Hoisted lookup structures: O(1) membership instead of rescanning
        # the lists at every candidate position. Semantics are unchanged:
        # set membership equals list membership, and setdefault keeps the
        # first pair containing a handle, exactly like the former next().
        tag_set = set(tags)
        handle_map: dict[str, tuple[str, str]] = {}
        for pair in handles:
            for h in pair:
                handle_map.setdefault(h, pair)

        def flush():
            # Emit any buffered plain text as a single TextToken.
            nonlocal buffer
            if buffer:
                tokens.append(TextToken(text="".join(buffer)))
                buffer = []

        while index < total:
            char = text[index]

            if char == "[":
                md_inline = MD_INLINE_LINK.match(text, index)
                if md_inline:
                    flush()
                    tokens.append(
                        LinkToken(href=md_inline.group(2), label=md_inline.group(1))
                    )
                    index = md_inline.end()
                    continue

            if char == "<":
                md_auto = MD_AUTOLINK.match(text, index)
                if md_auto:
                    flush()
                    tokens.append(LinkToken(href=md_auto.group(1), label=None))
                    index = md_auto.end()
                    continue

            if char == "#":
                tag = HASHTAG.match(text, index)
                # Unknown hashtags fall through and are kept as plain text.
                if tag and tag.group(1).lower() in tag_set:
                    flush()
                    tokens.append(TagToken(tag=tag.group(1)))
                    index = tag.end()
                    continue

            if char == "@":
                handle = FEDIVERSE_HANDLE.match(text, index)
                if handle:
                    match = handle_map.get(handle.group(0))
                    if match:
                        flush()
                        tokens.append(
                            MentionToken(username=match[1], uri=None)
                        )  # TODO: misskey doesn't provide a uri
                        index = handle.end()
                        continue

            # Fallback: a bare URL can start at any position.
            url = URL.match(text, index)
            if url:
                flush()
                tokens.append(LinkToken(href=url.group(0), label=None))
                index = url.end()
                continue

            # Nothing starts here; keep the character as plain text.
            buffer.append(char)
            index += 1

        flush()
        return tokens