# Social media crossposting tool (third iteration).
# Targets: Mastodon, Misskey, Bluesky.
1from dataclasses import replace 2import re 3import cross.fragments as f 4from util.html import HTMLToFragmentsParser 5 6URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) 7MD_INLINE_LINK = re.compile( 8 rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", 9 re.IGNORECASE, 10) 11MD_AUTOLINK = re.compile( 12 rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE 13) 14HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)") 15FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?") 16 17REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE] 18 19 20# TODO autolinks are broken by the html parser 21class MarkdownParser: 22 def parse(self, text: str) -> tuple[str, list[f.Fragment]]: 23 if not text: 24 return "", [] 25 26 html_parser = HTMLToFragmentsParser() 27 html_parser.feed(text) 28 markdown, fragments = html_parser.get_result() 29 30 markdown_bytes: bytes = markdown.encode("utf-8") 31 32 index: int = 0 33 total: int = len(markdown_bytes) 34 35 events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = [] 36 events.extend([(fg.start, fg.end, fg, "html") for fg in fragments]) 37 38 while index < total: 39 ch: int = markdown_bytes[index] 40 rmatch: re.Match[bytes] | None = None 41 kind = None 42 43 if ch == b"["[0]: 44 rmatch = MD_INLINE_LINK.match(markdown_bytes, index) 45 kind = "inline_link" 46 # elif ch == b"<"[0]: 47 # rmatch = MD_AUTOLINK.match(markdown_bytes, index) 48 # kind = "autolink" 49 elif ch == b"#"[0]: 50 rmatch = HASHTAG.match(markdown_bytes, index) 51 kind = "hashtag" 52 elif ch == b"@"[0]: 53 rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index) 54 kind = "mention" 55 else: 56 rmatch = URL.match(markdown_bytes, index) 57 kind = "url" 58 59 if rmatch: 60 start, end = rmatch.start(), rmatch.end() 61 if end == index: 62 index += 1 63 continue 64 events.append((start, end, rmatch, kind)) 65 index = end 66 continue 67 68 index += 1 69 
70 events.sort(key=lambda x: x[0]) 71 72 last_end: int = 0 73 for start, end, _, _ in events: 74 if start > end: 75 raise Exception(f"Invalid fragment position start={start}, end={end}") 76 if last_end > start: 77 raise Exception( 78 f"Overlapping text fragments at position end={last_end}, start={start}" 79 ) 80 last_end = end 81 82 ntext: bytearray = bytearray() 83 nfragments: list[f.Fragment] = [] 84 85 offset: int = 0 86 last_index: int = 0 87 88 for start, end, rmatch, event in events: 89 ntext.extend(markdown_bytes[last_index:start]) 90 91 if isinstance(rmatch, f.Fragment): 92 ntext.extend(markdown_bytes[start:end]) 93 nfg = replace(rmatch, start=start + offset, end=end + offset) 94 nfragments.append(nfg) 95 last_index = end 96 continue 97 98 nstart = start + offset 99 match event: 100 case "inline_link": 101 label_bytes: bytes = rmatch.group(1) 102 href_bytes: bytes = rmatch.group(2) 103 104 ntext.extend(label_bytes) 105 106 delta = len(label_bytes) - (end - start) 107 offset += delta 108 109 nend = nstart + len(label_bytes) 110 nfragments.append( 111 f.LinkFragment( 112 start=nstart, end=nend, url=href_bytes.decode("utf-8") 113 ) 114 ) 115 116 case "hashtag": 117 tag_bytes: bytes = rmatch.group(1) 118 ntext.extend(markdown_bytes[start:end]) 119 nend = end + offset 120 nfragments.append( 121 f.TagFragment( 122 start=nstart, end=nend, tag=tag_bytes.decode("utf-8") 123 ) 124 ) 125 126 case "mention": 127 mention_bytes: bytes = rmatch.group(0) 128 ntext.extend(markdown_bytes[start:end]) 129 130 mention_str = mention_bytes.decode("utf-8") 131 mention_str = ( 132 mention_str[1:] if mention_str.startswith("@") else mention_str 133 ) 134 135 nend = end + offset 136 nfragments.append( 137 f.MentionFragment(start=nstart, end=nend, uri=mention_str) 138 ) 139 140 case "url": 141 url_bytes: bytes = rmatch.group(0) 142 ntext.extend(markdown_bytes[start:end]) 143 nend = end + offset 144 nfragments.append( 145 f.LinkFragment( 146 start=nstart, end=nend, 
url=url_bytes.decode("utf-8") 147 ) 148 ) 149 150 case _: 151 pass 152 last_index = end 153 154 ntext.extend(markdown_bytes[last_index:]) 155 156 return ntext.decode("utf-8"), nfragments