# Social media crossposting tool (third iteration).
# Targets: Mastodon, Misskey, Bluesky.
1from dataclasses import replace 2import re 3import cross.fragments as f 4from util.html import HTMLToFragmentsParser 5 6URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) 7MD_INLINE_LINK = re.compile( 8 rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", 9 re.IGNORECASE, 10) 11MD_AUTOLINK = re.compile( 12 rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE 13) 14HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)") 15FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?") 16 17REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE] 18 19 20# TODO autolinks are broken by the html parser 21class MarkdownParser: 22 def parse(self, text: str) -> tuple[str, list[f.Fragment]]: 23 if not text: 24 return "", [] 25 26 html_parser = HTMLToFragmentsParser() 27 html_parser.feed(text) 28 markdown, fragments = html_parser.get_result() 29 30 markdown_bytes: bytes = markdown.encode("utf-8") 31 32 index: int = 0 33 total: int = len(markdown_bytes) 34 35 events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = [] 36 events.extend([(fg.start, fg.end, fg, "html") for fg in fragments]) 37 38 while index < total: 39 ch: int = markdown_bytes[index] 40 rmatch: re.Match[bytes] | None = None 41 kind = None 42 43 if ch == b"["[0]: 44 rmatch = MD_INLINE_LINK.match(markdown_bytes, index) 45 kind = "inline_link" 46 # elif ch == b"<"[0]: 47 # rmatch = MD_AUTOLINK.match(markdown_bytes, index) 48 # kind = "autolink" 49 elif ch == b"#"[0]: 50 rmatch = HASHTAG.match(markdown_bytes, index) 51 kind = "hashtag" 52 elif ch == b"@"[0]: 53 rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index) 54 kind = "mention" 55 else: 56 rmatch = URL.match(markdown_bytes, index) 57 kind = "url" 58 59 if rmatch: 60 start, end = rmatch.start(), rmatch.end() 61 if end == index: 62 index += 1 63 continue 64 events.append((start, end, rmatch, kind)) 65 index = end 66 continue 67 68 index += 1 69 
70 events.sort(key=lambda x: x[0]) 71 72 last_end: int = 0 73 for start, end, _, _ in events: 74 if start > end: 75 raise Exception(f"Invalid fragment position start={start}, end={end}") 76 if last_end > start: 77 raise Exception( 78 f"Overlapping text fragments at position end={last_end}, start={start}" 79 ) 80 last_end = end 81 82 ntext: bytearray = bytearray() 83 nfragments: list[f.Fragment] = [] 84 85 offset: int = 0 86 last_index: int = 0 87 88 for start, end, rmatch, event in events: 89 ntext.extend(markdown_bytes[last_index:start]) 90 91 if isinstance(rmatch, f.Fragment): 92 ntext.extend(markdown_bytes[start:end]) 93 nfg = replace(rmatch, start=start + offset, end=end + offset) 94 nfragments.append(nfg) 95 last_index = end 96 continue 97 98 nstart = start + offset 99 match event: 100 case "inline_link": 101 label_bytes: bytes = rmatch.group(1) 102 href_bytes: bytes = rmatch.group(2) 103 104 ntext.extend(label_bytes) 105 106 delta = len(label_bytes) - (end - start) 107 offset += delta 108 109 nend = nstart + len(label_bytes) 110 nfragments.append( 111 f.LinkFragment( 112 start=nstart, end=nend, url=href_bytes.decode("utf-8") 113 ) 114 ) 115 116 case "hashtag": 117 tag_bytes: bytes = rmatch.group(1) 118 ntext.extend(markdown_bytes[start:end]) 119 nend = end + offset 120 nfragments.append( 121 f.TagFragment( 122 start=nstart, end=nend, tag=tag_bytes.decode("utf-8") 123 ) 124 ) 125 126 case "mention": 127 mention_bytes: bytes = rmatch.group(0) 128 ntext.extend(markdown_bytes[start:end]) 129 130 mention_str = mention_bytes.decode("utf-8") 131 mention_str = ( 132 mention_str[1:] if mention_str.startswith("@") else mention_str 133 ) 134 135 nend = end + offset 136 nfragments.append( 137 f.MentionFragment(start=nstart, end=nend, uri=mention_str) 138 ) 139 140 case "url": 141 url_bytes: bytes = rmatch.group(0) 142 ntext.extend(markdown_bytes[start:end]) 143 nend = end + offset 144 nfragments.append( 145 f.LinkFragment( 146 start=nstart, end=nend, 
url=url_bytes.decode("utf-8") 147 ) 148 ) 149 150 case _: 151 pass 152 last_index = end 153 154 ntext.extend(markdown_bytes[last_index:]) 155 156 return ntext.decode("utf-8"), nfragments