social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
1from dataclasses import replace 2import re 3import cross.fragments as f 4from util.html import HTMLToFragmentsParser 5 6URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) 7MD_INLINE_LINK = re.compile( 8 r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", 9 re.IGNORECASE, 10) 11MD_AUTOLINK = re.compile( 12 r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE 13) 14HASHTAG = re.compile(r"(?<!\w)\#([\w]+)") 15FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?") 16 17REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE] 18 19 20# TODO autolinks are broken by the html parser 21class MarkdownParser: 22 def parse(self, text: str) -> tuple[str, list[f.Fragment]]: 23 if not text: 24 return "", [] 25 26 html_parser = HTMLToFragmentsParser() 27 html_parser.feed(text) 28 markdown, fragments = html_parser.get_result() 29 30 index: int = 0 31 total: int = len(markdown) 32 33 # no match == processed fragments 34 events: list[tuple[int, int, re.Match[str] | f.Fragment, str]] = [] 35 events.extend([(fg.start, fg.end, fg, "html") for fg in fragments]) 36 while index < total: 37 ch = markdown[index] 38 rmatch = None 39 kind = None 40 41 if ch == "[": 42 rmatch = MD_INLINE_LINK.match(markdown, index) 43 kind = "inline_link" 44 # elif ch == '<': 45 # rmatch = MD_AUTOLINK.match(markdown, index) 46 # kind = "autolink" 47 elif ch == "#": 48 rmatch = HASHTAG.match(markdown, index) 49 kind = "hashtag" 50 elif ch == "@": 51 rmatch = FEDIVERSE_HANDLE.match(markdown, index) 52 kind = "mention" 53 else: 54 rmatch = URL.match(markdown, index) 55 kind = "url" 56 57 if rmatch: 58 start, end = rmatch.start(), rmatch.end() 59 if end == index: 60 index += 1 61 continue 62 events.append((start, end, rmatch, kind)) 63 index = end 64 continue 65 66 index += 1 67 68 events.sort(key=lambda x: x[0]) 69 70 # validate fragment positions 71 last_end: int = 0 72 for start, end, _, _ in events: 73 if start > end: 74 raise Exception(f"Invalid fragment position start={start}, end={end}") 75 if last_end > start: 76 raise Exception( 77 f"Overlapping text fragments at position end={last_end}, start={start}" 78 ) 79 last_end = end 80 81 ntext: list[str] = [] 82 nfragments: list[f.Fragment] = [] 83 84 offset: int = 0 85 last_index: int = 0 86 87 events.sort(key=lambda x: x[0]) 88 for start, end, rmatch, event in events: 89 ntext.append(markdown[last_index:start]) 90 91 if isinstance(rmatch, f.Fragment): 92 ntext.append(markdown[start:end]) 93 nfg = replace(rmatch, start=start + offset, end=end + offset) 94 nfragments.append(nfg) 95 last_index = end 96 continue 97 98 nstart = start + offset 99 nend = end + offset 100 match event: 101 case "inline_link": 102 label = rmatch.group(1) 103 href = rmatch.group(2) 104 ntext.append(label) 105 106 delta = len(label) - (end - start) 107 offset += delta 108 109 nfragments.append(f.LinkFragment(start=nstart, end=nstart + len(label), url=href)) 110 case "hashtag": 111 tag = rmatch.group(1) 112 ntext.append(markdown[start:end]) 113 nfragments.append(f.TagFragment(start=nstart, end=nend, tag=tag)) 114 case "mention": 115 mention = rmatch.group(0) 116 ntext.append(markdown[start:end]) 117 mention = mention[1:] if mention.startswith("@") else mention 118 nfragments.append(f.MentionFragment(start=nstart, end=nend, uri=mention)) 119 case "url": 120 url = rmatch.group(0) 121 ntext.append(markdown[start:end]) 122 nfragments.append(f.LinkFragment(start=nstart, end=nend, url=url)) 123 case _: 124 pass 125 last_index = end 126 ntext.append(markdown[last_index:]) 127 128 return ''.join(ntext), nfragments