social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
1import re 2import cross.fragments as f 3from util.html import HTMLToFragmentsParser 4 5URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) 6MD_INLINE_LINK = re.compile( 7 r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", 8 re.IGNORECASE, 9) 10MD_AUTOLINK = re.compile( 11 r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE 12) 13HASHTAG = re.compile(r"(?<!\w)\#([\w]+)") 14FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?") 15 16REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE] 17 18 19# TODO autolinks are broken by the html parser 20class MarkdownParser: 21 def parse(self, text: str) -> tuple[str, list[f.Fragment]]: 22 if not text: 23 return "", [] 24 25 html_parser = HTMLToFragmentsParser() 26 html_parser.feed(text) 27 markdown, fragments = html_parser.get_result() 28 29 index: int = 0 30 total: int = len(markdown) 31 32 # no match == processed fragments 33 events: list[tuple[int, int, re.Match[str] | None, str]] = [] 34 events.extend([(fg.start, fg.end, None, "html") for fg in fragments]) 35 while index < total: 36 ch = markdown[index] 37 rmatch = None 38 kind = None 39 40 if ch == "[": 41 rmatch = MD_INLINE_LINK.match(markdown, index) 42 kind = "inline_link" 43 # elif ch == '<': 44 # rmatch = MD_AUTOLINK.match(markdown, index) 45 # kind = "autolink" 46 elif ch == "#": 47 rmatch = HASHTAG.match(markdown, index) 48 kind = "hashtag" 49 elif ch == "@": 50 rmatch = FEDIVERSE_HANDLE.match(markdown, index) 51 kind = "mention" 52 else: 53 rmatch = URL.match(markdown, index) 54 kind = "url" 55 56 if rmatch: 57 start, end = rmatch.start(), rmatch.end() 58 if end == index: 59 index += 1 60 continue 61 events.append((start, end, rmatch, kind)) 62 index = end 63 continue 64 65 index += 1 66 67 events.sort(key=lambda x: x[0]) 68 69 # validate fragment positions 70 last_end: int = 0 71 for start, end, _, _ in events: 72 if start > end: 73 raise Exception(f"Invalid fragment position start={start}, end={end}") 74 if last_end > start: 75 raise Exception( 76 f"Overlapping text fragments at position end={last_end}, start={start}" 77 ) 78 last_end = end 79 80 def update_fragments(start: int, s, offset: int): 81 nonlocal fragments 82 83 for fg in fragments: 84 if fg != s and fg.start >= start: 85 fg.start += offset 86 fg.end += offset 87 88 new_text = "" 89 last_pos = 0 90 for start, end, rmatch, event in events: 91 if start > last_pos: 92 new_text += markdown[last_pos:start] 93 94 if not rmatch: 95 new_text += markdown[start:end] 96 last_pos = end 97 continue 98 99 match event: 100 case "inline_link": 101 label = rmatch.group(1) 102 href = rmatch.group(2) 103 fg = f.LinkFragment(start=start, end=start + len(label), url=href) 104 fragments.append(fg) 105 update_fragments(start, fg, -(end - (start + len(label)))) 106 new_text += label 107 # case "autolink": 108 # url = rmatch.group(0) 109 # fg = f.LinkFragment(start=start, end=end - 2, url=url) 110 # fragments.append(fg) 111 # update_fragments(start, fg, -2) 112 # new_text += url 113 case "hashtag": 114 tag = rmatch.group(0) 115 fragments.append( 116 f.TagFragment( 117 start=start, 118 end=end, 119 tag=tag[1:] if tag.startswith("#") else tag, 120 ) 121 ) 122 new_text += markdown[start:end] 123 case "mention": 124 mention = rmatch.group(0) 125 fragments.append( 126 f.MentionFragment( 127 start=start, 128 end=end, 129 uri=mention[1:] if mention.startswith("@") else mention, 130 ) 131 ) 132 new_text += markdown[start:end] 133 case "url": 134 url = rmatch.group(0) 135 fragments.append(f.LinkFragment(start=start, end=end, url=url)) 136 new_text += markdown[start:end] 137 case _: 138 pass 139 last_pos = end 140 if last_pos < len(markdown): 141 new_text += markdown[last_pos:] 142 143 return new_text, fragments