# Social media crossposting tool ("third time's the charm").
# Crossposts between Mastodon, Misskey, and Bluesky.
import re

from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
from util.html import HTMLToTokensParser
from util.splitter import canonical_label

# Compiled patterns for the inline markdown scanner below.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")

REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]


# TODO autolinks are broken by the html parser
class MarkdownParser:
    """Turn markdown-flavoured post text into a flat token stream.

    The input is first run through the project's HTML tokenizer; any plain
    text (and any HTML link whose label is not just its own URL) is then
    re-scanned for markdown links, autolinks, hashtags, and fediverse
    handles.
    """

    def parse(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Tokenize *text*.

        Args:
            text: Raw post body (may contain HTML markup).
            tags: Lowercased hashtag names that should become TagTokens.
            handles: Known mention pairs; a scanned handle matching either
                element of a pair becomes a MentionToken for ``pair[1]``.

        Returns:
            The combined list of tokens; empty input yields an empty list.
        """
        if not text:
            return []

        html_parser = HTMLToTokensParser()
        html_parser.feed(text)

        result: list[Token] = []
        for token in html_parser.get_result():
            if isinstance(token, TextToken):
                result.extend(self.__tokenize_md(token.text, tags, handles))
            elif isinstance(token, LinkToken):
                # A link whose label is missing or canonical (i.e. just the
                # URL itself) passes through untouched; otherwise rebuild it
                # as markdown so the label gets scanned too.
                if not token.label or canonical_label(token.label, token.href):
                    result.append(token)
                else:
                    result.extend(
                        self.__tokenize_md(
                            f"[{token.label}]({token.href})", tags, handles
                        )
                    )
            else:
                result.append(token)

        return result

    def __tokenize_md(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Left-to-right scan of *text* for inline markdown constructs.

        Characters not consumed by any pattern accumulate into a pending
        buffer that is emitted as a single TextToken whenever a structured
        token is produced (and once more at the end).
        """
        out: list[Token] = []
        pending: list[str] = []
        pos = 0
        end = len(text)

        def emit_text() -> None:
            # Flush any buffered plain text as one TextToken.
            nonlocal pending
            if pending:
                out.append(TextToken(text="".join(pending)))
                pending = []

        while pos < end:
            ch = text[pos]

            if ch == "[":
                m = MD_INLINE_LINK.match(text, pos)
                if m:
                    emit_text()
                    out.append(LinkToken(href=m.group(2), label=m.group(1)))
                    pos = m.end()
                    continue

            if ch == "<":
                m = MD_AUTOLINK.match(text, pos)
                if m:
                    emit_text()
                    out.append(LinkToken(href=m.group(1), label=None))
                    pos = m.end()
                    continue

            if ch == "#":
                m = HASHTAG.match(text, pos)
                # Only whitelisted tags become TagTokens; anything else
                # falls through and is kept as plain text.
                if m and m.group(1).lower() in tags:
                    emit_text()
                    out.append(TagToken(tag=m.group(1)))
                    pos = m.end()
                    continue

            if ch == "@":
                m = FEDIVERSE_HANDLE.match(text, pos)
                if m:
                    mention = m.group(0).strip()
                    pair = next((p for p in handles if mention in p), None)
                    if pair:
                        emit_text()
                        out.append(
                            MentionToken(username=pair[1], uri=None)
                        )  # TODO: misskey doesn’t provide a uri
                        pos = m.end()
                        continue

            m = URL.match(text, pos)
            if m:
                emit_text()
                out.append(LinkToken(href=m.group(0), label=None))
                pos = m.end()
                continue

            pending.append(ch)
            pos += 1

        emit_text()
        return out