social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
at master 3.6 kB view raw
import re

import cross
import util.html_util as html_util
import util.util as util

# A bare URL: either "<scheme>://" or "mailto:", followed by a run of
# non-whitespace characters.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)

# Markdown inline link "[label](url)". Group 1 is the label, group 2 the URL;
# whitespace is tolerated between the parentheses and the URL.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)

# Markdown autolink "<url>". Group 1 is the URL without the angle brackets.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)

# "#word" that is not glued to a preceding word character. Group 1 is the
# tag without the leading "#".
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")

# "@user" with an optional "@domain.tld" suffix, not preceded by a word
# character or another "@". Group 1 is the user part, group 2 the domain.
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")


def tokenize_markdown(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Turn post text into a flat list of tokens.

    The text is first run through ``html_util.HTMLPostTokenizer``; the tokens
    it yields are then post-processed:

    * text tokens are scanned again for markdown links, hashtags, mentions
      and bare URLs;
    * link tokens with an empty label, or a label that
      ``util.canonical_label`` accepts for the href, are kept untouched;
    * any other link token is rendered as ``[label](href)`` and scanned by
      the markdown pass;
    * every other token type is passed through unchanged.

    Args:
        text: Raw post body. An empty or falsy value yields ``[]``.
        tags: Known hashtags; a ``#tag`` in the text only becomes a tag token
            when its lower-cased name is in this list.
        handles: Known mention pairs; a handle in the text only becomes a
            mention token when it equals one element of a pair.

    Returns:
        The resulting tokens in document order.
    """
    if not text:
        return []

    parser = html_util.HTMLPostTokenizer()
    parser.mentions = handles
    parser.tags = tags
    parser.feed(text)

    result: list[cross.Token] = []

    for token in parser.get_tokens():
        if isinstance(token, cross.TextToken):
            result.extend(__tokenize_md(token.text, tags, handles))
            continue

        if not isinstance(token, cross.LinkToken):
            result.append(token)
            continue

        # Only links carrying a custom (non-canonical) label go through the
        # markdown pass; everything else is kept as delivered.
        if token.label and not util.canonical_label(token.label, token.href):
            markdown = f"[{token.label}]({token.href})"
            result.extend(__tokenize_md(markdown, tags, handles))
        else:
            result.append(token)

    return result


def __tokenize_md(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Scan *text* left to right and split it into tokens.

    At each position the scanner tries, in this order: a markdown inline
    link, a markdown autolink, a known hashtag, a known mention, and a bare
    URL. Characters that start none of these are accumulated and emitted as
    text tokens between the recognised items.

    Args:
        text: Plain text to scan.
        tags: Known hashtags (compared against the lower-cased tag name).
        handles: Known mention pairs (matched when the handle text equals one
            element of a pair; the second element goes into the token).

    Returns:
        Tokens in the order they occur in *text*.
    """
    out: list[cross.Token] = []
    pending: list[str] = []  # plain characters not yet emitted as a token
    pos = 0
    end = len(text)

    def flush_text() -> None:
        # Emit the accumulated plain characters as one text token.
        if pending:
            out.append(cross.TextToken("".join(pending)))
            pending.clear()

    while pos < end:
        ch = text[pos]

        if ch == "[" and (found := MD_INLINE_LINK.match(text, pos)):
            flush_text()
            out.append(cross.LinkToken(found.group(2), found.group(1)))
            pos = found.end()
            continue

        if ch == "<" and (found := MD_AUTOLINK.match(text, pos)):
            flush_text()
            target = found.group(1)
            out.append(cross.LinkToken(target, target))
            pos = found.end()
            continue

        if ch == "#" and (found := HASHTAG.match(text, pos)):
            name = found.group(1)
            # Unknown hashtags are left as plain text.
            if name.lower() in tags:
                flush_text()
                out.append(cross.TagToken(name))
                pos = found.end()
                continue

        if ch == "@" and (found := FEDIVERSE_HANDLE.match(text, pos)):
            wanted = found.group(0).strip()
            known = next((pair for pair in handles if wanted in pair), None)
            # Unknown handles are left as plain text.
            if known:
                flush_text()
                # TODO: misskey doesn’t provide a uri
                out.append(cross.MentionToken(known[1], ""))
                pos = found.end()
                continue

        # Reached for every character that did not start one of the
        # constructs above, including failed attempts.
        if found := URL.match(text, pos):
            flush_text()
            target = found.group(0)
            out.append(cross.LinkToken(target, target))
            pos = found.end()
            continue

        pending.append(ch)
        pos += 1

    flush_text()
    return out