social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
import re
from typing import Optional

import cross
import util.html_util as html_util
import util.util as util

# Absolute URL: "<scheme>://" or "mailto:" followed by any run of non-whitespace.
URL = re.compile(r'(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+', re.IGNORECASE)
# Markdown inline link "[label](href)"; group 1 is the label, group 2 the href.
MD_INLINE_LINK = re.compile(r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", re.IGNORECASE)
# Markdown autolink "<href>"; group 1 is the href.
MD_AUTOLINK = re.compile(r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE)
# "#tag" not preceded by a word character; group 1 is the tag without '#'.
HASHTAG = re.compile(r'(?<!\w)\#([\w]+)')
# "@user" or "@user@host.tld", not preceded by a word character or '@'.
FEDIVERSE_HANDLE = re.compile(r'(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?')

# Characters FEDIVERSE_HANDLE accepts inside a handle that are far more likely
# to be sentence punctuation when they appear at the very end of a match.
_HANDLE_TRAILING_PUNCTUATION = '.-'


def tokenize_markdown(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
    """Split post text (HTML mixed with markdown) into a list of tokens.

    The text is first run through ``html_util.HTMLPostTokenizer``; the
    resulting tokens are then post-processed:

    * text tokens are scanned again for markdown links, hashtags, mentions
      and bare URLs;
    * link tokens with no label, or for which ``util.canonical_label``
      returns a truthy value, are kept unchanged (presumably the label is
      then just a rendering of the href -- confirm against ``util``);
    * any other link token is rendered as ``[label](href)`` and passed
      through the markdown scanner;
    * every other token kind is passed through untouched.

    Args:
        text: Raw post body. An empty value yields an empty list.
        tags: Known hashtags; scanned tag text is lower-cased before the
            membership test, so entries are expected to be lower-case.
        handles: Known mentions as tuples of handle spellings; element 1 is
            the one stored in the resulting mention token.

    Returns:
        The tokens in document order.
    """
    if not text:
        return []

    tokenizer = html_util.HTMLPostTokenizer()
    tokenizer.mentions = handles
    tokenizer.tags = tags
    tokenizer.feed(text)
    html_tokens = tokenizer.get_tokens()

    tokens: list[cross.Token] = []

    for tk in html_tokens:
        if isinstance(tk, cross.TextToken):
            tokens.extend(__tokenize_md(tk.text, tags, handles))
        elif isinstance(tk, cross.LinkToken):
            if not tk.label or util.canonical_label(tk.label, tk.href):
                tokens.append(tk)
                continue

            tokens.extend(__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles))
        else:
            tokens.append(tk)

    return tokens


def _resolve_handle(
    raw: str, handles: list[tuple[str, str]]
) -> Optional[tuple[tuple[str, str], int]]:
    """Find the known handle that ``raw`` refers to.

    ``raw`` is tried verbatim first. Because the handle pattern also accepts
    '.' and '-', a mention at the end of a sentence ("hi @alice.") is matched
    together with the punctuation; when the verbatim lookup fails, trailing
    '.'/'-' characters are removed one at a time and the lookup is retried.

    Returns:
        ``(pair, consumed)`` where ``pair`` is the matching entry of
        ``handles`` and ``consumed`` is how many characters of ``raw`` belong
        to the handle, or ``None`` when nothing matches.
    """
    candidate = raw
    # A lone "@" is never a handle, so stop before trimming down to it.
    while len(candidate) > 1:
        for pair in handles:
            if candidate in pair:
                return pair, len(candidate)
        if candidate[-1] not in _HANDLE_TRAILING_PUNCTUATION:
            break
        candidate = candidate[:-1]
    return None


def __tokenize_md(text: str, tags: list[str], handles: list[tuple[str, str]]) -> list[cross.Token]:
    """Scan plain text left to right and emit text/link/tag/mention tokens.

    At each position the scanner tries, in this order: a markdown inline
    link, a markdown autolink, a known hashtag, a known mention and a bare
    URL. The first construct that matches is emitted as a token and skipped
    over; otherwise the current character is accumulated into the pending
    text run. Hashtags and mentions that are not listed in ``tags`` /
    ``handles`` stay part of the surrounding text.
    """
    index: int = 0
    total: int = len(text)
    buffer: list[str] = []

    tokens: list[cross.Token] = []

    def flush() -> None:
        """Emit the pending characters as one text token, if there are any."""
        nonlocal buffer
        if buffer:
            tokens.append(cross.TextToken(''.join(buffer)))
            buffer = []

    while index < total:
        char = text[index]

        if char == '[':
            md_inline = MD_INLINE_LINK.match(text, index)
            if md_inline:
                flush()
                label = md_inline.group(1)
                href = md_inline.group(2)
                tokens.append(cross.LinkToken(href, label))
                index = md_inline.end()
                continue

        if char == '<':
            md_auto = MD_AUTOLINK.match(text, index)
            if md_auto:
                flush()
                href = md_auto.group(1)
                tokens.append(cross.LinkToken(href, href))
                index = md_auto.end()
                continue

        if char == '#':
            tag = HASHTAG.match(text, index)
            if tag:
                tag_text = tag.group(1)
                if tag_text.lower() in tags:
                    flush()
                    tokens.append(cross.TagToken(tag_text))
                    index = tag.end()
                    continue

        if char == '@':
            handle = FEDIVERSE_HANDLE.match(text, index)
            if handle:
                resolved = _resolve_handle(handle.group(0), handles)
                if resolved is not None:
                    pair, consumed = resolved
                    flush()
                    tokens.append(cross.MentionToken(pair[1], ''))  # TODO: misskey doesn’t provide a uri
                    # Advance only past the handle itself so that trimmed
                    # trailing punctuation is kept as ordinary text.
                    index += consumed
                    continue

        # Bare URLs are attempted at every position, whatever the character.
        url = URL.match(text, index)
        if url:
            flush()
            href = url.group(0)
            tokens.append(cross.LinkToken(href, href))
            index = url.end()
            continue

        buffer.append(char)
        index += 1

    flush()
    return tokens