import re import cross.fragments as f from util.html import HTMLToFragmentsParser URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE) MD_INLINE_LINK = re.compile( r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)", re.IGNORECASE, ) MD_AUTOLINK = re.compile( r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE ) HASHTAG = re.compile(r"(? tuple[str, list[f.Fragment]]: if not text: return "", [] html_parser = HTMLToFragmentsParser() html_parser.feed(text) markdown, fragments = html_parser.get_result() index: int = 0 total: int = len(markdown) # no match == processed fragments events: list[tuple[int, int, re.Match[str] | None, str]] = [] events.extend([(fg.start, fg.end, None, "html") for fg in fragments]) while index < total: ch = markdown[index] rmatch = None kind = None if ch == "[": rmatch = MD_INLINE_LINK.match(markdown, index) kind = "inline_link" # elif ch == '<': # rmatch = MD_AUTOLINK.match(markdown, index) # kind = "autolink" elif ch == "#": rmatch = HASHTAG.match(markdown, index) kind = "hashtag" elif ch == "@": rmatch = FEDIVERSE_HANDLE.match(markdown, index) kind = "mention" else: rmatch = URL.match(markdown, index) kind = "url" if rmatch: start, end = rmatch.start(), rmatch.end() if end == index: index += 1 continue events.append((start, end, rmatch, kind)) index = end continue index += 1 events.sort(key=lambda x: x[0]) # validate fragment positions last_end: int = 0 for start, end, _, _ in events: if start > end: raise Exception(f"Invalid fragment position start={start}, end={end}") if last_end > start: raise Exception( f"Overlapping text fragments at position end={last_end}, start={start}" ) last_end = end def update_fragments(start: int, s, offset: int): nonlocal fragments for fg in fragments: if fg != s and fg.start >= start: fg.start += offset fg.end += offset new_text = "" last_pos = 0 for start, end, rmatch, event in events: if start > last_pos: new_text += markdown[last_pos:start] if not rmatch: new_text += markdown[start:end] last_pos = end continue match event: case "inline_link": label = rmatch.group(1) href = rmatch.group(2) fg = f.LinkFragment(start=start, end=start + len(label), url=href) fragments.append(fg) update_fragments(start, fg, -(end - (start + len(label)))) new_text += label # case "autolink": # url = rmatch.group(0) # fg = f.LinkFragment(start=start, end=end - 2, url=url) # fragments.append(fg) # update_fragments(start, fg, -2) # new_text += url case "hashtag": tag = rmatch.group(0) fragments.append( f.TagFragment( start=start, end=end, tag=tag[1:] if tag.startswith("#") else tag, ) ) new_text += markdown[start:end] case "mention": mention = rmatch.group(0) fragments.append( f.MentionFragment( start=start, end=end, uri=mention[1:] if mention.startswith("@") else mention, ) ) new_text += markdown[start:end] case "url": url = rmatch.group(0) fragments.append(f.LinkFragment(start=start, end=end, url=url)) new_text += markdown[start:end] case _: pass last_pos = end if last_pos < len(markdown): new_text += markdown[last_pos:] return new_text, fragments