from dataclasses import replace
import re

import cross.fragments as f
from util.html import HTMLToFragmentsParser

# All patterns below are compiled against bytes, so every match offset they
# produce is a byte offset into UTF-8 encoded text, not a character index.

# Bare URL: a "scheme://" prefix or "mailto:", followed by a run of
# non-whitespace bytes.
URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link "[label](href)": group 1 is the label, group 2 the href.
# The href must itself start with "scheme://" or "mailto:".
MD_INLINE_LINK = re.compile(
    rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink "<href>": group 1 is the href. The only use of this
# pattern visible here is commented out in the scan loop below.
MD_AUTOLINK = re.compile(
    rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# NOTE(review): the text between `(?` and `tuple[...]` on the next line is
# missing from this view. The lost span presumably held the rest of the
# HASHTAG pattern, the FEDIVERSE_HANDLE pattern used further down, and the
# `def` header of the function whose body follows. Restore it from the
# original file; the line below is not valid Python as it stands.
HASHTAG = re.compile(rb"(? tuple[str, list[f.Fragment]]:
    # Purpose of the body below, as far as it is visible:
    #   1. Run `text` through HTMLToFragmentsParser to get a markdown string
    #      plus the fragments the parser already identified.
    #   2. Scan the UTF-8 bytes of that markdown for inline links, hashtags,
    #      mentions and bare URLs.
    #   3. Rebuild the text, replacing each "[label](href)" with just the
    #      label, and emit fragments whose offsets refer to the rebuilt text.
    # Returns (rebuilt text, fragments ordered by start position).
    # Raises Exception when two detected spans overlap or a span has
    # start > end.
    if not text:
        return "", []

    html_parser = HTMLToFragmentsParser()
    html_parser.feed(text)
    markdown, fragments = html_parser.get_result()

    markdown_bytes: bytes = markdown.encode("utf-8")

    index: int = 0
    total: int = len(markdown_bytes)

    # Each event is (start, end, payload, kind). The payload is either a
    # fragment from the HTML parser (kind "html") or a regex match.
    # NOTE(review): fg.start / fg.end are used as indices into markdown_bytes,
    # so this assumes the parser reports UTF-8 byte offsets - TODO confirm.
    events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = []
    events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])

    while index < total:
        # Indexing bytes yields an int, hence the comparisons against
        # b"..."[0] instead of against a one-byte bytes object.
        ch: int = markdown_bytes[index]
        rmatch: re.Match[bytes] | None = None
        kind = None

        # Pattern.match(data, pos) only succeeds if the match begins exactly
        # at `pos`, so each attempt is anchored at the current byte.
        if ch == b"["[0]:
            rmatch = MD_INLINE_LINK.match(markdown_bytes, index)
            kind = "inline_link"
        # Autolink detection is disabled; the reason is not recorded here.
        # elif ch == b"<"[0]:
        #    rmatch = MD_AUTOLINK.match(markdown_bytes, index)
        #    kind = "autolink"
        elif ch == b"#"[0]:
            rmatch = HASHTAG.match(markdown_bytes, index)
            kind = "hashtag"
        elif ch == b"@"[0]:
            rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index)
            kind = "mention"
        else:
            # Every other byte is tried as the potential start of a bare URL.
            rmatch = URL.match(markdown_bytes, index)
            kind = "url"

        if rmatch:
            start, end = rmatch.start(), rmatch.end()
            if end == index:
                # Zero-length match: step forward so the loop cannot stall.
                index += 1
                continue
            events.append((start, end, rmatch, kind))
            # Resume after the match, so regex events never overlap each other.
            index = end
            continue

        index += 1

    # Order by start offset. list.sort is stable, so events sharing a start
    # keep insertion order (HTML fragments were added first).
    events.sort(key=lambda x: x[0])

    # Validate before rebuilding: spans must be well-formed and must not
    # overlap. Regex events cannot overlap one another (see the scan above),
    # so an overlap here involves at least one HTML fragment.
    last_end: int = 0
    for start, end, _, _ in events:
        if start > end:
            raise Exception(f"Invalid fragment position start={start}, end={end}")
        if last_end > start:
            raise Exception(
                f"Overlapping text fragments at position end={last_end}, start={start}"
            )
        last_end = end

    ntext: bytearray = bytearray()
    nfragments: list[f.Fragment] = []
    # Running difference between a position in the rebuilt text and the same
    # position in markdown_bytes. Only inline links change it.
    offset: int = 0
    # End of the last span already copied from markdown_bytes.
    last_index: int = 0
    for start, end, rmatch, event in events:
        # Copy the plain text between the previous event and this one.
        ntext.extend(markdown_bytes[last_index:start])

        if isinstance(rmatch, f.Fragment):
            # HTML-derived fragment: text is kept verbatim, and the fragment is
            # copied with its offsets shifted into the rebuilt text.
            ntext.extend(markdown_bytes[start:end])
            nfg = replace(rmatch, start=start + offset, end=end + offset)
            nfragments.append(nfg)
            last_index = end
            continue

        # Start of this span in the rebuilt text. It is computed before any
        # offset update made by the branch below.
        nstart = start + offset
        match event:
            case "inline_link":
                label_bytes: bytes = rmatch.group(1)
                href_bytes: bytes = rmatch.group(2)
                # Only the label is emitted; the brackets and "(href)" are dropped.
                ntext.extend(label_bytes)
                # The match is always longer than its label, so delta is
                # negative and shifts every later fragment left.
                delta = len(label_bytes) - (end - start)
                offset += delta
                nend = nstart + len(label_bytes)
                nfragments.append(
                    f.LinkFragment(
                        start=nstart, end=nend, url=href_bytes.decode("utf-8")
                    )
                )
            case "hashtag":
                # Group 1 of HASHTAG supplies the tag value (pattern not
                # visible here). The matched text itself is kept unchanged.
                tag_bytes: bytes = rmatch.group(1)
                ntext.extend(markdown_bytes[start:end])
                nend = end + offset
                nfragments.append(
                    f.TagFragment(
                        start=nstart, end=nend, tag=tag_bytes.decode("utf-8")
                    )
                )
            case "mention":
                mention_bytes: bytes = rmatch.group(0)
                ntext.extend(markdown_bytes[start:end])
                mention_str = mention_bytes.decode("utf-8")
                # The stored uri omits the leading "@"; the text keeps it.
                mention_str = (
                    mention_str[1:] if mention_str.startswith("@") else mention_str
                )
                nend = end + offset
                nfragments.append(
                    f.MentionFragment(start=nstart, end=nend, uri=mention_str)
                )
            case "url":
                # The whole match is both the visible text and the link target.
                url_bytes: bytes = rmatch.group(0)
                ntext.extend(markdown_bytes[start:end])
                nend = end + offset
                nfragments.append(
                    f.LinkFragment(
                        start=nstart, end=nend, url=url_bytes.decode("utf-8")
                    )
                )
            case _:
                # The scan above emits only the four kinds handled here. If
                # another kind ever reached this point, its text would be
                # dropped, because last_index still advances past it below.
                pass
        last_index = end

    # Copy whatever follows the final event.
    ntext.extend(markdown_bytes[last_index:])

    return ntext.decode("utf-8"), nfragments