···
import cross.fragments as f
from util.html import HTMLToFragmentsParser
-
# Matches a URL-like token: either "<scheme>://" (scheme = a letter followed by
# letters, digits, "+", "." or "-") or the literal "mailto:", then every
# following non-whitespace character. The tail is greedy, so trailing
# punctuation that directly follows the URL is part of the match.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
MD_INLINE_LINK = re.compile(
-
r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
MD_AUTOLINK = re.compile(
-
r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
-
# Matches "#" followed by one or more word characters (captured as group 1),
# provided the "#" is not directly preceded by a word character.
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
-
# Matches "@name" with an optional "@domain" suffix. The leading "@" must not
# follow a word character or another "@". Group 1 is the name; group 2 is the
# domain (None when absent) and must contain at least one ".".
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
# All token patterns defined above, gathered in one list.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
···
markdown, fragments = html_parser.get_result()
-
total: int = len(markdown)
-
# no match == processed fragments
-
events: list[tuple[int, int, re.Match[str] | f.Fragment, str]] = []
events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
-
rmatch = MD_INLINE_LINK.match(markdown, index)
-
# rmatch = MD_AUTOLINK.match(markdown, index)
-
rmatch = HASHTAG.match(markdown, index)
-
rmatch = FEDIVERSE_HANDLE.match(markdown, index)
-
rmatch = URL.match(markdown, index)
···
events.sort(key=lambda x: x[0])
-
# validate fragment positions
for start, end, _, _ in events:
···
nfragments: list[f.Fragment] = []
-
events.sort(key=lambda x: x[0])
for start, end, rmatch, event in events:
-
ntext.append(markdown[last_index:start])
if isinstance(rmatch, f.Fragment):
-
ntext.append(markdown[start:end])
nfg = replace(rmatch, start=start + offset, end=end + offset)
-
label = rmatch.group(1)
-
delta = len(label) - (end - start)
-
nfragments.append(f.LinkFragment(start=nstart, end=nstart + len(label), url=href))
-
ntext.append(markdown[start:end])
-
nfragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))
-
mention = rmatch.group(0)
-
ntext.append(markdown[start:end])
-
mention = mention[1:] if mention.startswith("@") else mention
-
nfragments.append(f.MentionFragment(start=nstart, end=nend, uri=mention))
-
ntext.append(markdown[start:end])
-
nfragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
-
ntext.append(markdown[last_index:])
-
return ''.join(ntext), nfragments
···
import cross.fragments as f
from util.html import HTMLToFragmentsParser
+
# Bytes pattern for a URL-like token: either "<scheme>://" (scheme = a letter
# followed by letters, digits, "+", "." or "-") or the literal "mailto:", then
# every following non-whitespace byte. Match offsets are byte offsets into the
# searched bytes object.
# NOTE(review): in a bytes pattern `\s` only covers ASCII whitespace, so
# non-ASCII whitespace (e.g. U+00A0, encoded as b"\xc2\xa0") does not end the
# match and is absorbed into the URL - confirm this is acceptable.
URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
MD_INLINE_LINK = re.compile(
+
rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
MD_AUTOLINK = re.compile(
+
rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
+
# Bytes pattern for "#" followed by one or more word bytes (captured as
# group 1), provided the "#" is not directly preceded by a word byte.
# NOTE(review): in a bytes pattern `\w` is ASCII-only ([a-zA-Z0-9_]). A tag
# containing non-ASCII letters is cut at the first non-ASCII byte ("#café"
# captures b"caf"), a tag that starts with a non-ASCII letter does not match
# at all, and the lookbehind no longer rejects a "#" that directly follows a
# non-ASCII letter. Confirm whether non-ASCII hashtags need to be supported.
HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)")
+
# Bytes pattern for "@name" with an optional "@domain" suffix. The leading "@"
# must not follow a word byte or another "@". Group 1 is the name; group 2 is
# the domain (None when absent) and must contain at least one ".".
# NOTE(review): `\w` is ASCII-only in a bytes pattern, so names or domains
# containing non-ASCII characters are truncated or not matched - confirm.
FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
# All token patterns defined above, gathered in one list.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
···
markdown, fragments = html_parser.get_result()
+
markdown_bytes: bytes = markdown.encode("utf-8")
+
total: int = len(markdown_bytes)
+
events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = []
events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
+
ch: int = markdown_bytes[index]
+
rmatch: re.Match[bytes] | None = None
+
rmatch = MD_INLINE_LINK.match(markdown_bytes, index)
+
# rmatch = MD_AUTOLINK.match(markdown_bytes, index)
+
rmatch = HASHTAG.match(markdown_bytes, index)
+
rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index)
+
rmatch = URL.match(markdown_bytes, index)
···
events.sort(key=lambda x: x[0])
for start, end, _, _ in events:
···
+
ntext: bytearray = bytearray()
nfragments: list[f.Fragment] = []
for start, end, rmatch, event in events:
+
ntext.extend(markdown_bytes[last_index:start])
if isinstance(rmatch, f.Fragment):
+
ntext.extend(markdown_bytes[start:end])
nfg = replace(rmatch, start=start + offset, end=end + offset)
+
label_bytes: bytes = rmatch.group(1)
+
href_bytes: bytes = rmatch.group(2)
+
ntext.extend(label_bytes)
+
delta = len(label_bytes) - (end - start)
+
nend = nstart + len(label_bytes)
+
start=nstart, end=nend, url=href_bytes.decode("utf-8")
+
tag_bytes: bytes = rmatch.group(1)
+
ntext.extend(markdown_bytes[start:end])
+
start=nstart, end=nend, tag=tag_bytes.decode("utf-8")
+
mention_bytes: bytes = rmatch.group(0)
+
ntext.extend(markdown_bytes[start:end])
+
mention_str = mention_bytes.decode("utf-8")
+
mention_str[1:] if mention_str.startswith("@") else mention_str
+
f.MentionFragment(start=nstart, end=nend, uri=mention_str)
+
url_bytes: bytes = rmatch.group(0)
+
ntext.extend(markdown_bytes[start:end])
+
start=nstart, end=nend, url=url_bytes.decode("utf-8")
+
ntext.extend(markdown_bytes[last_index:])
+
return ntext.decode("utf-8"), nfragments