···
import cross.fragments as f
from util.html import HTMLToFragmentsParser
6
-
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
6
+
# Bare-URL matcher: an RFC-3986-style "scheme://" prefix or "mailto:",
# followed by a run of non-whitespace bytes. Matching is case-insensitive.
# This is a bytes pattern, so match offsets are byte offsets and `\s` only
# recognises ASCII whitespace (non-ASCII spaces such as U+00A0 or U+3000 do
# not terminate a URL).
URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
MD_INLINE_LINK = re.compile(
8
-
r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
8
+
rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
MD_AUTOLINK = re.compile(
12
-
r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
12
+
rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
14
-
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
15
-
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
14
+
# Hashtag matcher over UTF-8 encoded bytes; group 1 is the tag without "#".
#
# In a bytes pattern `\w` is ASCII-only, so a plain `[\w]+` would silently
# drop non-ASCII tags ("#日本語" would not match at all, "#café" would be cut
# to "caf"). To keep such tags working, the tag body also accepts UTF-8
# encoded characters, consumed one whole character at a time so the captured
# group always decodes cleanly:
#   * 2-byte sequences with lead byte C3-DF (U+00C0-U+07FF); lead byte C2
#     (U+0080-U+00BF: NBSP and Latin-1 punctuation) is deliberately left out;
#   * 3-byte sequences (U+0800-U+FFFF) except those starting with E2
#     (U+2000-U+2FFF: general punctuation, symbols, dingbats) or E3 80
#     (U+3000-U+303F: CJK punctuation, ideographic space).
# 4-byte sequences (mostly emoji) are not part of a tag. This is a byte-level
# approximation of the Unicode-aware `\w` a str pattern would give, not an
# exact equivalent. The lookbehind stays ASCII-only: it only rejects a "#"
# directly preceded by an ASCII word byte.
HASHTAG = re.compile(
    rb"(?<!\w)\#("
    rb"(?:\w|[\xc3-\xdf][\x80-\xbf]|(?!\xe2|\xe3\x80)[\xe0-\xef][\x80-\xbf]{2})+"
    rb")"
)
15
+
# Fediverse handle matcher over bytes: "@user", optionally followed by
# "@domain" where the domain part must contain at least one dot.
# Group 1 is the user part; group 2 is the domain and is None when absent.
# The lookbehind rejects an "@" that directly follows a word byte or another
# "@". As a bytes pattern, `\w` is ASCII-only here.
FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
# All module-level patterns gathered in one list.
# NOTE(review): the consumer of this list is not visible in this excerpt.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
···
markdown, fragments = html_parser.get_result()
30
+
markdown_bytes: bytes = markdown.encode("utf-8")
31
-
total: int = len(markdown)
33
+
total: int = len(markdown_bytes)
33
-
# no match == processed fragments
34
-
events: list[tuple[int, int, re.Match[str] | f.Fragment, str]] = []
35
+
events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = []
events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
37
-
ch = markdown[index]
39
+
ch: int = markdown_bytes[index]
40
+
rmatch: re.Match[bytes] | None = None
42
-
rmatch = MD_INLINE_LINK.match(markdown, index)
44
+
rmatch = MD_INLINE_LINK.match(markdown_bytes, index)
45
-
# rmatch = MD_AUTOLINK.match(markdown, index)
48
-
rmatch = HASHTAG.match(markdown, index)
46
+
# elif ch == b"<"[0]:
47
+
# rmatch = MD_AUTOLINK.match(markdown_bytes, index)
50
+
rmatch = HASHTAG.match(markdown_bytes, index)
51
-
rmatch = FEDIVERSE_HANDLE.match(markdown, index)
53
+
rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index)
54
-
rmatch = URL.match(markdown, index)
56
+
rmatch = URL.match(markdown_bytes, index)
···
events.sort(key=lambda x: x[0])
70
-
# validate fragment positions
for start, end, _, _ in events:
···
81
-
ntext: list[str] = []
82
+
ntext: bytearray = bytearray()
nfragments: list[f.Fragment] = []
87
-
events.sort(key=lambda x: x[0])
for start, end, rmatch, event in events:
89
-
ntext.append(markdown[last_index:start])
89
+
ntext.extend(markdown_bytes[last_index:start])
if isinstance(rmatch, f.Fragment):
92
-
ntext.append(markdown[start:end])
92
+
ntext.extend(markdown_bytes[start:end])
nfg = replace(rmatch, start=start + offset, end=end + offset)
102
-
label = rmatch.group(1)
103
-
href = rmatch.group(2)
104
-
ntext.append(label)
101
+
label_bytes: bytes = rmatch.group(1)
102
+
href_bytes: bytes = rmatch.group(2)
106
-
delta = len(label) - (end - start)
104
+
ntext.extend(label_bytes)
106
+
delta = len(label_bytes) - (end - start)
109
-
nfragments.append(f.LinkFragment(start=nstart, end=nstart + len(label), url=href))
109
+
nend = nstart + len(label_bytes)
112
+
start=nstart, end=nend, url=href_bytes.decode("utf-8")
111
-
tag = rmatch.group(1)
112
-
ntext.append(markdown[start:end])
113
-
nfragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))
117
+
tag_bytes: bytes = rmatch.group(1)
118
+
ntext.extend(markdown_bytes[start:end])
119
+
nend = end + offset
122
+
start=nstart, end=nend, tag=tag_bytes.decode("utf-8")
115
-
mention = rmatch.group(0)
116
-
ntext.append(markdown[start:end])
117
-
mention = mention[1:] if mention.startswith("@") else mention
118
-
nfragments.append(f.MentionFragment(start=nstart, end=nend, uri=mention))
127
+
mention_bytes: bytes = rmatch.group(0)
128
+
ntext.extend(markdown_bytes[start:end])
130
+
mention_str = mention_bytes.decode("utf-8")
132
+
mention_str[1:] if mention_str.startswith("@") else mention_str
135
+
nend = end + offset
137
+
f.MentionFragment(start=nstart, end=nend, uri=mention_str)
120
-
url = rmatch.group(0)
121
-
ntext.append(markdown[start:end])
122
-
nfragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
141
+
url_bytes: bytes = rmatch.group(0)
142
+
ntext.extend(markdown_bytes[start:end])
143
+
nend = end + offset
146
+
start=nstart, end=nend, url=url_bytes.decode("utf-8")
126
-
ntext.append(markdown[last_index:])
154
+
ntext.extend(markdown_bytes[last_index:])
128
-
return ''.join(ntext), nfragments
156
+
return ntext.decode("utf-8"), nfragments