social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky

More work: convert the parsers to use UTF-8 byte offsets.

zenfyr.dev 749c26db 7e647c4b

verified
Changed files
+224 -104
bluesky
util
+87
bluesky/facets.py
···
···
+
from typing import Any, override
+
import cross.fragments as f
+
from util.splitter import FragmentSplitter, canonical_label
+
+
# Bluesky rich-text facet feature `$type` identifiers, as defined by the
# app.bsky.richtext.facet lexicon. Used to pick features out of a facet's
# `features` list in parse_facets().
LINK = 'app.bsky.richtext.facet#link'
TAG = 'app.bsky.richtext.facet#tag'
MENTION = "app.bsky.richtext.facet#mention"
+
+
class BskySplitter(FragmentSplitter):
    """Fragment splitter tuned to Bluesky's limits: 300 characters per
    post, link labels shortened to 30 characters."""

    def __init__(self):
        super().__init__(300, 30)

    @override
    def normalize_link(self, label: str, url: str) -> str:
        """Shorten a link label that is just a rendering of its URL.

        When `canonical_label` reports the label as canonical for `url`,
        replace it with the scheme-less URL, truncated to `self.urllen`
        with a trailing ellipsis; otherwise keep the author's label.
        """
        if canonical_label(label, url):
            # Not every link URL contains "://" — the markdown parser
            # also emits "mailto:" URLs — so guard the split instead of
            # indexing [1] unconditionally (which raised IndexError).
            parts = url.split("://", 1)
            nlabel = parts[1] if len(parts) > 1 else url
            if len(nlabel) <= self.urllen:
                return nlabel
            return nlabel[: self.urllen - 1] + "…"
        return label
+
+
# TODO handle extending overlapping fragments somehow
def parse_facets(
    text: str,
    facets: list[dict[str, Any]] | None
) -> tuple[str, list[f.Fragment]]:
    """Convert a Bluesky post's rich-text facets into internal fragments.

    Facet indices are UTF-8 byte offsets into `text`; the returned
    fragments carry UTF-8 byte offsets into the returned text, which can
    differ from the input when a truncated link label is replaced by the
    full URL.

    Returns the (possibly rewritten) text and the fragment list.
    """
    if not facets:
        return text, []

    btext = text.encode("utf-8")
    nbytes = bytearray()
    last_original_byte_index = 0
    fragments: list[f.Fragment] = []

    # The record format does not guarantee facet ordering; the rebuild
    # below assumes ascending byte offsets, so sort first.
    for facet in sorted(facets, key=lambda fc: fc['index']['byteStart']):
        original_start: int = facet['index']['byteStart']
        original_end: int = facet['index']['byteEnd']

        # Skip facets overlapping an already-consumed span; emitting
        # them would duplicate or garble text (see TODO above).
        if original_start < last_original_byte_index:
            continue

        if last_original_byte_index < original_start:
            nbytes.extend(btext[last_original_byte_index:original_start])

        fdict = {feat['$type']: feat for feat in facet.get('features', [])}

        original_label_bytes = btext[original_start:original_end]
        original_label_str = original_label_bytes.decode("utf-8")

        nlabel_bytes = original_label_bytes

        if LINK in fdict:
            url: str = fdict.pop(LINK)['uri']
            label = original_label_str

            # Detect labels that are just the (possibly "..."-truncated)
            # URL with the scheme stripped, e.g. "example.com/pa..." for
            # "https://example.com/path".
            split = url.split("://", 1)
            full_url = False
            if len(split) > 1:
                if split[1].startswith(label):
                    full_url = True
                if label.endswith("...") and split[1].startswith(label[:-3]):
                    full_url = True

            if full_url:
                # Restore the full URL as the label so downstream
                # renderers can re-truncate it to their own limits.
                nlabel_bytes = url.encode("utf-8")

            nstart = len(nbytes)
            nbytes.extend(nlabel_bytes)
            nend = len(nbytes)

            fragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
        else:
            nstart = len(nbytes)
            nbytes.extend(nlabel_bytes)
            nend = len(nbytes)

        if TAG in fdict:
            tag: str = fdict.pop(TAG)['tag']
            fragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))

        if MENTION in fdict:
            did: str = fdict.pop(MENTION)['did']
            fragments.append(f.MentionFragment(start=nstart, end=nend, uri=did))

        last_original_byte_index = original_end

    if last_original_byte_index < len(btext):
        nbytes.extend(btext[last_original_byte_index:])

    return nbytes.decode("utf-8"), fragments
+5 -2
bluesky/input.py
···
import websockets
from atproto.util import AtUri
from bluesky.info import SERVICE, BlueskyService, validate_and_transform
from cross.attachments import (
LabelsAttachment,
···
)
return
-
# TODO FRAGMENTS
-
post = Post(id=post_uri, parent_id=parent_uri, text=record["text"])
did, _, rid = AtUri.record_uri(post_uri)
post.attachments.put(
RemoteUrlAttachment(url=f"https://bsky.app/profile/{did}/post/{rid}")
···
import websockets
from atproto.util import AtUri
+
from bluesky.facets import parse_facets
from bluesky.info import SERVICE, BlueskyService, validate_and_transform
from cross.attachments import (
LabelsAttachment,
···
)
return
+
text, fragments = parse_facets(record["text"], record.get('facets'))
+
post = Post(id=post_uri, parent_id=parent_uri, text=text)
+
post.fragments.extend(fragments)
+
did, _, rid = AtUri.record_uri(post_uri)
post.attachments.put(
RemoteUrlAttachment(url=f"https://bsky.app/profile/{did}/post/{rid}")
+25 -27
util/html.py
···
from typing import override
import cross.fragments as f
-
class HTMLToFragmentsParser(HTMLParser):
def __init__(self) -> None:
super().__init__()
-
self.text: str = ""
self.fragments: list[f.Fragment] = []
self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
self.in_pre: bool = False
self.in_code: bool = False
-
self.invisible: bool = False
def handle_a_endtag(self):
-
current_end = len(self.text)
start, _attr = self._tag_stack.pop("a")
href = _attr.get('href')
···
_attr = dict(attrs)
def append_newline():
-
if self.text and not self.text.endswith("\n"):
-
self.text += "\n"
if self.invisible:
return
···
if cls and 'quote-inline' in cls:
self.invisible = True
case "a":
-
self._tag_stack["a"] = (len(self.text), _attr)
case "code":
if not self.in_pre:
-
self.text += "`"
self.in_code = True
case "pre":
append_newline()
-
self.text += "```\n"
self.in_pre = True
case "blockquote":
append_newline()
-
self.text += "> "
case "strong" | "b":
-
self.text += "**"
case "em" | "i":
-
self.text += "*"
case "del" | "s":
-
self.text += "~~"
case "br":
-
self.text += "\n"
case _:
if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
level = int(tag[1])
-
self.text += "\n" + "#" * level + " "
@override
def handle_endtag(self, tag: str) -> None:
···
self.handle_a_endtag()
case "code":
if not self.in_pre and self.in_code:
-
self.text += "`"
self.in_code = False
case "pre":
-
self.text += "\n```\n"
self.in_pre = False
case "blockquote":
-
self.text += "\n"
case "strong" | "b":
-
self.text += "**"
case "em" | "i":
-
self.text += "*"
case "del" | "s":
-
self.text += "~~"
case "p":
-
self.text += "\n\n"
case _:
if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
-
self.text += '\n'
@override
def handle_data(self, data: str) -> None:
if not self.invisible:
-
self.text += data
def get_result(self) -> tuple[str, list[f.Fragment]]:
-
if self.text.endswith('\n\n'):
-
return self.text[:-2], self.fragments
-
return self.text, self.fragments
···
from typing import override
import cross.fragments as f
class HTMLToFragmentsParser(HTMLParser):
def __init__(self) -> None:
super().__init__()
+
self.builder: bytearray = bytearray()
self.fragments: list[f.Fragment] = []
self._tag_stack: dict[str, tuple[int, dict[str, str | None]]] = {}
self.in_pre: bool = False
self.in_code: bool = False
self.invisible: bool = False
def handle_a_endtag(self):
+
current_end = len(self.builder)
start, _attr = self._tag_stack.pop("a")
href = _attr.get('href')
···
_attr = dict(attrs)
def append_newline():
+
if self.builder and not self.builder.endswith(b"\n"):
+
self.builder.extend(b"\n")
if self.invisible:
return
···
if cls and 'quote-inline' in cls:
self.invisible = True
case "a":
+
self._tag_stack["a"] = (len(self.builder), _attr)
case "code":
if not self.in_pre:
+
self.builder.extend(b"`")
self.in_code = True
case "pre":
append_newline()
+
self.builder.extend(b"```\n")
self.in_pre = True
case "blockquote":
append_newline()
+
self.builder.extend(b"> ")
case "strong" | "b":
+
self.builder.extend(b"**")
case "em" | "i":
+
self.builder.extend(b"*")
case "del" | "s":
+
self.builder.extend(b"~~")
case "br":
+
self.builder.extend(b"\n")
case _:
if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
level = int(tag[1])
+
self.builder.extend(("\n" + "#" * level + " ").encode('utf-8'))
@override
def handle_endtag(self, tag: str) -> None:
···
self.handle_a_endtag()
case "code":
if not self.in_pre and self.in_code:
+
self.builder.extend(b"`")
self.in_code = False
case "pre":
+
self.builder.extend(b"\n```\n")
self.in_pre = False
case "blockquote":
+
self.builder.extend(b"\n")
case "strong" | "b":
+
self.builder.extend(b"**")
case "em" | "i":
+
self.builder.extend(b"*")
case "del" | "s":
+
self.builder.extend(b"~~")
case "p":
+
self.builder.extend(b"\n\n")
case _:
if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+
self.builder.extend(b'\n')
@override
def handle_data(self, data: str) -> None:
    """Append text content to the UTF-8 buffer, unless we are inside an
    invisible span (e.g. a quote-inline element)."""
    if self.invisible:
        return
    self.builder.extend(data.encode('utf-8'))
def get_result(self) -> tuple[str, list[f.Fragment]]:
    """Finish parsing: decode the UTF-8 buffer, drop a single trailing
    paragraph break, and return the text with the collected fragments."""
    text = self.builder.decode('utf-8')
    if text.endswith('\n\n'):
        text = text[:-2]
    return text, self.fragments
+71 -43
util/markdown.py
···
import cross.fragments as f
from util.html import HTMLToFragmentsParser
-
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
MD_INLINE_LINK = re.compile(
-
r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
re.IGNORECASE,
)
MD_AUTOLINK = re.compile(
-
r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
-
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
-
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
···
html_parser.feed(text)
markdown, fragments = html_parser.get_result()
index: int = 0
-
total: int = len(markdown)
-
# no match == processed fragments
-
events: list[tuple[int, int, re.Match[str] | f.Fragment, str]] = []
events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
while index < total:
-
ch = markdown[index]
-
rmatch = None
kind = None
-
if ch == "[":
-
rmatch = MD_INLINE_LINK.match(markdown, index)
kind = "inline_link"
-
# elif ch == '<':
-
# rmatch = MD_AUTOLINK.match(markdown, index)
-
# kind = "autolink"
-
elif ch == "#":
-
rmatch = HASHTAG.match(markdown, index)
kind = "hashtag"
-
elif ch == "@":
-
rmatch = FEDIVERSE_HANDLE.match(markdown, index)
kind = "mention"
else:
-
rmatch = URL.match(markdown, index)
kind = "url"
if rmatch:
···
events.sort(key=lambda x: x[0])
-
# validate fragment positions
last_end: int = 0
for start, end, _, _ in events:
if start > end:
···
)
last_end = end
-
ntext: list[str] = []
nfragments: list[f.Fragment] = []
offset: int = 0
last_index: int = 0
-
events.sort(key=lambda x: x[0])
for start, end, rmatch, event in events:
-
ntext.append(markdown[last_index:start])
if isinstance(rmatch, f.Fragment):
-
ntext.append(markdown[start:end])
nfg = replace(rmatch, start=start + offset, end=end + offset)
nfragments.append(nfg)
last_index = end
continue
nstart = start + offset
-
nend = end + offset
match event:
case "inline_link":
-
label = rmatch.group(1)
-
href = rmatch.group(2)
-
ntext.append(label)
-
delta = len(label) - (end - start)
offset += delta
-
nfragments.append(f.LinkFragment(start=nstart, end=nstart + len(label), url=href))
case "hashtag":
-
tag = rmatch.group(1)
-
ntext.append(markdown[start:end])
-
nfragments.append(f.TagFragment(start=nstart, end=nend, tag=tag))
case "mention":
-
mention = rmatch.group(0)
-
ntext.append(markdown[start:end])
-
mention = mention[1:] if mention.startswith("@") else mention
-
nfragments.append(f.MentionFragment(start=nstart, end=nend, uri=mention))
case "url":
-
url = rmatch.group(0)
-
ntext.append(markdown[start:end])
-
nfragments.append(f.LinkFragment(start=nstart, end=nend, url=url))
case _:
pass
last_index = end
-
ntext.append(markdown[last_index:])
-
return ''.join(ntext), nfragments
···
import cross.fragments as f
from util.html import HTMLToFragmentsParser
+
# All patterns operate on UTF-8 *bytes* so that match positions are byte
# offsets, matching the offset convention used by the fragment types.
# NOTE(review): in bytes patterns, `\w` is ASCII-only — non-ASCII hashtags
# and handles that the previous str-based patterns matched will no longer
# match. Confirm this is intended.
#
# Scheme-prefixed or mailto: URL, up to the next whitespace.
URL = re.compile(rb"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link: [label](url), url restricted to scheme/mailto form.
MD_INLINE_LINK = re.compile(
    rb"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink: <url>.
MD_AUTOLINK = re.compile(
    rb"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# "#tag" not preceded by a word character.
HASHTAG = re.compile(rb"(?<!\w)\#([\w]+)")
# "@user" or "@user@host.domain" fediverse-style handle.
FEDIVERSE_HANDLE = re.compile(rb"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
···
html_parser.feed(text)
markdown, fragments = html_parser.get_result()
+
markdown_bytes: bytes = markdown.encode("utf-8")
+
index: int = 0
+
total: int = len(markdown_bytes)
+
events: list[tuple[int, int, re.Match[bytes] | f.Fragment, str]] = []
events.extend([(fg.start, fg.end, fg, "html") for fg in fragments])
+
while index < total:
+
ch: int = markdown_bytes[index]
+
rmatch: re.Match[bytes] | None = None
kind = None
+
if ch == b"["[0]:
+
rmatch = MD_INLINE_LINK.match(markdown_bytes, index)
kind = "inline_link"
+
# elif ch == b"<"[0]:
+
# rmatch = MD_AUTOLINK.match(markdown_bytes, index)
+
# kind = "autolink"
+
elif ch == b"#"[0]:
+
rmatch = HASHTAG.match(markdown_bytes, index)
kind = "hashtag"
+
elif ch == b"@"[0]:
+
rmatch = FEDIVERSE_HANDLE.match(markdown_bytes, index)
kind = "mention"
else:
+
rmatch = URL.match(markdown_bytes, index)
kind = "url"
if rmatch:
···
events.sort(key=lambda x: x[0])
last_end: int = 0
for start, end, _, _ in events:
if start > end:
···
)
last_end = end
+
ntext: bytearray = bytearray()
nfragments: list[f.Fragment] = []
offset: int = 0
last_index: int = 0
for start, end, rmatch, event in events:
+
ntext.extend(markdown_bytes[last_index:start])
if isinstance(rmatch, f.Fragment):
+
ntext.extend(markdown_bytes[start:end])
nfg = replace(rmatch, start=start + offset, end=end + offset)
nfragments.append(nfg)
last_index = end
continue
nstart = start + offset
match event:
case "inline_link":
+
label_bytes: bytes = rmatch.group(1)
+
href_bytes: bytes = rmatch.group(2)
+
ntext.extend(label_bytes)
+
+
delta = len(label_bytes) - (end - start)
offset += delta
+
nend = nstart + len(label_bytes)
+
nfragments.append(
+
f.LinkFragment(
+
start=nstart, end=nend, url=href_bytes.decode("utf-8")
+
)
+
)
+
case "hashtag":
+
tag_bytes: bytes = rmatch.group(1)
+
ntext.extend(markdown_bytes[start:end])
+
nend = end + offset
+
nfragments.append(
+
f.TagFragment(
+
start=nstart, end=nend, tag=tag_bytes.decode("utf-8")
+
)
+
)
+
case "mention":
+
mention_bytes: bytes = rmatch.group(0)
+
ntext.extend(markdown_bytes[start:end])
+
+
mention_str = mention_bytes.decode("utf-8")
+
mention_str = (
+
mention_str[1:] if mention_str.startswith("@") else mention_str
+
)
+
+
nend = end + offset
+
nfragments.append(
+
f.MentionFragment(start=nstart, end=nend, uri=mention_str)
+
)
+
case "url":
+
url_bytes: bytes = rmatch.group(0)
+
ntext.extend(markdown_bytes[start:end])
+
nend = end + offset
+
nfragments.append(
+
f.LinkFragment(
+
start=nstart, end=nend, url=url_bytes.decode("utf-8")
+
)
+
)
+
case _:
pass
last_index = end
+
+
ntext.extend(markdown_bytes[last_index:])
+
return ntext.decode("utf-8"), nfragments
+36 -32
util/splitter.py
···
self.urllen: int = urllen
def normalize_link(self, label: str, url: str) -> str:
-
#if canonical_label(label, url):
-
# if self.urltrunc == "dotted":
-
# nlabel = url.split("://", 1)[1]
-
# if len(nlabel) <= self.urllen:
-
# return nlabel
-
# return nlabel[: self.urllen - 1] + "…"
return label
def url_normalize(
-
self, text: str, fragments: list[Fragment]
-
) -> tuple[str, list[Fragment]]:
-
if self.urllen == -1:
-
return text, fragments
-
ntext: list[str] = []
-
nfragments: list[Fragment] = []
-
offset: int = 0
-
last_index: int = 0
-
fragments = [fg for fg in fragments]
-
fragments.sort(key=lambda x: x.start)
-
for fg in fragments:
-
ntext.append(text[last_index:fg.start])
-
label = text[fg.start:fg.end]
-
nlabel = label
-
if isinstance(fg, LinkFragment):
-
nlabel = self.normalize_link(nlabel, fg.url)
-
ntext.append(nlabel)
-
nfg = replace(fg, start=fg.start + offset)
-
change = len(nlabel) - len(label)
-
offset += change
-
nfg = replace(nfg, end=fg.end + offset)
-
nfragments.append(nfg)
-
last_index = fg.end
-
ntext.append(text[last_index:])
-
return ''.join(ntext), nfragments
def split(
self, text: str, fragments: list[Fragment]
) -> list[tuple[str, list[Fragment]]]:
text, fragments = self.url_normalize(text, fragments)
-
if grapheme.length(text) <= self.climit:
return [(text, fragments)]
···
self.urllen: int = urllen
def normalize_link(self, label: str, url: str) -> str:
    # Default: keep link labels unchanged. Service-specific subclasses
    # (e.g. BskySplitter) override this to shorten canonical URL labels.
    return label
+
+
def tally_lenght(self, post: tuple[str, list[Fragment]]):
    # Grapheme-cluster length of the post's text, used by split() to
    # compare against the character limit.
    # NOTE(review): "lenght" is a typo of "length"; kept because split()
    # calls it by this name — consider a follow-up rename of both.
    return grapheme.length(post[0])
def url_normalize(
    self, text: str, fragments: list[Fragment]
) -> tuple[str, list[Fragment]]:
    """Rewrite link labels through normalize_link(), rebuilding every
    fragment's offsets as UTF-8 byte positions in the rewritten text."""
    if self.urllen == -1:
        # URL shortening disabled for this service; nothing to rewrite.
        return text, fragments

    source = text.encode('utf-8')
    out = bytearray()
    rebuilt: list[Fragment] = []
    cursor = 0

    for fragment in sorted(fragments, key=lambda frag: frag.start):
        # Copy plain text between the previous fragment and this one
        # (an empty slice when the fragments are adjacent).
        out.extend(source[cursor:fragment.start])

        label = source[fragment.start:fragment.end].decode('utf-8')
        if isinstance(fragment, LinkFragment):
            label = self.normalize_link(label, fragment.url)

        begin = len(out)
        out.extend(label.encode('utf-8'))
        rebuilt.append(replace(fragment, start=begin, end=len(out)))

        cursor = fragment.end

    # Trailing text after the last fragment.
    out.extend(source[cursor:])
    return out.decode('utf-8'), rebuilt
def split(
self, text: str, fragments: list[Fragment]
) -> list[tuple[str, list[Fragment]]]:
text, fragments = self.url_normalize(text, fragments)
+
if self.tally_lenght((text, fragments)) <= self.climit:
return [(text, fragments)]