import grapheme

from cross.fragments import Fragment, LinkFragment
from dataclasses import replace


def canonical_label(label: str | None, href: str) -> bool:
    """Return True if the label carries no information beyond the href itself."""
    if not label or label == href:
        return True
    # A label matching the href with its scheme stripped is also canonical.
    split = href.split("://", 1)
    if len(split) > 1:
        if split[1] == label:
            return True
    return False


class FragmentSplitter:
    def __init__(self, climit: int, urllen: int):
        self.climit: int = climit
        self.urllen: int = urllen

    def normalize_link(self, label: str, url: str) -> str:
        # Hook for rewriting a link's label; the base class returns it unchanged.
        return label

    def tally_length(self, post: tuple[str, list[Fragment]]) -> int:
        # Character limits are counted in grapheme clusters, not code points.
        return grapheme.length(post[0])

    def url_normalize(
        self, text: str, fragments: list[Fragment]
    ) -> tuple[str, list[Fragment]]:
        if self.urllen == -1:
            return text, fragments
        # Fragment offsets are byte offsets into the UTF-8 encoded text, so the
        # text is rebuilt as bytes and fragment positions are recomputed.
        btext = text.encode("utf-8")
        nbytes = bytearray()
        nfragments: list[Fragment] = []
        fragments = sorted(fragments, key=lambda fg: fg.start)
        last_index = 0
        for fg in fragments:
            if last_index < fg.start:
                nbytes.extend(btext[last_index:fg.start])
            label_bytes = btext[fg.start:fg.end]
            label = label_bytes.decode("utf-8")
            nlabel = label
            if isinstance(fg, LinkFragment):
                nlabel = self.normalize_link(nlabel, fg.url)
            nlabel_bytes = nlabel.encode("utf-8")
            nstart = len(nbytes)
            nbytes.extend(nlabel_bytes)
            nend = len(nbytes)
            nfg = replace(fg, start=nstart, end=nend)
            nfragments.append(nfg)
            last_index = fg.end
        if last_index < len(btext):
            nbytes.extend(btext[last_index:])
        return nbytes.decode("utf-8"), nfragments

    def split(
        self, text: str, fragments: list[Fragment]
    ) -> list[tuple[str, list[Fragment]]]:
        text, fragments = self.url_normalize(text, fragments)
        if self.tally_length((text, fragments)) <= self.climit:
            return [(text, fragments)]
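

# --- Hypothetical usage sketch (not part of the original module) ------------
# A minimal sketch showing how url_normalize could be exercised. It assumes
# LinkFragment is a dataclass exposing start/end byte offsets and a url field
# and that it can be constructed with those keywords; adjust to the real
# cross.fragments definitions. With the base normalize_link, the label passes
# through unchanged, so the output equals the input here.
if __name__ == "__main__":
    sample = "see https://example.com/page for details"
    # Byte offsets 4..28 cover the URL text within the UTF-8 encoded sample.
    link = LinkFragment(start=4, end=28, url="https://example.com/page")
    splitter = FragmentSplitter(climit=500, urllen=23)
    normalized_text, normalized_fragments = splitter.url_normalize(sample, [link])
    print(normalized_text)
    print(normalized_fragments)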