import re
from dataclasses import replace

import grapheme

from cross.tokens import LinkToken, TagToken, TextToken, Token


def canonical_label(label: str | None, href: str) -> bool:
    """Return True when *label* carries no information beyond *href*.

    That is the case when the label is missing or empty, when it equals the
    href, or when it equals the part of the href after the first ``://``.
    """
    if not label or label == href:
        return True
    _scheme, separator, rest = href.partition("://")
    return bool(separator) and rest == label


# Alternating runs of non-whitespace and whitespace. Every character belongs
# to exactly one run, so joining the findall() result reproduces the input.
ALTERNATE = re.compile(r"\S+|\s+")


def split_tokens(
    tokens: list[Token],
    max_chars: int,
    max_link_len: int = 35,
) -> list[list[Token]]:
    """Partition *tokens* into consecutive blocks of at most *max_chars*.

    Lengths are measured in grapheme clusters. Each token type is costed and
    placed differently:

    * ``TagToken`` costs ``1 + len(tag)`` and is never split. It opens a new
      block when it does not fit; a tag longer than *max_chars* is still
      emitted whole.
    * ``LinkToken`` costs the length of its label, capped at *max_link_len*
      when ``canonical_label`` reports the label as canonical. A link that
      does not fit is moved to a fresh block. Only if it is still too long
      there is its label cut into several link tokens, each cut marked with
      a trailing ``-``.
    * ``TextToken`` is split into whitespace and non-whitespace runs. Runs
      are packed greedily; a run longer than a block is cut, with ``-``
      appended to every non-final piece of a word. Adjacent text inside a
      block is merged into a single ``TextToken``.
    * Any other token is appended to the current block at zero cost.

    Args:
        tokens: Tokens to distribute, in order.
        max_chars: Maximum length of one block.
        max_link_len: Cap applied to the cost of a canonical link.

    Returns:
        The non-empty blocks, in order.

    Raises:
        ValueError: If an oversized word, whitespace run or link label has to
            be cut but *max_chars* leaves no room for even one grapheme per
            piece. Without this check the cut loops would never terminate.
    """
    blocks: list[list[Token]] = []
    block: list[Token] = []
    length: int = 0

    def new_block() -> None:
        """Flush the current block (if non-empty) and start an empty one."""
        nonlocal block, length
        if block:
            blocks.append(block)
        block, length = [], 0

    def append_text(text: str) -> None:
        """Append *text*, merging with a trailing TextToken when present."""
        if block and isinstance(block[-1], TextToken):
            # Build a new token rather than mutating one the caller may own.
            block[-1] = replace(block[-1], text=block[-1].text + text)
        else:
            block.append(TextToken(text=text))

    def require_width(width: int) -> None:
        """Reject a cut width that cannot make progress."""
        if width < 1:
            raise ValueError(
                f"max_chars={max_chars} leaves no room to split an oversized token"
            )

    for tk in tokens:
        if isinstance(tk, TagToken):
            # NOTE(review): the extra 1 presumably accounts for a tag prefix
            # character added at render time - confirm against the renderer.
            tag_len = 1 + grapheme.length(tk.tag)
            if length + tag_len > max_chars:
                new_block()
            block.append(tk)
            length += tag_len
            continue

        if isinstance(tk, LinkToken):
            label_text = tk.label or ""
            # NOTE(review): a link without a label costs 0 here; presumably
            # the renderer shows (part of) the href instead - confirm.
            link_len = grapheme.length(label_text)
            if canonical_label(tk.label, tk.href):
                link_len = min(link_len, max_link_len)

            if length and length + link_len > max_chars:
                new_block()
            # Checked again after the possible block break: a link that fits
            # in the fresh block must stay whole. Cutting it by its raw label
            # would over-split canonical links and drop label-less ones.
            if length + link_len <= max_chars:
                block.append(tk)
                length += link_len
                continue

            # The link is longer than a whole block; length is 0 here.
            remaining = label_text
            while remaining:
                remaining_len = grapheme.length(remaining)
                # The final piece may use the full width; earlier pieces
                # reserve one cell for the hyphen.
                room = max_chars if remaining_len <= max_chars else max_chars - 1
                require_width(room)
                chunk = grapheme.slice(remaining, 0, room)
                if remaining_len > room:
                    chunk += "-"
                block.append(replace(tk, label=chunk))
                length += grapheme.length(chunk)
                remaining = grapheme.slice(remaining, room, remaining_len)
                if remaining:
                    new_block()
            continue

        if isinstance(tk, TextToken):
            for seg in ALTERNATE.findall(tk.text):
                is_space = seg.isspace()
                # Words always keep one cell in reserve (the cell a hyphen
                # occupies when a word is cut); whitespace may fill the block.
                budget = max_chars if is_space else max_chars - 1
                seg_len = grapheme.length(seg)
                if length + seg_len <= budget:
                    append_text(seg)
                    length += seg_len
                    continue

                if length:
                    new_block()
                if seg_len > budget:
                    require_width(budget)
                hyphen = "" if is_space else "-"
                # Emit full-width pieces, one per block, until the rest fits.
                while grapheme.length(seg) > budget:
                    append_text(grapheme.slice(seg, 0, budget) + hyphen)
                    new_block()
                    seg = grapheme.slice(seg, budget, grapheme.length(seg))
                if seg:
                    append_text(seg)
                    length = grapheme.length(seg)
            continue

        # Unknown token kinds are passed through without affecting the length.
        block.append(tk)

    if block:
        blocks.append(block)
    return blocks