from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token

# Maps a facet feature "$type" to (token type, key in the feature dict that
# holds the token's value). Unknown feature types are skipped entirely.
_FEATURE_DISPATCH: dict[str, tuple[str, str]] = {
    "app.bsky.richtext.facet#tag": ("tag", "tag"),
    "app.bsky.richtext.facet#link": ("link", "uri"),
    "app.bsky.richtext.facet#mention": ("mention", "did"),
}


def _extract_slices(facets: list[dict]) -> list[tuple[int, int, str, str]]:
    """Convert facets into (byte_start, byte_end, token_type, value) tuples.

    Facets without features and features of unrecognized type are ignored.
    Offsets are byte offsets into the UTF-8 encoding of the post text, as
    specified by the Bluesky richtext facet schema.
    """
    slices: list[tuple[int, int, str, str]] = []
    for facet in facets:
        features: list[dict] = facet.get("features", [])
        if not features:
            continue
        # we don't support overlapping facets/features, so only the first
        # feature of each facet is considered
        feature = features[0]
        dispatch = _FEATURE_DISPATCH.get(feature["$type"])
        if dispatch is None:
            continue
        token_type, value_key = dispatch
        index = facet["index"]
        slices.append(
            (index["byteStart"], index["byteEnd"], token_type, feature[value_key])
        )
    return slices


def _drop_overlaps(
    slices: list[tuple[int, int, str, str]],
) -> list[tuple[int, int, str, str]]:
    """Sort slices by start offset and drop any that overlap an earlier one.

    A slice is kept only if it starts at or after the end of the previously
    kept slice; slices with a negative start are discarded as malformed.
    """
    slices.sort(key=lambda s: s[0])
    kept: list[tuple[int, int, str, str]] = []
    current_end = 0
    for start, end, token_type, value in slices:
        if start >= current_end:
            kept.append((start, end, token_type, value))
            current_end = end
    return kept


def _link_token(href: str, label: str) -> LinkToken:
    """Build a LinkToken, omitting the label when it is redundant.

    The label is dropped when it is just a rendering of the href itself:
    either the href's scheme-less remainder starts with the label, or the
    label is a "..."-truncated prefix of that remainder ("unflattening").
    """
    _scheme, sep, remainder = href.partition("://")
    if sep:
        if remainder.startswith(label):
            return LinkToken(href=href)
        if label.endswith("...") and remainder.startswith(label[:-3]):
            return LinkToken(href=href)
    return LinkToken(href=href, label=label)


def tokenize_post(text: str, facets: list[dict]) -> list[Token]:
    """Tokenize a Bluesky post into text, link, tag and mention tokens.

    Args:
        text: The post's plain text.
        facets: Bluesky richtext facets; each carries an ``index`` with
            ``byteStart``/``byteEnd`` offsets into the UTF-8 encoding of
            ``text`` and a ``features`` list describing the annotation.

    Returns:
        Tokens in document order. Untouched stretches of text become
        ``TextToken``s; overlapping facets are resolved first-come-first-kept.
        An empty ``text`` yields an empty list.

    Raises:
        KeyError: If a recognized feature is missing a required key
            (``$type``, ``index``, ``byteStart``, ``byteEnd``, or its
            value key).
    """

    def decode(ut8: bytes) -> str:
        return ut8.decode(encoding="utf-8")

    if not text:
        return []
    ut8_text = text.encode(encoding="utf-8")
    if not facets:
        return [TextToken(text=decode(ut8_text))]

    unique = _drop_overlaps(_extract_slices(facets))
    if not unique:
        # No usable facets (all unknown, feature-less, or malformed).
        return [TextToken(text=decode(ut8_text))]

    tokens: list[Token] = []
    prev = 0
    for start, end, token_type, value in unique:
        if start > prev:
            # Plain text between facets.
            tokens.append(TextToken(text=decode(ut8_text[prev:start])))
        segment = decode(ut8_text[start:end])
        if token_type == "link":
            tokens.append(_link_token(value, segment))
        elif token_type == "tag":
            tokens.append(TagToken(tag=segment.removeprefix("#")))
        else:  # mention — the only remaining type _extract_slices emits
            tokens.append(
                MentionToken(
                    username=segment.removeprefix("@"),
                    uri=value,
                )
            )
        prev = end

    if prev < len(ut8_text):
        # Trailing text after the last facet.
        tokens.append(TextToken(text=decode(ut8_text[prev:])))
    return tokens