social media crossposting tool. 3rd time's the charm
mastodon
misskey
crossposting
bluesky
1from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
2
3
def tokenize_post(text: str, facets: list[dict]) -> list[Token]:
    """Split a post's text into tokens according to its rich-text facets.

    Facet ranges (``index.byteStart`` / ``index.byteEnd``) are applied as
    byte offsets into the UTF-8 encoding of ``text``, so all slicing is done
    on the encoded bytes and the pieces are decoded afterwards.

    Only the first feature of each facet is considered.  When facets
    overlap, the one that starts first wins and the others are dropped.
    Facets that cannot be applied safely are ignored and the text they cover
    is emitted as plain text instead:

    * unknown feature types, or missing/ill-typed ``index`` / value fields;
    * empty, inverted or negative ranges, or ranges starting past the text;
    * ranges whose boundaries cut through a multi-byte UTF-8 character.

    A ``byteEnd`` beyond the end of the text is clamped to the text length.

    Args:
        text: The post text.
        facets: Facet dicts, each with an ``index`` mapping and a
            ``features`` list.  May be empty or ``None``.

    Returns:
        The tokens in document order; an empty list when ``text`` is empty.
    """
    if not text:
        return []

    raw = text.encode(encoding="utf-8")
    size = len(raw)

    def decode(chunk: bytes) -> str:
        return chunk.decode(encoding="utf-8")

    def on_char_boundary(pos: int) -> bool:
        # UTF-8 continuation bytes have the form 0b10xxxxxx; an offset that
        # points at one would split a character and make decoding fail.
        return pos == size or (raw[pos] & 0xC0) != 0x80

    # feature "$type" -> (token kind, key that holds the feature's value)
    known_features: dict[str, tuple[str, str]] = {
        "app.bsky.richtext.facet#tag": ("tag", "tag"),
        "app.bsky.richtext.facet#link": ("link", "uri"),
        "app.bsky.richtext.facet#mention": ("mention", "did"),
    }

    slices: list[tuple[int, int, str, str]] = []
    for facet in facets or []:
        features: list[dict] = facet.get("features") or []
        if not features:
            continue

        # we don't support overlapping facets/features
        feature = features[0]
        spec = known_features.get(feature.get("$type"))
        if spec is None:
            continue
        kind, value_key = spec
        value = feature.get(value_key)

        index = facet.get("index") or {}
        start = index.get("byteStart")
        end = index.get("byteEnd")
        if not (
            isinstance(value, str)
            and isinstance(start, int)
            and isinstance(end, int)
        ):
            continue

        # Slicing would clip an oversized end anyway; clamp it explicitly so
        # the checks below and the trailing-text logic see the real range.
        end = min(end, size)
        if start < 0 or start >= end:
            continue
        if not (on_char_boundary(start) and on_char_boundary(end)):
            continue

        slices.append((start, end, kind, value))

    # Stable sort: among facets starting at the same offset, the one listed
    # first in the input is kept.
    slices.sort(key=lambda s: s[0])

    tokens: list[Token] = []
    prev = 0  # byte offset just past the last emitted facet

    for start, end, kind, value in slices:
        if start < prev:
            # overlaps the facet we already emitted
            continue
        if start > prev:
            # text between facets
            tokens.append(TextToken(text=decode(raw[prev:start])))

        label = decode(raw[start:end])
        if kind == "link":
            # try to unflatten links: if the visible text is just the href
            # without its scheme (optionally truncated with "..."), emit a
            # bare link instead of a labelled one
            _, sep, rest = value.partition("://")
            is_flattened = bool(sep) and (
                rest.startswith(label)
                or (label.endswith("...") and rest.startswith(label[:-3]))
            )
            if is_flattened:
                tokens.append(LinkToken(href=value))
            else:
                tokens.append(LinkToken(href=value, label=label))
        elif kind == "tag":
            tokens.append(TagToken(tag=label.removeprefix("#")))
        else:  # "mention"
            tokens.append(
                MentionToken(username=label.removeprefix("@"), uri=value)
            )
        prev = end

    if prev < size:
        # trailing text (or the whole post when no facet was usable)
        tokens.append(TextToken(text=decode(raw[prev:])))

    return tokens