social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky

rewrite: split_tokens

zenfyr.dev 0cd814a8 a333e157

verified
Changed files
+63 -109
+63 -109
cross.py
···
return tokens
def split_tokens(tokens: list[Token], max_chars: int, max_link_len: int = 35) -> list[list[Token]]:
    """Split a token stream into blocks of at most ``max_chars`` visible characters.

    Each block is a list of tokens suitable for one post. Counting rules:
    tags count as ``1 + len(tag)`` (the leading ``#``); links count as their
    label length, capped at ``max_link_len`` when the label is canonical
    (presumably because the target platform shortens such links — confirm
    against the posting code); text is split on word/whitespace boundaries,
    and a single word longer than a whole block is hyphenated across blocks.

    :param tokens: parsed tokens (TagToken / LinkToken / TextToken; other
        token types are currently not supported and are dropped)
    :param max_chars: maximum visible characters per block
    :param max_link_len: counted length of a canonically-labelled link
    :return: list of token blocks, each within ``max_chars``
    """
    blocks: list[list[Token]] = []
    block: list[Token] = []
    length = 0

    def new_block():
        # flush the current block (if non-empty) and start a fresh one
        nonlocal blocks, block, length
        if block:
            blocks.append(block)
        block = []
        length = 0

    def append_text(text_segment):
        nonlocal block
        # if the last element in the current block is also text, just append to it
        if block and isinstance(block[-1], TextToken):
            block[-1].text += text_segment
        else:
            block.append(TextToken(text_segment))

    for tk in tokens:  # other token types are currently not supported
        if isinstance(tk, TagToken):
            tag_len = 1 + len(tk.tag)  # (#) + tag
            if length + tag_len > max_chars:
                new_block()  # create new block if the current one is too large

            block.append(tk)
            length += tag_len
        elif isinstance(tk, LinkToken):  # TODO labels should probably be split too
            link_len = len(tk.label)
            if canonical_label(tk.label, tk.href):  # cut down the link if the label is canonical
                link_len = min(link_len, max_link_len)

            if length + link_len > max_chars:
                new_block()
            block.append(tk)
            length += link_len
        elif isinstance(tk, TextToken):
            # alternating word (\S+) / whitespace (\s+) segments
            segments: list[str] = ALTERNATE.findall(tk.text)
            for seg in segments:
                seg_len: int = len(seg)
                # fast path: the segment fits in the current block as-is.
                # NOTE: exact fit (length + seg_len == max_chars) is allowed;
                # reserving a character here would needlessly hyphen-split or
                # bump words that fill the block exactly.
                if length + seg_len <= max_chars:
                    append_text(seg)
                    length += seg_len
                    continue

                if length > 0:
                    new_block()

                if not seg.isspace():
                    # word longer than a whole block: carve off (max_chars - 1)
                    # characters plus a trailing "-" so each chunk is exactly
                    # max_chars wide. Words of exactly max_chars fall through
                    # untouched — they fit whole.
                    while len(seg) > max_chars:
                        chunk = seg[: max_chars - 1] + "-"
                        append_text(chunk)
                        new_block()
                        seg = seg[max_chars - 1 :]
                else:
                    # whitespace run longer than a block: no hyphen, plain cuts
                    while len(seg) > max_chars:
                        chunk = seg[:max_chars]
                        append_text(chunk)
                        new_block()
                        seg = seg[max_chars:]

                if seg:
                    append_text(seg)
                    length = len(seg)  # new_block() above reset length to 0

    # flush whatever remains as the final block
    if block:
        blocks.append(block)
    return blocks