···
 def split_tokens(tokens: list[Token], max_chars: int, max_link_len: int = 35) -> list[list[Token]]:
-    def start_new_block():
-        nonlocal current_block, blocks, current_length
···
-        blocks.append(current_block)
···
-    def append_text_to_block(text_segment):
-        nonlocal current_block
+    def new_block():
+        nonlocal blocks, block, length
···
+        blocks.append(block)
···
+    def append_text(text_segment):
         # if the last element in the current block is also text, just append to it
-        if current_block and isinstance(current_block[-1], TextToken):
-            current_block[-1].text += text_segment
+        if block and isinstance(block[-1], TextToken):
+            block[-1].text += text_segment
···
-            current_block.append(TextToken(text_segment))
+            block.append(TextToken(text_segment))
     blocks: list[list[Token]] = []
-    current_block: list[Token] = []
-    current_length: int = 0
···
-    for token in tokens:
-        if isinstance(token, TextToken):
-            # split content into alternating “words” (\S+) and “whitespace” (\s+).
-            # this ensures every space/newline is treated as its own segment.
-            segments: list[str] = ALTERNATE.findall(token.text)
+    block: list[Token] = []
···
+    for tk in tokens:  # other token types are currently not supported
+        if isinstance(tk, TagToken):
+            tag_len = 1 + len(tk.tag)  # (#) + tag
+            if length + tag_len > max_chars:
+                new_block()  # create a new block if the current one is too large
···
+        elif isinstance(tk, LinkToken):  # TODO: labels should probably be split too
+            link_len = len(tk.label)
+            if canonical_label(tk.label, tk.href):  # cut down the link if the label is canonical
+                link_len = min(link_len, max_link_len)
···
+            if length + link_len > max_chars:
···
+        elif isinstance(tk, TextToken):
+            segments: list[str] = ALTERNATE.findall(tk.text)
-                    # whitespace segment: we count it, and if it doesn't fully fit,
-                    # split the whitespace across blocks to preserve exact spacing.
-                    seg_len: int = len(seg)
···
-                    space_left = max_chars - current_length
-                    if space_left == 0:
···
-                    take = min(space_left, seg_len)
···
-                    append_text_to_block(part)
···
-                    current_length += len(part)
···
-                    if current_length == max_chars:
+                seg_len: int = len(seg)
+                if length + seg_len <= max_chars - (0 if seg.isspace() else 1):
···
+                if not seg.isspace():
+                    while len(seg) > max_chars - 1:
+                        chunk = seg[: max_chars - 1] + "-"
···
+                        seg = seg[max_chars - 1 :]
-                    # seg is a “word” (no whitespace inside).
···
-                    wlen: int = len(word)
···
-                    # if the word itself is longer than n, we must split it with hyphens.
-                    if wlen > max_chars:
-                        # first, if we're in the middle of a block, close it & start fresh.
-                        if current_length > 0:
···
-                        # carve off (n-1)-sized chunks + “-” so each chunk is n chars.
-                        while len(remaining) > (max_chars - 1):
-                            chunk = remaining[: max_chars - 1] + '-'
-                            append_text_to_block(chunk)
-                            # that chunk fills the current block
···
-                            remaining = remaining[max_chars - 1 :]
···
-                        # now whatever remains is ≤ n characters
···
-                        append_text_to_block(remaining)
-                        current_length = len(remaining)
···
-                    # word fits fully within a block (≤ n).
-                    if current_length + wlen <= max_chars:
-                        append_text_to_block(word)
-                        current_length += wlen
···
-                        # not enough space in current block → start a new one
···
-                        append_text_to_block(word)
-                        current_length = wlen
···
-        elif isinstance(token, LinkToken):
-            link_len = len(token.label)
-            if canonical_label(token.label, token.href):
-                link_len = min(link_len, max_link_len)
···
-            if current_length + link_len <= max_chars:
-                current_block.append(token)
-                current_length += link_len
···
-                current_block.append(token)
-                current_length = link_len
···
-        elif isinstance(token, TagToken):
-            # we treat a hashtag like “#tagname” for counting.
-            hashtag_len = 1 + len(token.tag)
-            if current_length + hashtag_len <= max_chars:
-                current_block.append(token)
-                current_length += hashtag_len
···
-                current_block.append(token)
-                current_length = hashtag_len
···
-            # if you happen to have other types, just append them without affecting length.
-            current_block.append(token)
···
-    # append any remaining tokens as the final block
···
-    blocks.append(current_block)
+                    while len(seg) > max_chars:
+                        chunk = seg[: max_chars]
···
+                        seg = seg[max_chars :]
···
+    blocks.append(block)
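
Note: ALTERNATE is referenced above but never defined in this diff. Going by the removed comment (alternating “words” (\S+) and “whitespace” (\s+)), it is presumably a module-level regex along the lines of the sketch below; the real definition lives elsewhere, so treat the exact pattern as an assumption:

import re

# Assumed definition, not part of this diff: findall() yields the text as
# alternating non-whitespace (\S+) and whitespace (\s+) runs, so every
# space/newline survives as its own segment.
ALTERNATE = re.compile(r"\S+|\s+")

assert ALTERNATE.findall("foo  bar\n") == ["foo", "  ", "bar", "\n"]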
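
The rewritten text branch collapses the old word/whitespace bookkeeping into one rule: a non-whitespace segment that cannot fit is carved into (max_chars - 1)-character chunks plus a trailing "-", so every emitted chunk is exactly max_chars wide, while an over-long whitespace run is carved into plain max_chars-character chunks with no hyphen, so spacing is preserved exactly. A minimal standalone sketch of just that rule; the helper name and the asserts are mine, not part of the diff:

def chunk_segment(seg: str, max_chars: int) -> list[str]:
    # Hypothetical helper mirroring the two while-loops in the diff above.
    chunks: list[str] = []
    if not seg.isspace():
        # word: hyphenate so each full chunk is exactly max_chars wide
        while len(seg) > max_chars - 1:
            chunks.append(seg[: max_chars - 1] + "-")
            seg = seg[max_chars - 1 :]
    else:
        # whitespace: split without a hyphen to keep spacing intact
        while len(seg) > max_chars:
            chunks.append(seg[: max_chars])
            seg = seg[max_chars :]
    chunks.append(seg)
    return chunks

assert chunk_segment("abcdefgh", 4) == ["abc-", "def-", "gh"]
assert chunk_segment(" " * 9, 4) == ["    ", "    ", " "]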