···
 def split_tokens(tokens: list[Token], max_chars: int, max_link_len: int = 35) -> list[list[Token]]:
-    def start_new_block():
-        nonlocal current_block, blocks, current_length
···
-        blocks.append(current_block)
···
-    def append_text_to_block(text_segment):
-        nonlocal current_block
+    def new_block():
+        nonlocal blocks, block, length
···
+        blocks.append(block)
···
+    def append_text(text_segment):
         # if the last element in the current block is also text, just append to it
-        if current_block and isinstance(current_block[-1], TextToken):
-            current_block[-1].text += text_segment
+        if block and isinstance(block[-1], TextToken):
+            block[-1].text += text_segment
···
-            current_block.append(TextToken(text_segment))
+            block.append(TextToken(text_segment))
     blocks: list[list[Token]] = []
-    current_block: list[Token] = []
-    current_length: int = 0
···
-    for token in tokens:
-        if isinstance(token, TextToken):
-            # split content into alternating “words” (\S+) and “whitespace” (\s+).
-            # this ensures every space/newline is treated as its own segment.
-            segments: list[str] = ALTERNATE.findall(token.text)
+    block: list[Token] = []
···
+    for tk in tokens:  # other token types are currently not supported
+        if isinstance(tk, TagToken):
+            tag_len = 1 + len(tk.tag)  # (#) + tag
+            if length + tag_len > max_chars:
+                new_block()  # create a new block if the current one is too large
···
+        elif isinstance(tk, LinkToken):  # TODO: labels should probably be split too
+            link_len = len(tk.label)
+            if canonical_label(tk.label, tk.href):  # cut down the link if the label is canonical
+                link_len = min(link_len, max_link_len)
···
+            if length + link_len > max_chars:
···
+        elif isinstance(tk, TextToken):
+            segments: list[str] = ALTERNATE.findall(tk.text)
-                    # whitespace segment: we count it, and if it doesn't fully fit,
-                    # split the whitespace across blocks to preserve exact spacing.
-                    seg_len: int = len(seg)
···
-                    space_left = max_chars - current_length
-                    if space_left == 0:
···
-                    take = min(space_left, seg_len)
···
-                    append_text_to_block(part)
···
-                    current_length += len(part)
···
-                    if current_length == max_chars:
+                seg_len: int = len(seg)
+                if length + seg_len <= max_chars - (0 if seg.isspace() else 1):
···
+                if not seg.isspace():
+                    while len(seg) > max_chars - 1:
+                        chunk = seg[: max_chars - 1] + "-"
···
+                        seg = seg[max_chars - 1 :]
-                    # seg is a “word” (no whitespace inside).
···
-                    wlen: int = len(word)
···
-                    # if the word itself is longer than n, we must split it with hyphens.
-                    if wlen > max_chars:
-                        # first, if we're in the middle of a block, close it & start fresh.
-                        if current_length > 0:
···
-                        # carve off (n-1)-sized chunks + “-” so each chunk is n chars.
-                        while len(remaining) > (max_chars - 1):
-                            chunk = remaining[: max_chars - 1] + '-'
-                            append_text_to_block(chunk)
-                            # that chunk fills the current block
···
-                            remaining = remaining[max_chars - 1 :]
···
-                        # now whatever remains is ≤ n characters
···
-                        append_text_to_block(remaining)
-                        current_length = len(remaining)
···
-                    # word fits fully within a block (≤ n).
-                    if current_length + wlen <= max_chars:
-                        append_text_to_block(word)
-                        current_length += wlen
···
-                        # not enough space in current block → start a new one
···
-                        append_text_to_block(word)
-                        current_length = wlen
···
-        elif isinstance(token, LinkToken):
-            link_len = len(token.label)
-            if canonical_label(token.label, token.href):
-                link_len = min(link_len, max_link_len)
···
-            if current_length + link_len <= max_chars:
-                current_block.append(token)
-                current_length += link_len
···
-                current_block.append(token)
-                current_length = link_len
···
-        elif isinstance(token, TagToken):
-            # we treat a hashtag like “#tagname” for counting.
-            hashtag_len = 1 + len(token.tag)
-            if current_length + hashtag_len <= max_chars:
-                current_block.append(token)
-                current_length += hashtag_len
···
-                current_block.append(token)
-                current_length = hashtag_len
···
-            # if you happen to have other types, just append them without affecting length.
-            current_block.append(token)
···
-    # append any remaining tokens as the final block
···
-    blocks.append(current_block)
+                    while len(seg) > max_chars:
+                        chunk = seg[: max_chars]
···
+                        seg = seg[max_chars :]
···
+    blocks.append(block)
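
Note: ALTERNATE is referenced above but never defined in this diff. Going by the removed comment (alternating “words” (\S+) and “whitespace” (\s+)), it is presumably a module-level regex along the lines of the sketch below; the real definition lives elsewhere, so treat the exact pattern as an assumption:

import re

# Assumed definition, not part of this diff: findall() yields the text as
# alternating non-whitespace (\S+) and whitespace (\s+) runs, so every
# space/newline survives as its own segment.
ALTERNATE = re.compile(r"\S+|\s+")

assert ALTERNATE.findall("foo  bar\n") == ["foo", "  ", "bar", "\n"]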
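
The rewritten text branch collapses the old word/whitespace bookkeeping into one rule: a non-whitespace segment that cannot fit is carved into (max_chars - 1)-character chunks plus a trailing "-", so every emitted chunk is exactly max_chars wide, while an over-long whitespace run is carved into plain max_chars-character chunks with no hyphen, so spacing is preserved exactly. A minimal standalone sketch of just that rule; the helper name and the asserts are mine, not part of the diff:

def chunk_segment(seg: str, max_chars: int) -> list[str]:
    # Hypothetical helper mirroring the two while-loops in the diff above.
    chunks: list[str] = []
    if not seg.isspace():
        # word: hyphenate so each full chunk is exactly max_chars wide
        while len(seg) > max_chars - 1:
            chunks.append(seg[: max_chars - 1] + "-")
            seg = seg[max_chars - 1 :]
    else:
        # whitespace: split without a hyphen to keep spacing intact
        while len(seg) > max_chars:
            chunks.append(seg[: max_chars])
            seg = seg[max_chars :]
    chunks.append(seg)
    return chunks

assert chunk_segment("abcdefgh", 4) == ["abc-", "def-", "gh"]
assert chunk_segment(" " * 9, 4) == ["    ", "    ", " "]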