social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky

better media handling(?)
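Roughly: attachments are now downloaded once by the input, wrapped in a shared media_util.MediaInfo (url, name, mime, alt, io), and handed as-is to every output, which only checks sizes and MIME types instead of re-downloading. A minimal sketch of that flow, assuming a hypothetical collect_attachments helper (MediaInfo and download_media are the real additions in media_util.py below):

import media_util

def collect_attachments(items: list[tuple[str, str]]) -> list[media_util.MediaInfo] | None:
    # items are (url, alt) pairs taken from the source post; the helper name is illustrative only
    attachments: list[media_util.MediaInfo] = []
    for url, alt in items:
        # download_media fetches the blob once and sniffs its MIME type (see media_util.py below)
        info = media_util.download_media(url, alt)
        if not info:
            return None  # inputs skip the whole post when any attachment fails to download
        attachments.append(info)
    return attachments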

zenfyr.dev 866a152a 7e3232c7

verified
+73 -102
bluesky.py
···
return tokens
class BlueskyPost(cross.Post):
-
def __init__(self, pds_url: str, did: str, post: dict) -> None:
+
def __init__(self, post: dict, attachments: list[media_util.MediaInfo]) -> None:
super().__init__()
self.post = post
self.tokens = tokenize_post(post)
···
self.cw = ''
if labels:
self.cw = ', '.join([str(label['val']).replace('-', ' ') for label in labels])
-
-
def get_blob_url(blob: str):
-
nonlocal pds_url, did
-
return f'{pds_url}/xrpc/com.atproto.sync.getBlob?did={did}&cid={blob}'
-
-
attachments: list[cross.MediaAttachment] = []
-
embed = self.post.get('embed', {})
-
if embed.get('$type') == 'app.bsky.embed.images':
-
model = get_model_or_create(embed, model=models.AppBskyEmbedImages.Main)
-
assert isinstance(model, models.AppBskyEmbedImages.Main)
-
-
for image in model.images:
-
attachments.append(BlueskyAttachment(
-
get_blob_url(image.image.cid.encode()),
-
'image', image.alt
-
))
-
elif embed.get('$type') == 'app.bsky.embed.video':
-
model = get_model_or_create(embed, model=models.AppBskyEmbedVideo.Main)
-
assert isinstance(model, models.AppBskyEmbedVideo.Main)
-
-
attachments.append(BlueskyAttachment(
-
get_blob_url(model.video.cid.encode()),
-
'video', model.alt if model.alt else ''
-
))
self.attachments = attachments
def get_tokens(self) -> list[cross.Token]:
···
def is_sensitive(self) -> bool:
return self.post.get('labels', {}).get('values') or False
-
def get_attachments(self) -> list[cross.MediaAttachment]:
-
return self.attachments or []
-
-
class BlueskyAttachment(cross.MediaAttachment):
-
def __init__(self, url: str, type: str, alt: str) -> None:
-
super().__init__()
-
self.url = url
-
self.type = type
-
self.alt = alt
-
-
def get_url(self) -> str:
-
return self.url
-
-
def get_type(self) -> str | None:
-
return self.type
-
-
def create_meta(self, bytes: bytes) -> cross.MediaMeta:
-
o_meta = media_util.get_media_meta(bytes)
-
return cross.MediaMeta(o_meta['width'], o_meta['height'], o_meta.get('duration', -1))
-
-
def get_alt(self) -> str:
-
return self.alt
+
def get_attachments(self) -> list[media_util.MediaInfo]:
+
return self.attachments
class BlueskyInput(cross.Input):
def __init__(self, settings: dict, db: DataBaseWorker) -> None:
···
return
LOGGER.info("Crossposting '%s'...", post_ref)
-
cross_post = BlueskyPost(self.pds, self.user_id, post)
+
+
def get_blob_url(blob: str):
+
return f'{self.pds}/xrpc/com.atproto.sync.getBlob?did={self.user_id}&cid={blob}'
+
+
attachments: list[media_util.MediaInfo] = []
+
embed = post.get('embed', {})
+
if embed.get('$type') == 'app.bsky.embed.images':
+
model = get_model_or_create(embed, model=models.AppBskyEmbedImages.Main)
+
assert isinstance(model, models.AppBskyEmbedImages.Main)
+
+
for image in model.images:
+
url = get_blob_url(image.image.cid.encode())
+
LOGGER.info("Downloading %s...", url)
+
io = media_util.download_media(url, image.alt)
+
if not io:
+
LOGGER.error("Skipping '%s'. Failed to download media!", post_ref)
+
return
+
attachments.append(io)
+
elif embed.get('$type') == 'app.bsky.embed.video':
+
model = get_model_or_create(embed, model=models.AppBskyEmbedVideo.Main)
+
assert isinstance(model, models.AppBskyEmbedVideo.Main)
+
url = get_blob_url(model.video.cid.encode())
+
LOGGER.info("Downloading %s...", url)
+
io = media_util.download_media(url, model.alt if model.alt else '')
+
if not io:
+
LOGGER.error("Skipping '%s'. Failed to download media!", post_ref)
+
return
+
attachments.append(io)
+
+
cross_post = BlueskyPost(post, attachments)
for output in outputs:
output.accept_post(cross_post)
return
···
raise Exception("Account app password not provided!")
did, pds = resolve_identity(
-
handle=util.as_envvar(settings.get('hanlde')),
+
handle=util.as_envvar(settings.get('handle')),
did=util.as_envvar(settings.get('did')),
pds=util.as_envvar(settings.get('pds'))
)
···
thread_tuple[3]
)
-
def _split_attachments(self, attachments: list[cross.MediaAttachment]):
-
sup_media: list[cross.MediaAttachment] = []
-
unsup_media: list[cross.MediaAttachment] = []
+
def _split_attachments(self, attachments: list[media_util.MediaInfo]):
+
sup_media: list[media_util.MediaInfo] = []
+
unsup_media: list[media_util.MediaInfo] = []
-
for attachment in attachments:
-
attachment_type = attachment.get_type()
-
if not attachment_type:
-
continue
-
-
if attachment_type in {'video', 'image'}: # TODO convert gifs to videos
-
sup_media.append(attachment)
+
for a in attachments:
+
if a.mime.startswith('image/') or a.mime.startswith('video/'): # TODO convert gifs to videos
+
sup_media.append(a)
else:
-
unsup_media.append(attachment)
+
unsup_media.append(a)
return (sup_media, unsup_media)
def _split_media_per_post(
self,
tokens: list[client_utils.TextBuilder],
-
media: list[cross.MediaAttachment]):
+
media: list[media_util.MediaInfo]):
posts: list[dict] = [{"tokens": tokens, "attachments": []} for tokens in tokens]
available_indices: list[int] = list(range(len(posts)))
···
return new_idx
for att in media:
-
if att.get_type() == 'video':
+
if att.mime.startswith('video/'):
current_image_post_idx = None
idx = pop_next_empty_index()
posts[idx]["attachments"].append(att)
-
elif att.get_type() == 'image':
+
elif att.mime.startswith('image/'):
if (
current_image_post_idx is not None
and len(posts[current_image_post_idx]["attachments"]) < 4
···
posts[idx]["attachments"].append(att)
current_image_post_idx = idx
-
result: list[tuple[client_utils.TextBuilder, list[cross.MediaAttachment]]] = []
+
result: list[tuple[client_utils.TextBuilder, list[media_util.MediaInfo]]] = []
for p in posts:
result.append((p["tokens"], p["attachments"]))
return result
···
tokens.append(cross.TextToken('\n'))
for i, attachment in enumerate(unsup_media):
tokens.append(cross.LinkToken(
-
attachment.get_url(),
-
f"[{media_util.get_filename_from_url(attachment.get_url())}]"
+
attachment.url,
+
f"[{media_util.get_filename_from_url(attachment.url)}]"
))
tokens.append(cross.TextToken(' '))
-
split_tokens: list[list[cross.Token]] = util.split_tokens(tokens, 300)
+
split_tokens: list[list[cross.Token]] = cross.split_tokens(tokens, 300)
post_text: list[client_utils.TextBuilder] = []
# convert tokens into rich text. skip post if contains unsupported tokens
···
if not post_text:
post_text = [client_utils.TextBuilder().text('')]
-
# download media first. increased RAM usage, but more reliable
for m in sup_media:
-
if not m.bytes:
-
if m.get_type() == 'image':
-
image_bytes = media_util.download_blob(m.get_url(), max_bytes=2_000_000)
-
if not image_bytes:
-
LOGGER.error("Skipping post_id '%s', failed to download attachment! File too large?", post.get_id())
-
return
-
m.bytes = image_bytes
-
elif m.get_type() == 'video':
-
video_bytes = media_util.download_blob(m.get_url(), max_bytes=100_000_000)
-
if not video_bytes:
-
LOGGER.error("Skipping post_id '%s', failed to download attachment! File too large?", post.get_id())
-
return
-
m.bytes = video_bytes
+
if m.mime.startswith('image/'):
+
if len(m.io) > 2_000_000:
+
LOGGER.error("Skipping post_id '%s', failed to download attachment! File too large.", post.get_id())
+
return
+
+
if m.mime.startswith('video/'):
+
if len(m.io) > 100_000_000:
+
LOGGER.error("Skipping post_id '%s', failed to download attachment! File too large?", post.get_id())
+
return
created_records: list[models.AppBskyFeedPost.CreateRecordResponse] = []
baked_media = self._split_media_per_post(post_text, sup_media)
···
created_records.append(new_post)
else:
# if a single post is an image - everything else is an image
-
if attachments[0].get_type() == 'image':
+
if attachments[0].mime.startswith('image/'):
images: list[bytes] = []
image_alts: list[str] = []
image_aspect_ratios: list[models.AppBskyEmbedDefs.AspectRatio] = []
for attachment in attachments:
-
assert attachment.bytes
-
image_io = media_util.compress_image(attachment.bytes, quality=100)
-
metadata = attachment.create_meta(image_io)
+
image_io = media_util.compress_image(attachment.io, quality=100)
+
metadata = media_util.get_media_meta(image_io)
if len(image_io) > 1_000_000:
-
LOGGER.info("Compressing %s...", attachment.get_url())
+
LOGGER.info("Compressing %s...", attachment.name)
+
image_io = media_util.compress_image(image_io)
images.append(image_io)
-
image_alts.append(attachment.get_alt())
+
image_alts.append(attachment.alt)
image_aspect_ratios.append(models.AppBskyEmbedDefs.AspectRatio(
-
width=metadata.get_width(),
-
height=metadata.get_height()
+
width=metadata['width'],
+
height=metadata['height']
))
new_post = self.bsky.send_images(
···
reply_ref = models.create_strong_ref(new_post)
created_records.append(new_post)
else: # video is guaranteed to be one
-
video_data = attachments[0]
-
assert video_data.bytes
-
video_io = video_data.bytes
-
-
metadata = video_data.create_meta(video_io)
-
if metadata.get_duration() > 180:
+
metadata = media_util.get_media_meta(attachments[0].io)
+
if metadata['duration'] > 180:
LOGGER.info("Skipping post_id '%s', video attachment too long!", post.get_id())
return
-
probe = media_util.probe_bytes(video_io)
-
format_name = probe['format']['format_name']
-
if 'mp4' not in format_name.split(','):
-
LOGGER.error("Converting %s to mp4...", video_data.get_url())
+
video_io = attachments[0].io
+
if attachments[0].mime != 'video/mp4':
+
LOGGER.error("Converting %s to mp4...", attachments[0].name)
video_io = media_util.convert_to_mp4(video_io)
aspect_ratio = models.AppBskyEmbedDefs.AspectRatio(
-
width=metadata.get_width(),
-
height=metadata.get_height()
+
width=metadata['width'],
+
height=metadata['height']
)
new_post = self.bsky.send_video(
text=post_text[0],
video=video_io,
video_aspect_ratio=aspect_ratio,
-
video_alt=video_data.get_alt(),
+
video_alt=attachments[0].alt,
reply_to= models.AppBskyFeedPost.ReplyRef(
parent=reply_ref,
root=root_ref
+125 -19
cross.py
···
from typing import Callable, Any
from database import DataBaseWorker
from datetime import datetime, timezone
+
from media_util import MediaInfo, get_media_meta
+
import util
+
import re
+
+
ALTERNATE = re.compile(r'\S+|\s+')
# generic token
class Token():
···
def get_duration(self) -> float:
return self.duration
-
-
class MediaAttachment():
-
def __init__(self) -> None:
-
self.bytes: bytes | None = None # filled-in later
-
pass
-
-
def create_meta(self, bytes: bytes) -> MediaMeta:
-
return MediaMeta(-1, -1, -1)
-
-
def get_url(self) -> str:
-
return ''
-
-
def get_type(self) -> str | None:
-
return None
-
-
def get_alt(self) -> str:
-
return ''
class Post():
def __init__(self) -> None:
···
def get_post_date_iso(self) -> str:
return datetime.now(timezone.utc).isoformat()
-
def get_attachments(self) -> list[MediaAttachment]:
+
def get_attachments(self) -> list[MediaInfo]:
return []
def get_id(self) -> str:
···
pass
def delete_post(self, identifier: str):
-
pass
+
pass
+
+
def split_tokens(tokens: list[Token], max_chars: int, max_link_len: int = 35) -> list[list[Token]]:
+
def start_new_block():
+
nonlocal current_block, blocks, current_length
+
if current_block:
+
blocks.append(current_block)
+
current_block = []
+
current_length = 0
+
+
def append_text_to_block(text_segment):
+
nonlocal current_block
+
# if the last element in the current block is also text, just append to it
+
if current_block and isinstance(current_block[-1], TextToken):
+
current_block[-1].text += text_segment
+
else:
+
current_block.append(TextToken(text_segment))
+
+
blocks: list[list[Token]] = []
+
current_block: list[Token] = []
+
current_length: int = 0
+
+
for token in tokens:
+
if isinstance(token, TextToken):
+
# split content into alternating “words” (\S+) and “whitespace” (\s+).
+
# this ensures every space/newline is treated as its own segment.
+
segments: list[str] = ALTERNATE.findall(token.text)
+
+
for seg in segments:
+
if seg.isspace():
+
# whitespace segment: we count it, and if it doesn't fully fit,
+
# split the whitespace across blocks to preserve exact spacing.
+
seg_len: int = len(seg)
+
while seg_len > 0:
+
space_left = max_chars - current_length
+
if space_left == 0:
+
start_new_block()
+
continue
+
+
take = min(space_left, seg_len)
+
part = seg[:take]
+
append_text_to_block(part)
+
+
current_length += len(part)
+
seg = seg[take:]
+
seg_len -= take
+
+
if current_length == max_chars:
+
start_new_block()
+
+
else:
+
# seg is a “word” (no whitespace inside).
+
word: str = seg
+
wlen: int = len(word)
+
+
# if the word itself is longer than n, we must split it with hyphens.
+
if wlen > max_chars:
+
# first, if we're in the middle of a block, close it & start fresh.
+
if current_length > 0:
+
start_new_block()
+
+
remaining = word
+
# carve off (n-1)-sized chunks + “-” so each chunk is n chars.
+
while len(remaining) > (max_chars - 1):
+
chunk = remaining[: max_chars - 1] + '-'
+
append_text_to_block(chunk)
+
# that chunk fills the current block
+
start_new_block()
+
remaining = remaining[max_chars - 1 :]
+
+
# now whatever remains is ≤ n characters
+
if remaining:
+
append_text_to_block(remaining)
+
current_length = len(remaining)
+
+
else:
+
# word fits fully within a block (≤ n).
+
if current_length + wlen <= max_chars:
+
append_text_to_block(word)
+
current_length += wlen
+
else:
+
# not enough space in current block → start a new one
+
start_new_block()
+
append_text_to_block(word)
+
current_length = wlen
+
+
elif isinstance(token, LinkToken):
+
link_len = len(token.label)
+
if util.canonical_label(token.label, token.href):
+
link_len = min(link_len, max_link_len)
+
+
if current_length + link_len <= max_chars:
+
current_block.append(token)
+
current_length += link_len
+
else:
+
start_new_block()
+
current_block.append(token)
+
current_length = link_len
+
+
elif isinstance(token, TagToken):
+
# we treat a hashtag like “#tagname” for counting.
+
hashtag_len = 1 + len(token.tag)
+
if current_length + hashtag_len <= max_chars:
+
current_block.append(token)
+
current_length += hashtag_len
+
else:
+
start_new_block()
+
current_block.append(token)
+
current_length = hashtag_len
+
+
else:
+
# if you happen to have other types, just append them without affecting length.
+
current_block.append(token)
+
+
# append any remaining tokens as the final block
+
if current_block:
+
blocks.append(current_block)
+
+
return blocks
+1 -1
main.py
···
input = INPUTS[input_settings['type']](input_settings, db_worker)
if not outputs_settings:
-
LOGGER.warning("No outputs specified! Check your config!")
+
LOGGER.warning("No outputs specified! Check the config!")
outputs: list[cross.Output] = []
for output_settings in outputs_settings:
+45 -88
mastodon.py
···
from database import DataBaseWorker
from typing import Callable, Any
import asyncio, time
-
import magic
from bs4 import BeautifulSoup, Tag
from bs4.element import NavigableString
···
'audio': 'audio',
'unknown': 'other'
}
+
POSSIBLE_MIMES = [
+
'audio/ogg',
+
'audio/mp3',
+
'image/webp',
+
'image/jpeg',
+
'image/png',
+
'video/mp4',
+
'video/quicktime',
+
'video/webm'
+
]
def tokenize_post(status: dict) -> list[cross.Token]:
soup = BeautifulSoup(status['content'], "html.parser")
···
return tokens
class MastodonPost(cross.Post):
-
def __init__(self, status: dict) -> None:
+
def __init__(self, status: dict, media_attachments: list[media_util.MediaInfo]) -> None:
super().__init__()
self.status = status
-
media_attachments: list[cross.MediaAttachment] = []
-
-
for attachment in status.get('media_attachments', []):
-
media_attachments.append(MastodonAttachment(attachment))
-
self.media_attachments = media_attachments
-
self.tokens = tokenize_post(status)
def get_tokens(self) -> list[cross.Token]:
···
def is_sensitive(self) -> bool:
return self.status.get('sensitive', False)
-
def get_attachments(self) -> list[cross.MediaAttachment]:
+
def get_attachments(self) -> list[media_util.MediaInfo]:
return self.media_attachments
-
class MastodonAttachment(cross.MediaAttachment):
-
def __init__(self, attachment: dict) -> None:
-
super().__init__()
-
self.attachment = attachment
-
-
if attachment.get('type') == 'video' or attachment.get('type') == 'image':
-
if attachment.get('meta') and attachment.get('meta', {}).get('original'):
-
def from_status(bytes: bytes) -> cross.MediaMeta:
-
o_meta = attachment.get('meta', {}).get('original')
-
return cross.MediaMeta(o_meta['width'], o_meta['height'], o_meta.get('duration', -1))
-
self.meta_generator = from_status
-
else:
-
def from_bytes(bytes: bytes) -> cross.MediaMeta:
-
o_meta = media_util.get_media_meta(bytes)
-
return cross.MediaMeta(o_meta['width'], o_meta['height'], o_meta.get('duration', -1))
-
self.meta_generator = from_bytes
-
-
# URL to download the attachment from
-
def get_url(self) -> str:
-
return self.attachment.get('url', '')
-
-
# type of attachment
-
def get_type(self) -> str | None:
-
return FORMATS.get(self.attachment.get('type', 'other'), 'other')
-
-
# create file metadata from bytes or other
-
def create_meta(self, bytes: bytes) -> cross.MediaMeta:
-
if self.meta_generator:
-
return self.meta_generator(bytes)
-
return cross.MediaMeta(-1, -1, -1)
-
-
# get media description
-
def get_alt(self) -> str:
-
return self.attachment.get('description') or ''
-
class MastodonInput(cross.Input):
def __init__(self, settings: dict, db: DataBaseWorker) -> None:
self.options = settings.get('options', {})
···
return
LOGGER.info("Crossposting '%s'...", status['id'])
-
cross_post = MastodonPost(status)
+
+
media_attachments: list[media_util.MediaInfo] = []
+
for attachment in status.get('media_attachments', []):
+
LOGGER.info("Downloading %s...", attachment['url'])
+
info = media_util.download_media(attachment['url'], attachment.get('description') or '')
+
if not info:
+
LOGGER.error("Skipping '%s'. Failed to download media!", status['id'])
+
return
+
media_attachments.append(info)
+
+
cross_post = MastodonPost(status, media_attachments)
for output in outputs:
output.accept_post(cross_post)
···
media_config: dict = configuration.get('media_attachments', {})
self.image_size_limit: int = media_config.get('image_size_limit', 16777216)
self.video_size_limit: int = media_config.get('video_size_limit', 103809024)
-
self.supported_mime_types: list[str] = media_config.get('supported_mime_types', [
-
'audio/ogg',
-
'image/jpeg',
-
'image/png',
-
'video/mp4'
-
])
+
self.supported_mime_types: list[str] = media_config.get('supported_mime_types', POSSIBLE_MIMES)
-
# *oma max post chars
+
# *oma: max post chars
max_toot_chars = instance_info.get('max_toot_chars')
if max_toot_chars:
self.max_characters: int = max_toot_chars
-
# *oma max upload limit
+
# *oma: max upload limit
upload_limit = instance_info.get('upload_limit')
if upload_limit:
self.image_size_limit: int = upload_limit
self.video_size_limit: int = upload_limit
+
# *oma ext: supported text types
self.text_format = 'text/plain'
pleroma = instance_info.get('pleroma')
if pleroma:
···
elif 'text/markdown' in post_formats:
self.text_format = 'text/markdown'
-
def upload_media(self, attachments: list[cross.MediaAttachment]) -> list[str] | None:
-
prepare: list[tuple[str, str, bytes]] = []
-
-
for attachment in attachments:
-
alt = attachment.get_alt()
-
mbytes: bytes | None
+
def upload_media(self, attachments: list[media_util.MediaInfo]) -> list[str] | None:
+
for a in attachments:
+
if a.mime.startswith('image/') and len(a.io) > self.image_size_limit:
+
return None
-
if attachment.get_type() == 'image':
-
mbytes = media_util.download_blob(attachment.get_url(), self.image_size_limit)
-
elif attachment.get_type() in {'video', 'gif'}:
-
mbytes = media_util.download_blob(attachment.get_url(), self.video_size_limit)
-
else:
-
mbytes = media_util.download_blob(attachment.get_url(), 7_000_000)
-
-
if not mbytes:
+
if a.mime.startswith('video/') and len(a.io) > self.video_size_limit:
return None
-
filename = media_util.get_filename_from_url(attachment.get_url())
-
LOGGER.info("Downloaded %s", filename)
-
prepare.append((filename, alt, mbytes))
+
if not a.mime.startswith('image/') and not a.mime.startswith('video/'):
+
if len(a.io) > 7_000_000:
+
return None
uploads: list[dict] = []
-
-
for name, desc, bbytes in prepare:
-
mime_type = magic.Magic(mime=True).from_buffer(bbytes)
-
if not mime_type:
-
mime_type = 'application/octet-stream'
-
-
files = {
-
'file': (name, bbytes, mime_type)
-
}
+
for a in attachments:
data = {}
-
if desc:
-
data['description'] = desc
+
if a.alt:
+
data['description'] = a.alt
req = requests.post(f"{self.service}/api/v2/media", headers= {
'Authorization': f'Bearer {self.token}'
-
}, files=files, data=data)
+
}, files={'file': (a.name, a.io, a.mime)}, data=data)
if req.status_code == 200:
-
LOGGER.info("Uploaded %s! (%s)", name, req.json()['id'])
+
LOGGER.info("Uploaded %s! (%s)", a.name, req.json()['id'])
uploads.append({
'done': True,
'id': req.json()['id']
})
elif req.status_code == 202:
-
LOGGER.info("Waiting for %s to process!", name)
+
LOGGER.info("Waiting for %s to process!", a.name)
uploads.append({
'done': False,
'id': req.json()['id']
})
else:
-
LOGGER.error("Failes to download %s! %s", name, req.text)
+
LOGGER.error("Failed to upload %s! %s", a.name, req.text)
req.raise_for_status()
while any([not val['done'] for val in uploads]):
···
return p_text
-
def split_tokens_media(self, tokens: list[cross.Token], media: list[cross.MediaAttachment]):
-
split_tokens = util.split_tokens(tokens, self.max_characters, self.characters_reserved_per_url)
+
def split_tokens_media(self, tokens: list[cross.Token], media: list[media_util.MediaInfo]):
+
split_tokens = cross.split_tokens(tokens, self.max_characters, self.characters_reserved_per_url)
post_text: list[str] = []
for block in split_tokens:
···
posts[idx]["attachments"].append(att)
current_image_post_idx = idx
-
result: list[tuple[str, list[cross.MediaAttachment]]] = []
+
result: list[tuple[str, list[media_util.MediaInfo]]] = []
for p in posts:
result.append((p['text'], p["attachments"]))
+30 -1
media_util.py
···
import json
import re, urllib.parse, os
from util import LOGGER
+
import magic
FILENAME = re.compile(r'filename="?([^\";]*)"?')
+
MAGIC = magic.Magic(mime=True)
+
+
class MediaInfo():
+
def __init__(self, url: str, name: str, mime: str, alt: str, io: bytes) -> None:
+
self.url = url
+
self.name = name
+
self.mime = mime
+
self.alt = alt
+
self.io = io
+
+
def download_media(url: str, alt: str) -> MediaInfo | None:
+
name = get_filename_from_url(url)
+
io = download_blob(url, max_bytes=100_000_000)
+
if not io:
+
LOGGER.error("Failed to download media attachment! %s", url)
+
return None
+
mime = MAGIC.from_buffer(io)
+
if not mime:
+
mime = 'application/octet-stream'
+
return MediaInfo(url, name, mime, alt, io)
def get_filename_from_url(url):
try:
···
pass
parsed_url = urllib.parse.urlparse(url)
-
return os.path.basename(parsed_url.path)
+
base_name = os.path.basename(parsed_url.path)
+
+
# hardcoded fix: for PDS getBlob URLs, return the cid query parameter as the filename
+
if base_name == 'com.atproto.sync.getBlob':
+
qs = urllib.parse.parse_qs(parsed_url.query)
+
if qs and qs.get('cid'):
+
return qs['cid'][0]
+
+
return base_name
def probe_bytes(bytes: bytes) -> dict:
cmd = [
+15 -35
misskey.py
···
return tokens
class MisskeyPost(cross.Post):
-
def __init__(self, note: dict) -> None:
+
def __init__(self, note: dict, files: list[media_util.MediaInfo]) -> None:
super().__init__()
self.note = note
-
-
media_attachments: list[cross.MediaAttachment] = []
-
-
sensitive = False
-
for attachment in note.get('files', []):
-
media_attachments.append(MisskeyAttachment(attachment))
-
sensitive |= attachment.get('isSensitive', False)
-
-
self.sensitive = sensitive
-
self.media_attachments = media_attachments
-
+
self.sensitive = any([a.get('isSensitive', False) for a in note.get('files', [])])
+
self.media_attachments = files
self.tokens = tokenize_note(self.note)
def get_tokens(self) -> list[cross.Token]:
···
date = self.note.get('createdAt')
return date or super().get_post_date_iso()
-
def get_attachments(self) -> list[cross.MediaAttachment]:
+
def get_attachments(self) -> list[media_util.MediaInfo]:
return self.media_attachments
def get_id(self) -> str:
···
def is_sensitive(self) -> bool:
return self.sensitive
-
class MisskeyAttachment(cross.MediaAttachment):
-
def __init__(self, attachment: dict) -> None:
-
super().__init__()
-
self.attachment = attachment
-
-
def create_meta(self, bytes: bytes) -> cross.MediaMeta:
-
# it's nort worth it
-
if get_image_common(self.attachment['type']):
-
o_meta = media_util.get_media_meta(bytes)
-
return cross.MediaMeta(o_meta['width'], o_meta['height'], o_meta.get('duration', -1))
-
return cross.MediaMeta(-1, -1, -1)
-
-
def get_url(self) -> str:
-
return self.attachment.get('url', '')
-
-
def get_type(self) -> str | None:
-
return get_image_common(self.attachment['type'])
-
-
def get_alt(self) -> str:
-
return self.attachment.get('comment') or ''
-
class MisskeyInput(cross.Input):
def __init__(self, settings: dict, db: cross.DataBaseWorker) -> None:
self.options = settings.get('options', {})
···
return
LOGGER.info("Crossposting '%s'...", note['id'])
-
cross_post = MisskeyPost(note)
+
+
media_attachments: list[media_util.MediaInfo] = []
+
for attachment in note.get('files', []):
+
LOGGER.info("Downloading %s...", attachment['url'])
+
info = media_util.download_media(attachment['url'], attachment.get('comment') or '')
+
if not info:
+
LOGGER.error("Skipping '%s'. Failed to download media!", note['id'])
+
return
+
media_attachments.append(info)
+
+
cross_post = MisskeyPost(note, media_attachments)
for output in outputs:
output.accept_post(cross_post)
-122
util.py
···
-
import re
-
import cross
import logging, sys, os
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOGGER = logging.getLogger("XPost")
-
-
ALTERNATE = re.compile(r'\S+|\s+')
def canonical_label(label: str | None, href: str):
if not label or label == href:
···
return True
return False
-
-
def split_tokens(tokens: list[cross.Token], max_chars: int, max_link_len: int = 35) -> list[list[cross.Token]]:
-
def start_new_block():
-
nonlocal current_block, blocks, current_length
-
if current_block:
-
blocks.append(current_block)
-
current_block = []
-
current_length = 0
-
-
def append_text_to_block(text_segment):
-
nonlocal current_block
-
# if the last element in the current block is also text, just append to it
-
if current_block and isinstance(current_block[-1], cross.TextToken):
-
current_block[-1].text += text_segment
-
else:
-
current_block.append(cross.TextToken(text_segment))
-
-
blocks: list[list[cross.Token]] = []
-
current_block: list[cross.Token] = []
-
current_length: int = 0
-
-
for token in tokens:
-
if isinstance(token, cross.TextToken):
-
# split content into alternating “words” (\S+) and “whitespace” (\s+).
-
# this ensures every space/newline is treated as its own segment.
-
segments: list[str] = ALTERNATE.findall(token.text)
-
-
for seg in segments:
-
if seg.isspace():
-
# whitespace segment: we count it, and if it doesn't fully fit,
-
# split the whitespace across blocks to preserve exact spacing.
-
seg_len: int = len(seg)
-
while seg_len > 0:
-
space_left = max_chars - current_length
-
if space_left == 0:
-
start_new_block()
-
continue
-
-
take = min(space_left, seg_len)
-
part = seg[:take]
-
append_text_to_block(part)
-
-
current_length += len(part)
-
seg = seg[take:]
-
seg_len -= take
-
-
if current_length == max_chars:
-
start_new_block()
-
-
else:
-
# seg is a “word” (no whitespace inside).
-
word: str = seg
-
wlen: int = len(word)
-
-
# if the word itself is longer than n, we must split it with hyphens.
-
if wlen > max_chars:
-
# first, if we're in the middle of a block, close it & start fresh.
-
if current_length > 0:
-
start_new_block()
-
-
remaining = word
-
# carve off (n-1)-sized chunks + “-” so each chunk is n chars.
-
while len(remaining) > (max_chars - 1):
-
chunk = remaining[: max_chars - 1] + '-'
-
append_text_to_block(chunk)
-
# that chunk fills the current block
-
start_new_block()
-
remaining = remaining[max_chars - 1 :]
-
-
# now whatever remains is ≤ n characters
-
if remaining:
-
append_text_to_block(remaining)
-
current_length = len(remaining)
-
-
else:
-
# word fits fully within a block (≤ n).
-
if current_length + wlen <= max_chars:
-
append_text_to_block(word)
-
current_length += wlen
-
else:
-
# not enough space in current block → start a new one
-
start_new_block()
-
append_text_to_block(word)
-
current_length = wlen
-
-
elif isinstance(token, cross.LinkToken):
-
link_len = len(token.label)
-
if canonical_label(token.label, token.href):
-
link_len = min(link_len, max_link_len)
-
-
if current_length + link_len <= max_chars:
-
current_block.append(token)
-
current_length += link_len
-
else:
-
start_new_block()
-
current_block.append(token)
-
current_length = link_len
-
-
elif isinstance(token, cross.TagToken):
-
# we treat a hashtag like “#tagname” for counting.
-
hashtag_len = 1 + len(token.tag)
-
if current_length + hashtag_len <= max_chars:
-
current_block.append(token)
-
current_length += hashtag_len
-
else:
-
start_new_block()
-
current_block.append(token)
-
current_length = hashtag_len
-
-
else:
-
# if you happen to have other types, just append them without affecting length.
-
current_block.append(token)
-
-
# append any remaining tokens as the final block
-
if current_block:
-
blocks.append(current_block)
-
-
return blocks
def safe_get(obj: dict, key: str, default):
val = obj.get(key, default)