import logging
import os
import re
import sys

from markdownify import markdownify as md

import cross

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOGGER = logging.getLogger("XPost")

# Alternating "word" (\S+) / "whitespace" (\s+) segments.  Used by
# split_tokens so every run of spaces/newlines is its own segment and exact
# spacing can be preserved across block boundaries.
ALTERNATE = re.compile(r'\S+|\s+')

DEFAULT_SETTINGS: dict = {
    'bluesky': {
        'quote_gate': False,
        'thread_gate': ['everybody'],
    }
}


def tokenize_html(content: str) -> list[cross.Token]:
    """Tokenize an HTML fragment by first converting it to Markdown."""
    return tokenize_markdown(md(content, autolinks=False))


def tokenize_markdown(md) -> list[cross.Token]:
    """Parse a Markdown string into a flat list of ``cross.Token`` objects.

    Recognized constructs:

    * ``![alt](url)`` -- media image syntax.  The alt text and URL are
      parsed but NO token is emitted (media is currently discarded; there
      is no media token type in ``cross``).
    * ``[text](url)`` -- emits ``TagToken`` when *text* starts with ``#``,
      ``MentionToken`` when it starts with ``@``, otherwise ``LinkToken``.
    * Anything else   -- accumulated into ``TextToken`` runs.

    A ``![alt]`` or ``[text]`` that is not followed by ``(`` falls back to
    a plain ``TextToken`` of the raw source span.

    :param md: the Markdown source string (shadows the module-level
               ``markdownify`` alias on purpose; only the string is used here).
    :return: list of tokens in source order.
    """
    tokens: list[cross.Token] = []
    i = 0
    length = len(md)
    while i < length:
        if md[i] == '!' and i + 1 < length and md[i + 1] == '[':
            # --- media: ![alt](url) ---
            start = i
            i += 2
            alt_text = ''
            while i < length and md[i] != ']':
                alt_text += md[i]
                i += 1
            i += 1  # skip ']'
            if i < length and md[i] == '(':
                i += 1
                url = ''
                while i < length and md[i] != ')':
                    url += md[i]
                    i += 1
                i += 1  # skip ')'
                # NOTE: media tokens are intentionally dropped here -- the
                # original emitted nothing for a fully-parsed image.
            else:
                # Malformed (no '(' after '![alt]'): keep the raw text.
                tokens.append(cross.TextToken(md[start:i]))
        elif md[i] == '[':
            # --- link or special: [text](url) ---
            start = i
            i += 1
            link_text = ''
            while i < length and md[i] != ']':
                link_text += md[i]
                i += 1
            i += 1  # skip ']'
            if i < length and md[i] == '(':
                i += 1
                url = ''
                while i < length and md[i] != ')':
                    url += md[i]
                    i += 1
                i += 1  # skip ')'
                if link_text.startswith('#'):
                    tokens.append(cross.TagToken(link_text[1:]))
                elif link_text.startswith('@'):
                    tokens.append(cross.MentionToken(link_text[1:], url))
                else:
                    # The original had a separate http(s) branch that built
                    # the exact same token; the two branches are merged.
                    tokens.append(cross.LinkToken(url, link_text))
            else:
                # Malformed (no '(' after '[text]'): keep the raw text.
                tokens.append(cross.TextToken(md[start:i]))
        else:
            # --- plain text: consume up to the next '[' or '![' ---
            start = i
            while i < length and md[i] != '[' and not (
                md[i] == '!' and i + 1 < length and md[i + 1] == '['
            ):
                i += 1
            tokens.append(cross.TextToken(md[start:i]))
    return tokens


def split_tokens(tokens: list[cross.Token], max_chars: int) -> list[list[cross.Token]]:
    """Split a token stream into blocks of at most ``max_chars`` characters.

    Counting rules:

    * ``TextToken`` text is split on word/whitespace boundaries
      (``ALTERNATE``).  Whitespace is counted and, when it does not fit,
      split across blocks so exact spacing is preserved.  A single word
      longer than ``max_chars`` is hyphenated into ``max_chars``-sized
      chunks (each chunk is ``max_chars - 1`` characters plus ``'-'``).
    * ``LinkToken`` counts as ``min(len(label), 35)`` characters
      (platforms truncate displayed link labels).
    * ``TagToken`` counts as ``1 + len(tag)`` (the ``#`` plus the tag).
    * Any other token type is appended without affecting the count.

    Adjacent text segments within a block are merged into one ``TextToken``.

    :param tokens: tokens to pack, in order.
    :param max_chars: maximum counted characters per block (must be > 0).
    :return: list of blocks, each a list of tokens.
    """
    def start_new_block():
        # Flush the current block (if non-empty) and reset the counters.
        nonlocal current_block, blocks, current_length
        if current_block:
            blocks.append(current_block)
        current_block = []
        current_length = 0

    def append_text_to_block(text_segment):
        # Merge into a trailing TextToken when possible to avoid fragmentation.
        nonlocal current_block
        if current_block and isinstance(current_block[-1], cross.TextToken):
            current_block[-1].text += text_segment
        else:
            current_block.append(cross.TextToken(text_segment))

    blocks: list[list[cross.Token]] = []
    current_block: list[cross.Token] = []
    current_length: int = 0

    for token in tokens:
        if isinstance(token, cross.TextToken):
            # Alternating "words" (\S+) and "whitespace" (\s+) segments:
            # every space/newline run is treated as its own segment.
            segments: list[str] = ALTERNATE.findall(token.text)
            for seg in segments:
                if seg.isspace():
                    # Whitespace: count it; if it doesn't fully fit, split it
                    # across blocks to preserve exact spacing.
                    seg_len: int = len(seg)
                    while seg_len > 0:
                        space_left = max_chars - current_length
                        if space_left == 0:
                            start_new_block()
                            continue
                        take = min(space_left, seg_len)
                        part = seg[:take]
                        append_text_to_block(part)
                        current_length += len(part)
                        seg = seg[take:]
                        seg_len -= take
                        if current_length == max_chars:
                            start_new_block()
                else:
                    # seg is a "word" (no whitespace inside).
                    word: str = seg
                    wlen: int = len(word)
                    if wlen > max_chars:
                        # Word longer than a whole block: hyphenate.  First,
                        # close any partially-filled block.
                        if current_length > 0:
                            start_new_block()
                        remaining = word
                        # Carve off (max_chars-1)-sized chunks + '-' so each
                        # chunk exactly fills a block.
                        while len(remaining) > (max_chars - 1):
                            chunk = remaining[: max_chars - 1] + '-'
                            append_text_to_block(chunk)
                            start_new_block()
                            remaining = remaining[max_chars - 1:]
                        # Whatever remains is <= max_chars characters.
                        if remaining:
                            append_text_to_block(remaining)
                            current_length = len(remaining)
                    else:
                        # Word fits within one block.
                        if current_length + wlen <= max_chars:
                            append_text_to_block(word)
                            current_length += wlen
                        else:
                            # Not enough room here: start a new block.
                            start_new_block()
                            append_text_to_block(word)
                            current_length = wlen
        elif isinstance(token, cross.LinkToken):
            # Displayed link labels are capped at 35 characters for counting.
            link_len = min(len(token.label), 35)
            if current_length + link_len <= max_chars:
                current_block.append(token)
                current_length += link_len
            else:
                start_new_block()
                current_block.append(token)
                current_length = link_len
        elif isinstance(token, cross.TagToken):
            # A hashtag renders as "#tagname" for counting purposes.
            hashtag_len = 1 + len(token.tag)
            if current_length + hashtag_len <= max_chars:
                current_block.append(token)
                current_length += hashtag_len
            else:
                start_new_block()
                current_block.append(token)
                current_length = hashtag_len
        else:
            # Other token types are appended without affecting the length.
            current_block.append(token)

    # Flush any remaining tokens as the final block.
    if current_block:
        blocks.append(current_block)
    return blocks


def safe_get(obj: dict, key: str, default):
    """Return ``obj[key]`` unless it is missing OR falsy, else ``default``.

    Note: unlike ``dict.get``, a present-but-falsy value ('' / 0 / None /
    empty container) is replaced by ``default``.
    """
    val = obj.get(key, default)
    return val if val else default


def value_or_envvar(text: str) -> str:
    """Resolve ``'env:NAME'`` strings to ``os.environ['NAME']`` ('' if unset).

    Any other string is returned unchanged.
    """
    if text.startswith('env:'):
        return os.environ.get(text[4:], '')
    return text


def get_or_envvar(obj: dict, key: str):
    """Look up ``obj[key]`` ('' if missing) and resolve ``env:`` indirection."""
    return value_or_envvar(obj.get(key, ''))