util.py at afeeef7ab59c26df88bef6d008d5ef2d4d657a60 · zenfyr.dev/xpost

zenfyr.dev / xpost
social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
xpost / util.py
at afeeef7ab59c26df88bef6d008d5ef2d4d657a60 7.7 kB view raw
  1import re
  2from markdownify import markdownify as md
  3import cross
  4import logging, sys, os
  5
  6logging.basicConfig(stream=sys.stdout, level=logging.INFO)
  7LOGGER = logging.getLogger("XPost")
  8
  9ALTERNATE = re.compile(r'\S+|\s+')
 10
 11DEFAULT_SETTINGS: dict = {
 12    'bluesky': {
 13        'quote_gate': False,
 14        'thread_gate': [
 15            'everybody'
 16        ]
 17    }
 18}
 19
 20def tokenize_html(content: str):
 21    return tokenize_markdown(md(content, autolinks=False))
 22
 23def tokenize_markdown(md) -> list[cross.Token]:
 24    tokens = []
 25    i = 0
 26    length = len(md)
 27
 28    while i < length:
 29        if md[i] == '!' and i + 1 < length and md[i + 1] == '[':
 30            # media
 31            start = i
 32            i += 2
 33            alt_text = ''
 34            while i < length and md[i] != ']':
 35                alt_text += md[i]
 36                i += 1
 37            i += 1  # skip ']
 38            if i < length and md[i] == '(':
 39                i += 1
 40                url = ''
 41                while i < length and md[i] != ')':
 42                    url += md[i]
 43                    i += 1
 44                i += 1  # skip )
 45                #tokens.append({'type': 'media', 'alt': alt_text, 'url': url})
 46            else:
 47                tokens.append(cross.TextToken(md[start:i]))
 48        elif md[i] == '[':
 49            # link or special
 50            start = i
 51            i += 1
 52            link_text = ''
 53            while i < length and md[i] != ']':
 54                link_text += md[i]
 55                i += 1
 56            i += 1  # skip ]
 57            if i < length and md[i] == '(':
 58                i += 1
 59                url = ''
 60                while i < length and md[i] != ')':
 61                    url += md[i]
 62                    i += 1
 63                i += 1  # skip )
 64                if link_text.startswith('#'):
 65                    tokens.append(cross.TagToken(link_text[1:]))
 66                elif link_text.startswith('@'):
 67                    tokens.append(cross.MentionToken(link_text[1:], url))
 68                elif link_text.startswith('http://') or link_text.startswith('https://'):
 69                    tokens.append(cross.LinkToken(url, link_text))
 70                else:
 71                    tokens.append(cross.LinkToken(url, link_text))
 72            else:
 73                tokens.append(cross.TextToken(md[start:i]))
 74        else:
 75            # plain text
 76            start = i
 77            while i < length and md[i] != '[' and not (md[i] == '!' and i + 1 < length and md[i + 1] == '['):
 78                i += 1
 79            tokens.append(cross.TextToken(md[start:i]))
 80    return tokens
 81
 82
 83def split_tokens(tokens: list[cross.Token], max_chars: int) -> list[list[cross.Token]]:
 84    def start_new_block():
 85        nonlocal current_block, blocks, current_length
 86        if current_block:
 87            blocks.append(current_block)
 88        current_block = []
 89        current_length = 0
 90
 91    def append_text_to_block(text_segment):
 92        nonlocal current_block
 93        # if the last element in the current block is also text, just append to it
 94        if current_block and isinstance(current_block[-1], cross.TextToken):
 95            current_block[-1].text += text_segment
 96        else:
 97            current_block.append(cross.TextToken(text_segment))
 98    
 99    blocks: list[list[cross.Token]] = []
100    current_block: list[cross.Token] = []
101    current_length: int = 0
102
103    for token in tokens:
104        if isinstance(token, cross.TextToken):
105            # split content into alternating “words” (\S+) and “whitespace” (\s+).
106            # this ensures every space/newline is treated as its own segment.
107            segments: list[str] = ALTERNATE.findall(token.text)
108
109            for seg in segments:
110                if seg.isspace():
111                    # whitespace segment: we count it, and if it doesn't fully fit,
112                    # split the whitespace across blocks to preserve exact spacing.
113                    seg_len: int = len(seg)
114                    while seg_len > 0:
115                        space_left = max_chars - current_length
116                        if space_left == 0:
117                            start_new_block()
118                            continue
119
120                        take = min(space_left, seg_len)
121                        part = seg[:take]
122                        append_text_to_block(part)
123
124                        current_length += len(part)
125                        seg = seg[take:]
126                        seg_len -= take
127
128                        if current_length == max_chars:
129                            start_new_block()
130
131                else:
132                    # seg is a “word” (no whitespace inside).
133                    word: str = seg
134                    wlen: int = len(word)
135
136                    # if the word itself is longer than n, we must split it with hyphens.
137                    if wlen > max_chars:
138                        # first, if we're in the middle of a block, close it & start fresh.
139                        if current_length > 0:
140                            start_new_block()
141
142                        remaining = word
143                        # carve off (n-1)-sized chunks + “-” so each chunk is n chars.
144                        while len(remaining) > (max_chars - 1):
145                            chunk = remaining[: max_chars - 1] + '-'
146                            append_text_to_block(chunk)
147                            # that chunk fills the current block
148                            start_new_block()
149                            remaining = remaining[max_chars - 1 :]
150
151                        # now whatever remains is ≤ n characters
152                        if remaining:
153                            append_text_to_block(remaining)
154                            current_length = len(remaining)
155
156                    else:
157                        # word fits fully within a block (≤ n).
158                        if current_length + wlen <= max_chars:
159                            append_text_to_block(word)
160                            current_length += wlen
161                        else:
162                            # not enough space in current block → start a new one
163                            start_new_block()
164                            append_text_to_block(word)
165                            current_length = wlen
166
167        elif isinstance(token, cross.LinkToken):
168            link_len = min(len(token.label), 35)
169
170            if current_length + link_len <= max_chars:
171                current_block.append(token)
172                current_length += link_len
173            else:
174                start_new_block()
175                current_block.append(token)
176                current_length = link_len
177
178        elif isinstance(token, cross.TagToken):
179            # we treat a hashtag like “#tagname” for counting.
180            hashtag_len = 1 + len(token.tag)
181            if current_length + hashtag_len <= max_chars:
182                current_block.append(token)
183                current_length += hashtag_len
184            else:
185                start_new_block()
186                current_block.append(token)
187                current_length = hashtag_len
188
189        else:
190            # if you happen to have other types, just append them without affecting length.
191            current_block.append(token)
192
193    # append any remaining tokens as the final block
194    if current_block:
195        blocks.append(current_block)
196
197    return blocks
198
def safe_get(obj: dict, key: str, default):
    """Return ``obj[key]`` unless it is missing or falsy, else *default*.

    Any falsy stored value (``None``, ``0``, ``''``, ``False``, empty
    containers) is treated the same as a missing key.
    """
    return obj.get(key, default) or default
202
def value_or_envvar(text: str) -> str:
    """Resolve an ``env:NAME`` reference, or return *text* unchanged.

    When *text* starts with ``env:``, the remainder is used as the name of an
    environment variable and that variable's value is returned (an empty
    string when it is not set). Any other text is returned as is.
    """
    prefix = 'env:'
    if not text.startswith(prefix):
        return text
    return os.environ.get(text.removeprefix(prefix), '')
207
def get_or_envvar(obj: dict, key: str):
    """Look up *key* in *obj* and resolve the result with ``value_or_envvar``.

    A missing key is treated as an empty string before resolution.
    """
    raw_value = obj.get(key, '')
    return value_or_envvar(raw_value)