···
1
+
"""Zulip bot for automatically posting thicket feed updates."""
8
+
from datetime import datetime
9
+
from pathlib import Path
10
+
from typing import Any, Dict, List, Optional, Set, Tuple
12
+
from zulip_bots.lib import BotHandler
14
+
# Handle imports for both direct execution and package import
16
+
from ..core.git_store import GitStore
17
+
from ..models import AtomEntry, ThicketConfig
18
+
from ..cli.commands.sync import sync_feed
20
+
# When run directly by zulip-bots, add the package to path
22
+
src_dir = Path(__file__).parent.parent.parent
23
+
if str(src_dir) not in sys.path:
24
+
sys.path.insert(0, str(src_dir))
26
+
from thicket.core.git_store import GitStore
27
+
from thicket.models import AtomEntry, ThicketConfig
28
+
from thicket.cli.commands.sync import sync_feed
31
+
class ThicketBotHandler:
    """Zulip bot that monitors thicket feeds and posts new articles.

    ``initialize`` reads the ``THICKET_*`` environment variables, restores the
    settings persisted in ``bot_handler.storage``, loads the thicket config and
    git store, and starts a daemon thread that calls ``_perform_sync`` every
    ``sync_interval`` seconds.  ``handle_message`` serves the ``@mention``
    commands listed by ``usage``.
    """

    # NOTE(review): points to confirm against the canonical file, because the
    # text this class was reviewed from was incomplete:
    #   * strings that still contain "๐" (and the cross marks restored as
    #     U+274C) hold mis-encoded emoji whose original code point could not
    #     be fully recovered -- restore the intended glyphs;
    #   * the blank "" separators in ``_send_status`` and the "type" keys in
    #     the ``send_message`` payloads are presumed, not observed;
    #   * ``usage`` may have had a heading line before the command list;
    #   * ``RATE_LIMIT_PAUSE_SECONDS`` is a placeholder value;
    #   * the debug-mode half of ``can_sync`` in ``_schedule_sync`` is assumed
    #     to match the check used when posting.

    MIN_SYNC_INTERVAL = 60          # smallest interval accepted by `config interval`
    CATCHUP_ENTRY_LIMIT = 5         # entries per user examined on the first run
    SUMMARY_MAX_CHARS = 400         # summaries longer than this are truncated
    RATE_LIMIT_BATCH = 5            # pause after this many posted messages
    RATE_LIMIT_PAUSE_SECONDS = 1    # NOTE(review): original pause length unknown

    def __init__(self) -> None:
        """Initialize the thicket bot with unconfigured defaults."""
        self.logger = logging.getLogger(__name__)
        self.git_store: Optional[GitStore] = None
        self.config: Optional[ThicketConfig] = None
        # Keys are "<username>:<entry.id>" for every entry already posted.
        self.posted_entries: Set[str] = set()

        # Bot configuration from storage
        self.stream_name: Optional[str] = None
        self.topic_name: Optional[str] = None
        self.sync_interval: int = 300  # 5 minutes default
        self.max_entries_per_sync: int = 10
        self.config_path: Optional[Path] = None

        # Debug mode configuration
        self.debug_user: Optional[str] = None
        self.debug_zulip_user_id: Optional[str] = None

    def usage(self) -> str:
        """Return bot usage instructions."""
        return (
            "\n"
            "**Thicket Feed Bot**\n"
            "\n"
            "This bot automatically monitors thicket feeds and posts new articles.\n"
            "\n"
            "- `@mention status` - Show current bot status and configuration\n"
            "- `@mention sync now` - Force an immediate sync\n"
            "- `@mention reset` - Clear posting history (will repost recent entries)\n"
            "- `@mention config stream <stream_name>` - Set target stream\n"
            "- `@mention config topic <topic_name>` - Set target topic\n"
            "- `@mention config interval <seconds>` - Set sync interval\n"
            "- `@mention help` - Show this help message\n"
        )

    def initialize(self, bot_handler: BotHandler) -> None:
        """Initialize the bot with persistent storage and start the sync loop.

        Failures while loading thicket are logged, not raised, so the bot
        still starts and can answer ``status``/``config`` commands.
        """
        self.logger.info("Initializing ThicketBot")

        # Get configuration from environment (set by CLI)
        self.debug_user = os.getenv("THICKET_DEBUG_USER")
        config_path_env = os.getenv("THICKET_CONFIG_PATH")
        if config_path_env:
            self.config_path = Path(config_path_env)
            self.logger.info(f"Using thicket config: {self.config_path}")

        # Load bot configuration from persistent storage
        self._load_bot_config(bot_handler)

        # Initialize thicket components
        if self.config_path:
            try:
                self._initialize_thicket()
                self._load_posted_entries(bot_handler)
                # No-op unless debug mode is enabled.
                self._validate_debug_mode(bot_handler)
            except Exception as e:
                self.logger.error(f"Failed to initialize thicket: {e}")

        # Start background sync loop
        self._schedule_sync(bot_handler)

    def handle_message(self, message: Dict[str, Any], bot_handler: BotHandler) -> None:
        """Handle incoming Zulip messages.

        Messages that do not mention the bot are ignored; a bare mention
        gets the help text.  Any exception raised by a command is logged and
        reported back to the sender.
        """
        content = message["content"].strip()
        sender = message["sender_full_name"]

        # Only respond to mentions
        if not self._is_mentioned(content, bot_handler):
            return

        command_parts = self._clean_mention(content, bot_handler).split()
        if not command_parts:
            self._send_help(message, bot_handler)
            return

        command = command_parts[0].lower()
        try:
            if command == "help":
                self._send_help(message, bot_handler)
            elif command == "status":
                self._send_status(message, bot_handler, sender)
            elif command == "sync" and len(command_parts) > 1 and command_parts[1] == "now":
                self._handle_force_sync(message, bot_handler, sender)
            elif command == "reset":
                self._handle_reset_command(message, bot_handler, sender)
            elif command == "config":
                self._handle_config_command(message, bot_handler, command_parts[1:], sender)
            else:
                bot_handler.send_reply(message, f"Unknown command: {command}. Type `@mention help` for usage.")
        except Exception as e:
            self.logger.error(f"Error handling command '{command}': {e}")
            bot_handler.send_reply(message, f"Error processing command: {str(e)}")

    def _is_mentioned(self, content: str, bot_handler: BotHandler) -> bool:
        """Check if the bot is mentioned in the message.

        Uses the bot's own full name from ``get_profile()``; if that lookup
        fails or yields no name, falls back to looking for ``@thicket``.
        """
        lowered = content.lower()
        try:
            # Get bot's actual name from Zulip
            bot_info = bot_handler._client.get_profile()
            if bot_info.get('result') == 'success':
                bot_name = bot_info.get('full_name', '').lower()
                if bot_name:
                    return f"@{bot_name}" in lowered or f"@**{bot_name}**" in lowered
        except Exception as e:
            self.logger.debug(f"Could not get bot profile: {e}")

        # Fallback to generic check
        return "@thicket" in lowered

    def _clean_mention(self, content: str, bot_handler: BotHandler) -> str:
        """Remove bot mention from message content."""
        import re

        try:
            # Get bot's actual name from Zulip
            bot_info = bot_handler._client.get_profile()
            if bot_info.get('result') == 'success':
                bot_name = bot_info.get('full_name', '')
                if bot_name:
                    # Remove @bot_name or @**bot_name**
                    escaped_name = re.escape(bot_name)
                    content = re.sub(rf'@(?:\*\*)?{escaped_name}(?:\*\*)?', '', content, flags=re.IGNORECASE).strip()
        except Exception as e:
            self.logger.debug(f"Could not get bot profile for mention cleaning: {e}")

        # Fallback to removing @thicket (applied even when the name above matched)
        content = re.sub(r'@(?:\*\*)?thicket(?:\*\*)?', '', content, flags=re.IGNORECASE).strip()
        return content

    def _send_help(self, message: Dict[str, Any], bot_handler: BotHandler) -> None:
        """Send help message."""
        bot_handler.send_reply(message, self.usage())

    def _send_status(self, message: Dict[str, Any], bot_handler: BotHandler, sender: str) -> None:
        """Send bot status information as a single multi-line reply."""
        status_lines = [
            f"**Thicket Bot Status** (requested by {sender})",
            "",
        ]

        # Debug mode replaces the stream/topic destination.
        if self.debug_user:
            status_lines.extend([
                "๐ **Debug Mode:** ENABLED",
                f"🎯 **Debug User:** {self.debug_user}",
                "",
            ])
        else:
            status_lines.extend([
                f"๐ **Stream:** {self.stream_name or 'Not configured'}",
                f"๐ **Topic:** {self.topic_name or 'Not configured'}",
                "",
            ])

        catchup = 'Active (first run)' if not self.posted_entries else 'Inactive'
        status_lines.extend([
            f"⏱️ **Sync Interval:** {self.sync_interval}s ({self.sync_interval // 60}m {self.sync_interval % 60}s)",
            f"๐ **Max Entries/Sync:** {self.max_entries_per_sync}",
            f"๐ **Config Path:** {self.config_path or 'Not configured'}",
            "",
            f"๐ **Tracked Entries:** {len(self.posted_entries)}",
            f"๐ **Catchup Mode:** {catchup}",
            f"✅ **Thicket Initialized:** {'Yes' if self.git_store else 'No'}",
        ])

        bot_handler.send_reply(message, "\n".join(status_lines))

    def _handle_force_sync(self, message: Dict[str, Any], bot_handler: BotHandler, sender: str) -> None:
        """Handle immediate sync request."""
        if not self._check_initialization(message, bot_handler):
            return

        bot_handler.send_reply(message, f"๐ Starting immediate sync... (requested by {sender})")

        try:
            new_entries = self._perform_sync(bot_handler)
            bot_handler.send_reply(
                message,
                f"✅ Sync completed! Found {len(new_entries)} new entries."
            )
        except Exception as e:
            self.logger.error(f"Force sync failed: {e}")
            bot_handler.send_reply(message, f"❌ Sync failed: {str(e)}")

    def _handle_reset_command(self, message: Dict[str, Any], bot_handler: BotHandler, sender: str) -> None:
        """Handle reset command to clear posted entries tracking."""
        try:
            self.posted_entries.clear()
            self._save_posted_entries(bot_handler)
            bot_handler.send_reply(
                message,
                f"✅ Posting history reset! Recent entries will be posted on next sync. (requested by {sender})"
            )
            self.logger.info(f"Posted entries tracking reset by {sender}")
        except Exception as e:
            self.logger.error(f"Reset failed: {e}")
            bot_handler.send_reply(message, f"❌ Reset failed: {str(e)}")

    def _handle_config_command(
        self,
        message: Dict[str, Any],
        bot_handler: BotHandler,
        args: List[str],
        sender: str,
    ) -> None:
        """Handle configuration commands.

        ``args`` is the command line after ``config``: a setting name
        (``stream``, ``topic`` or ``interval``) followed by its value, which
        may contain spaces.
        """
        if len(args) < 2:
            bot_handler.send_reply(message, "Usage: `@mention config <setting> <value>`")
            return

        setting = args[0].lower()
        value = " ".join(args[1:])

        if setting == "stream":
            self.stream_name = value
            self._save_bot_config(bot_handler)
            bot_handler.send_reply(message, f"✅ Stream set to: **{value}** (by {sender})")

        elif setting == "topic":
            self.topic_name = value
            self._save_bot_config(bot_handler)
            bot_handler.send_reply(message, f"✅ Topic set to: **{value}** (by {sender})")

        elif setting == "interval":
            # Only the conversion can raise ValueError; keep the try minimal.
            try:
                interval = int(value)
            except ValueError:
                bot_handler.send_reply(message, "❌ Invalid interval value. Must be a number of seconds.")
                return
            if interval < self.MIN_SYNC_INTERVAL:
                bot_handler.send_reply(message, "❌ Interval must be at least 60 seconds")
                return
            self.sync_interval = interval
            self._save_bot_config(bot_handler)
            bot_handler.send_reply(message, f"✅ Sync interval set to: **{interval}s** (by {sender})")

        else:
            bot_handler.send_reply(
                message,
                f"❌ Unknown setting: {setting}. Available: stream, topic, interval"
            )

    def _load_bot_config(self, bot_handler: BotHandler) -> None:
        """Load bot configuration from persistent storage.

        Missing or unreadable stored config leaves the current values
        untouched.
        """
        try:
            config_data = bot_handler.storage.get("bot_config")
            if config_data:
                config = json.loads(config_data)
                self.stream_name = config.get("stream_name")
                self.topic_name = config.get("topic_name")
                self.sync_interval = config.get("sync_interval", 300)
                self.max_entries_per_sync = config.get("max_entries_per_sync", 10)
        except Exception as e:
            # Bot config not found on first run is expected
            self.logger.debug(f"Bot config not loaded: {e}")

    def _save_bot_config(self, bot_handler: BotHandler) -> None:
        """Save bot configuration to persistent storage."""
        try:
            config_data = {
                "stream_name": self.stream_name,
                "topic_name": self.topic_name,
                "sync_interval": self.sync_interval,
                "max_entries_per_sync": self.max_entries_per_sync,
            }
            bot_handler.storage.put("bot_config", json.dumps(config_data))
        except Exception as e:
            self.logger.error(f"Error saving bot config: {e}")

    def _initialize_thicket(self) -> None:
        """Initialize thicket components from ``self.config_path``.

        Raises:
            ValueError: if the config file is missing, empty, or not a mapping.
        """
        if not self.config_path or not self.config_path.exists():
            raise ValueError("Thicket config file not found")

        # Load thicket configuration
        import yaml

        with open(self.config_path, encoding="utf-8") as f:
            config_data = yaml.safe_load(f)
        # safe_load returns None for an empty file; fail with a clear message
        # instead of a TypeError from ``**None``.
        if not isinstance(config_data, dict):
            raise ValueError("Thicket config file is empty or not a mapping")
        self.config = ThicketConfig(**config_data)

        # Initialize git store
        self.git_store = GitStore(self.config.git_store)

        self.logger.info("Thicket components initialized successfully")

    @staticmethod
    def _zulip_server_url() -> str:
        """Return ``THICKET_ZULIP_SITE_URL`` with its http(s) scheme removed ('' if unset)."""
        zulip_site_url = os.getenv("THICKET_ZULIP_SITE_URL", "")
        return zulip_site_url.replace("https://", "").replace("http://", "")

    def _validate_debug_mode(self, bot_handler: BotHandler) -> None:
        """Validate debug mode configuration and set ``debug_zulip_user_id``.

        Does nothing unless both a debug user and a git store are present.

        Raises:
            ValueError: if the debug user is unknown to thicket, the server
                URL is unset, or the user has no Zulip association for it.
        """
        if not self.debug_user or not self.git_store:
            return

        # Get current Zulip server from environment
        server_url = self._zulip_server_url()

        # Check if debug user exists in thicket
        user = self.git_store.get_user(self.debug_user)
        if not user:
            raise ValueError(f"Debug user '{self.debug_user}' not found in thicket")

        # Check if user has Zulip association for this server
        if not server_url:
            raise ValueError("Could not determine Zulip server URL")

        zulip_user_id = user.get_zulip_mention(server_url)
        if not zulip_user_id:
            raise ValueError(f"User '{self.debug_user}' has no Zulip association for server '{server_url}'")

        # Try to look up the actual Zulip user ID from the email address
        # But don't fail if we can't - we'll try again when sending messages
        actual_user_id = self._lookup_zulip_user_id(bot_handler, zulip_user_id)
        if actual_user_id and actual_user_id != zulip_user_id:
            # Successfully resolved to numeric ID
            self.debug_zulip_user_id = actual_user_id
            self.logger.info(f"Debug mode enabled: Will send DMs to {self.debug_user} (email: {zulip_user_id}, user_id: {actual_user_id}) on {server_url}")
        else:
            # Keep the email address, will resolve later when sending
            self.debug_zulip_user_id = zulip_user_id
            self.logger.info(f"Debug mode enabled: Will send DMs to {self.debug_user} ({zulip_user_id}) on {server_url} (will resolve user ID when sending)")

    def _lookup_zulip_user_id(self, bot_handler: BotHandler, email_or_id: str) -> Optional[str]:
        """Look up Zulip user ID from email address or return the ID if it's already numeric.

        Tries ``get_user_by_email`` first, then scans ``get_users()``.
        Returns ``None`` (after logging) when nothing matches or a call fails.
        """
        # If it's already a numeric user ID, return it
        if email_or_id.isdigit():
            return email_or_id

        try:
            client = bot_handler._client
            if not client:
                self.logger.error("No Zulip client available for user lookup")
                return None

            # First try the get_user_by_email API if available
            try:
                user_result = client.get_user_by_email(email_or_id)
                if user_result.get('result') == 'success':
                    user_id = user_result.get('user', {}).get('user_id')
                    if user_id:
                        self.logger.info(f"Found user ID {user_id} for '{email_or_id}' via get_user_by_email API")
                        return str(user_id)
            except Exception as e:
                # Method missing on this client or the call failed: use the scan below.
                self.logger.debug(f"get_user_by_email failed for '{email_or_id}': {e}")

            # Fallback: Get all users and search through them
            users_result = client.get_users()
            if users_result.get('result') != 'success':
                self.logger.error(f"Failed to get users: {users_result.get('msg', 'Unknown error')}")
                return None

            for user in users_result['members']:
                if email_or_id in (
                    user.get('email', ''),
                    user.get('delivery_email', ''),
                    str(user.get('user_id')),
                ):
                    return str(user.get('user_id'))

            self.logger.error(f"No user found with identifier '{email_or_id}'. Searched {len(users_result['members'])} users.")
            return None

        except Exception as e:
            self.logger.error(f"Error looking up user ID for '{email_or_id}': {e}")
            return None

    def _lookup_zulip_user_info(self, bot_handler: BotHandler, email_or_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Look up both Zulip user ID and full name from email address.

        Returns ``(user_id, full_name)``; ``(email_or_id, None)`` for an
        already-numeric input and ``(None, None)`` when nothing is found.
        """
        if email_or_id.isdigit():
            return email_or_id, None

        try:
            client = bot_handler._client
            if not client:
                return None, None

            # Try get_user_by_email API first
            try:
                user_result = client.get_user_by_email(email_or_id)
                if user_result.get('result') == 'success':
                    user_data = user_result.get('user', {})
                    user_id = user_data.get('user_id')
                    if user_id:
                        return str(user_id), user_data.get('full_name', '')
            except Exception as e:
                # Same policy as _lookup_zulip_user_id: any failure here
                # falls through to the directory scan.
                self.logger.debug(f"get_user_by_email failed for '{email_or_id}': {e}")

            # Fallback: search all users
            users_result = client.get_users()
            if users_result.get('result') == 'success':
                for user in users_result['members']:
                    if email_or_id in (user.get('email'), user.get('delivery_email')):
                        return str(user.get('user_id')), user.get('full_name', '')

            return None, None

        except Exception as e:
            self.logger.error(f"Error looking up user info for '{email_or_id}': {e}")
            return None, None

    def _load_posted_entries(self, bot_handler: BotHandler) -> None:
        """Load the set of already posted entries."""
        try:
            posted_data = bot_handler.storage.get("posted_entries")
            if posted_data:
                self.posted_entries = set(json.loads(posted_data))
        except Exception:
            # Empty set on first run is expected
            self.posted_entries = set()

    def _save_posted_entries(self, bot_handler: BotHandler) -> None:
        """Save the set of posted entries."""
        try:
            bot_handler.storage.put("posted_entries", json.dumps(list(self.posted_entries)))
        except Exception as e:
            self.logger.error(f"Error saving posted entries: {e}")

    def _check_initialization(self, message: Dict[str, Any], bot_handler: BotHandler) -> bool:
        """Check if thicket is properly initialized, replying with the reason when it is not."""
        if not self.git_store or not self.config:
            bot_handler.send_reply(
                message,
                "❌ Thicket not initialized. Please check configuration."
            )
            return False

        # In debug mode, we don't need stream/topic configuration
        if self.debug_user:
            return True

        if not self.stream_name or not self.topic_name:
            bot_handler.send_reply(
                message,
                "❌ Stream and topic must be configured first. Use `@mention config stream <name>` and `@mention config topic <name>`"
            )
            return False

        return True

    def _schedule_sync(self, bot_handler: BotHandler) -> None:
        """Schedule periodic sync operations on a daemon thread."""

        def sync_loop() -> None:
            while True:
                try:
                    # Check if we can sync: a git store plus a destination.
                    can_sync = (self.git_store and
                                ((self.stream_name and self.topic_name) or
                                 (self.debug_user and self.debug_zulip_user_id)))
                    if can_sync:
                        self._perform_sync(bot_handler)

                    time.sleep(self.sync_interval)
                except Exception as e:
                    self.logger.error(f"Error in sync loop: {e}")
                    time.sleep(60)  # Wait before retrying

        # Start background thread
        import threading

        sync_thread = threading.Thread(target=sync_loop, daemon=True)
        sync_thread.start()

    def _sync_one_feed(self, username: str, feed_url: str) -> int:
        """Run ``sync_feed`` for one feed on a private event loop; return its new-entry count."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            new_count, _ = loop.run_until_complete(
                sync_feed(self.git_store, username, feed_url, dry_run=False)
            )
        finally:
            # Always release the loop, including when sync_feed raises.
            loop.close()
        return new_count

    def _collect_new_entries(self, is_first_run: bool) -> List[Tuple[AtomEntry, str]]:
        """Sync feeds and gather up to ``max_entries_per_sync`` unposted ``(entry, username)`` pairs."""
        cap = self.max_entries_per_sync
        collected: List[Tuple[AtomEntry, str]] = []
        # list_entries is per user while the loop is per feed, so the same
        # entry can come back more than once in a run; queue each key once.
        queued: Set[str] = set()

        for username, feed_urls in self.git_store.list_all_users_with_feeds():
            for feed_url in feed_urls:
                # Stop syncing once the cap is reached; feeds left unsynced
                # are picked up, with their new entries, on the next run.
                if len(collected) >= cap:
                    return collected
                try:
                    new_count = self._sync_one_feed(username, str(feed_url))

                    entries_to_check = []
                    if new_count > 0:
                        # Get the newly added entries
                        entries_to_check = self.git_store.list_entries(username, limit=new_count)
                    if is_first_run and not entries_to_check:
                        # Catchup mode: get last 5 entries on first run
                        entries_to_check = self.git_store.list_entries(username, limit=self.CATCHUP_ENTRY_LIMIT)

                    for entry in entries_to_check:
                        if len(collected) >= cap:
                            break
                        entry_key = f"{username}:{entry.id}"
                        if entry_key not in self.posted_entries and entry_key not in queued:
                            queued.add(entry_key)
                            collected.append((entry, username))
                except Exception as e:
                    self.logger.error(f"Error syncing feed {feed_url} for user {username}: {e}")

        return collected

    def _perform_sync(self, bot_handler: BotHandler) -> List[AtomEntry]:
        """Perform thicket sync and return new entries.

        The returned list holds every entry selected for posting in this
        run.  Only entries whose delivery succeeded are added to
        ``posted_entries``.
        """
        if not self.config or not self.git_store:
            return []

        is_first_run = not self.posted_entries
        new_entries = self._collect_new_entries(is_first_run)

        # Post new entries to Zulip with rate limiting
        posted_count = 0
        last_index = len(new_entries) - 1
        for i, (entry, username) in enumerate(new_entries):
            if not self._post_entry_to_zulip(entry, bot_handler, username):
                continue  # not delivered: leave it unrecorded
            self.posted_entries.add(f"{username}:{entry.id}")
            posted_count += 1

            # Rate limiting: pause after every 5 messages
            if posted_count % self.RATE_LIMIT_BATCH == 0 and i < last_index:
                time.sleep(self.RATE_LIMIT_PAUSE_SECONDS)

        if posted_count:
            self._save_posted_entries(bot_handler)

        return [entry for entry, _ in new_entries]

    def _format_entry_message(self, entry: AtomEntry, bot_handler: BotHandler, username: str) -> str:
        """Build the Zulip message body (attribution header, title, link, summary) for one entry."""
        server_url = self._zulip_server_url()
        user = self.git_store.get_user(username) if self.git_store else None

        # Prefer an @mention using the user's Zulip full name when the
        # thicket user is associated with this server.
        mention_name = None
        if server_url and user:
            zulip_user_id = user.get_zulip_mention(server_url)
            if zulip_user_id:
                _, zulip_full_name = self._lookup_zulip_user_info(bot_handler, zulip_user_id)
                mention_name = zulip_full_name or user.display_name or username

        if mention_name:
            display_name = mention_name
            who = f"@**{display_name}**"
        else:
            # If no Zulip user found, use consistent format without @mention.
            # `or username` also covers a user whose display_name is unset.
            display_name = (user.display_name if user else None) or username
            who = f"**{display_name}**"

        # Check if author is different from the user - avoid redundancy
        author_name = entry.author and entry.author.get("name")
        author_info = ""
        if author_name and author_name.lower() != display_name.lower():
            author_info = f" (by {author_name})"

        published_info = ""
        if entry.published:
            published_info = f" • {entry.published.strftime('%Y-%m-%d')}"

        header = f"{who} posted{author_info}{published_info}:\n\n"

        # Format the message with HTML processing
        message_lines = [
            f"**{entry.title}**",
            f"๐ {entry.link}",
        ]
        if entry.summary:
            # Process HTML in summary and truncate if needed
            processed_summary = self._process_html_content(entry.summary)
            if len(processed_summary) > self.SUMMARY_MAX_CHARS:
                processed_summary = processed_summary[:self.SUMMARY_MAX_CHARS - 3] + "..."
            message_lines.append(f"\n{processed_summary}")

        return header + "\n".join(message_lines)

    def _send_debug_dm(self, entry: AtomEntry, bot_handler: BotHandler, username: str, message_content: str) -> bool:
        """Send one entry as a private message to the debug user; return True on success."""
        debug_message = f"๐ **DEBUG:** New article from thicket user `{username}`:\n\n{message_content}"

        # Ensure we have the numeric user ID
        user_id_to_use = self.debug_zulip_user_id
        if not user_id_to_use.isdigit():
            resolved_id = self._lookup_zulip_user_id(bot_handler, user_id_to_use)
            if not resolved_id:
                self.logger.error(f"Could not resolve user ID for {self.debug_zulip_user_id}")
                return False
            user_id_to_use = resolved_id
            self.logger.debug(f"Resolved {self.debug_zulip_user_id} to user ID {user_id_to_use}")

        # For private messages the recipient should be an integer user ID;
        # if the value is not numeric, address the message by email instead.
        try:
            recipient = int(user_id_to_use)
        except ValueError:
            recipient = user_id_to_use

        try:
            bot_handler.send_message({
                "type": "private",
                "to": [recipient],
                "content": debug_message,
            })
        except Exception as e:
            self.logger.error(f"Failed to send DM to {self.debug_user} ({user_id_to_use}): {e}")
            return False

        self.logger.info(f"Posted entry to debug user {self.debug_user}: {entry.title}")
        return True

    def _post_entry_to_zulip(self, entry: AtomEntry, bot_handler: BotHandler, username: str) -> bool:
        """Post a single entry to the configured Zulip stream/topic or debug user DM.

        Never raises: failures are logged.  Returns True when the message
        was handed to ``bot_handler.send_message`` without an exception,
        False otherwise, so the caller can avoid recording undelivered
        entries as posted.
        """
        try:
            message_content = self._format_entry_message(entry, bot_handler, username)

            # Choose destination based on mode
            if self.debug_user and self.debug_zulip_user_id:
                return self._send_debug_dm(entry, bot_handler, username, message_content)

            # Normal mode: send to stream/topic
            bot_handler.send_message({
                "type": "stream",
                "to": self.stream_name,
                "subject": self.topic_name,
                "content": message_content,
            })
            self.logger.info(f"Posted entry to stream: {entry.title} (user: {username})")
            return True

        except Exception as e:
            self.logger.error(f"Error posting entry to Zulip: {e}")
            return False

    def _process_html_content(self, html_content: str) -> str:
        """Process HTML content from feeds to clean Zulip-compatible markdown.

        Uses ``markdownify`` when it is installed; otherwise a regex-based
        conversion.  Either way the result is flattened onto a single line
        so it stays compact as a summary.  Returns '' for empty input.
        """
        if not html_content:
            return ""

        import re

        try:
            # Try to use markdownify for proper HTML to Markdown conversion
            from markdownify import markdownify as md

            # Convert HTML to Markdown with compact settings for summaries
            markdown = md(
                html_content,
                heading_style="ATX",  # Use # for headings (but we'll post-process these)
                bullets="-",  # Use - for bullets
                convert=['a', 'b', 'strong', 'i', 'em', 'code', 'pre', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            ).strip()

            # Post-process to convert headings to bold for compact summaries
            # Convert markdown headers to bold with period
            markdown = re.sub(r'^#{1,6}\s*(.+)$', r'**\1.**', markdown, flags=re.MULTILINE)

            # Clean up excessive newlines and make more compact
            markdown = re.sub(r'\n\s*\n\s*\n+', ' ', markdown)  # Multiple newlines become space
            markdown = re.sub(r'\n\s*\n', '. ', markdown)  # Double newlines become sentence breaks
            markdown = re.sub(r'\n', ' ', markdown)  # Single newlines become spaces

            # Clean up double periods and excessive whitespace
            markdown = re.sub(r'\.\.+', '.', markdown)
            markdown = re.sub(r'\s+', ' ', markdown)
            return markdown.strip()

        except ImportError:
            # Fallback: manual HTML processing
            import html

            content = html_content

            # Convert headings to bold with periods for compact summaries
            content = re.sub(r'<h[1-6](?:\s[^>]*)?>([^<]*)</h[1-6]>', r'**\1.** ', content, flags=re.IGNORECASE)

            # Convert common HTML elements to Markdown
            content = re.sub(r'<(?:strong|b)(?:\s[^>]*)?>([^<]*)</(?:strong|b)>', r'**\1**', content, flags=re.IGNORECASE)
            content = re.sub(r'<(?:em|i)(?:\s[^>]*)?>([^<]*)</(?:em|i)>', r'*\1*', content, flags=re.IGNORECASE)
            content = re.sub(r'<code(?:\s[^>]*)?>([^<]*)</code>', r'`\1`', content, flags=re.IGNORECASE)
            content = re.sub(r'<a(?:\s[^>]*?)?\s*href=["\']([^"\']*)["\'](?:\s[^>]*)?>([^<]*)</a>', r'[\2](\1)', content, flags=re.IGNORECASE)

            # Convert block elements to spaces instead of newlines for compactness
            content = re.sub(r'<br\s*/?>', ' ', content, flags=re.IGNORECASE)
            content = re.sub(r'</p>\s*<p>', '. ', content, flags=re.IGNORECASE)
            content = re.sub(r'</?(?:p|div)(?:\s[^>]*)?>', ' ', content, flags=re.IGNORECASE)

            # Remove remaining HTML tags
            content = re.sub(r'<[^>]+>', '', content)

            # Decode entities (&amp;, &#8217;, ...) only after the tags are
            # gone, so an escaped "&lt;b&gt;" stays literal text.
            content = html.unescape(content)

            # Clean up whitespace and make compact
            content = re.sub(r'\s+', ' ', content)  # Multiple whitespace becomes single space
            content = re.sub(r'\.\.+', '.', content)  # Multiple periods become single period
            return content.strip()

        except Exception as e:
            self.logger.error(f"Error processing HTML content: {e}")
            # Last resort: just strip HTML tags
            return re.sub(r'<[^>]+>', '', html_content).strip()
754
+
# Module-level alias for the bot class.
# NOTE(review): presumably the name the zulip_bots runner looks up when it
# loads this module -- confirm before renaming.
handler_class = ThicketBotHandler