Manage Atom feeds in a persistent git repository
at old-main 221 kB view raw
1This file is a merged representation of the entire codebase, combined into a single document by Repomix. 2 3<file_summary> 4This section contains a summary of this file. 5 6<purpose> 7This file contains a packed representation of the entire repository's contents. 8It is designed to be easily consumable by AI systems for analysis, code review, 9or other automated processes. 10</purpose> 11 12<file_format> 13The content is organized as follows: 141. This summary section 152. Repository information 163. Directory structure 174. Repository files (if enabled) 185. Multiple file entries, each consisting of: 19 - File path as an attribute 20 - Full contents of the file 21</file_format> 22 23<usage_guidelines> 24- This file should be treated as read-only. Any changes should be made to the 25 original repository files, not this packed version. 26- When processing this file, use the file path to distinguish 27 between different files in the repository. 28- Be aware that this file may contain sensitive information. Handle it with 29 the same level of security as you would the original repository. 30</usage_guidelines> 31 32<notes> 33- Some files may have been excluded based on .gitignore rules and Repomix's configuration 34- Binary files are not included in this packed representation. 
Please refer to the Repository Structure section for a complete list of file paths, including binary files 35- Files matching patterns in .gitignore are excluded 36- Files matching default ignore patterns are excluded 37- Files are sorted by Git change count (files with more changes are at the bottom) 38</notes> 39 40</file_summary> 41 42<directory_structure> 43.claude/ 44 settings.local.json 45src/ 46 thicket/ 47 cli/ 48 commands/ 49 __init__.py 50 add.py 51 duplicates.py 52 generate.py 53 index_cmd.py 54 info_cmd.py 55 init.py 56 links_cmd.py 57 list_cmd.py 58 sync.py 59 __init__.py 60 main.py 61 utils.py 62 core/ 63 __init__.py 64 feed_parser.py 65 git_store.py 66 reference_parser.py 67 models/ 68 __init__.py 69 config.py 70 feed.py 71 user.py 72 templates/ 73 base.html 74 index.html 75 links.html 76 script.js 77 style.css 78 timeline.html 79 users.html 80 utils/ 81 __init__.py 82 __init__.py 83 __main__.py 84.gitignore 85ARCH.md 86CLAUDE.md 87pyproject.toml 88README.md 89</directory_structure> 90 91<files> 92This section contains the contents of the repository's files. 
93 94<file path=".claude/settings.local.json"> 95{ 96 "permissions": { 97 "allow": [ 98 "Bash(find:*)", 99 "Bash(uv run:*)", 100 "Bash(grep:*)", 101 "Bash(jq:*)", 102 "Bash(git add:*)", 103 "Bash(ls:*)" 104 ] 105 }, 106 "enableAllProjectMcpServers": false 107} 108</file> 109 110<file path="src/thicket/cli/commands/generate.py"> 111"""Generate static HTML website from thicket data.""" 112 113import base64 114import json 115import re 116import shutil 117from datetime import datetime 118from pathlib import Path 119from typing import Any, Optional, TypedDict, Union 120 121import typer 122from jinja2 import Environment, FileSystemLoader, select_autoescape 123from rich.progress import Progress, SpinnerColumn, TextColumn 124 125from ...core.git_store import GitStore 126from ...models.feed import AtomEntry 127from ...models.user import GitStoreIndex, UserMetadata 128from ..main import app 129from ..utils import console, load_config 130 131 132class UserData(TypedDict): 133 """Type definition for user data structure.""" 134 135 metadata: UserMetadata 136 recent_entries: list[tuple[str, AtomEntry]] 137 138 139def safe_anchor_id(atom_id: str) -> str: 140 """Convert an Atom ID to a safe HTML anchor ID.""" 141 # Use base64 URL-safe encoding without padding 142 encoded = base64.urlsafe_b64encode(atom_id.encode('utf-8')).decode('ascii').rstrip('=') 143 # Prefix with 'id' to ensure it starts with a letter (HTML requirement) 144 return f"id{encoded}" 145 146 147class WebsiteGenerator: 148 """Generate static HTML website from thicket data.""" 149 150 def __init__(self, git_store: GitStore, output_dir: Path): 151 self.git_store = git_store 152 self.output_dir = output_dir 153 self.template_dir = Path(__file__).parent.parent.parent / "templates" 154 155 # Initialize Jinja2 environment 156 self.env = Environment( 157 loader=FileSystemLoader(self.template_dir), 158 autoescape=select_autoescape(["html", "xml"]), 159 ) 160 161 # Data containers 162 self.index: Optional[GitStoreIndex] = 
None 163 self.entries: list[tuple[str, AtomEntry]] = [] # (username, entry) 164 self.links_data: Optional[dict[str, Any]] = None 165 self.threads: list[list[dict[str, Any]]] = [] # List of threads with metadata 166 167 def get_display_name(self, username: str) -> str: 168 """Get display name for a user, falling back to username.""" 169 if self.index and username in self.index.users: 170 user = self.index.users[username] 171 return user.display_name or username 172 return username 173 174 def get_user_homepage(self, username: str) -> Optional[str]: 175 """Get homepage URL for a user.""" 176 if self.index and username in self.index.users: 177 user = self.index.users[username] 178 return str(user.homepage) if user.homepage else None 179 return None 180 181 def clean_html_summary(self, content: Optional[str], max_length: int = 200) -> str: 182 """Clean HTML content and truncate for display in timeline.""" 183 if not content: 184 return "" 185 186 # Remove HTML tags 187 clean_text = re.sub(r"<[^>]+>", " ", content) 188 # Replace multiple whitespace with single space 189 clean_text = re.sub(r"\s+", " ", clean_text) 190 # Strip leading/trailing whitespace 191 clean_text = clean_text.strip() 192 193 # Truncate with ellipsis if needed 194 if len(clean_text) > max_length: 195 # Try to break at word boundary 196 truncated = clean_text[:max_length] 197 last_space = truncated.rfind(" ") 198 if ( 199 last_space > max_length * 0.8 200 ): # If we can break reasonably close to the limit 201 clean_text = truncated[:last_space] + "..." 202 else: 203 clean_text = truncated + "..." 
204 205 return clean_text 206 207 def load_data(self) -> None: 208 """Load all data from the git repository.""" 209 with Progress( 210 SpinnerColumn(), 211 TextColumn("[progress.description]{task.description}"), 212 console=console, 213 ) as progress: 214 # Load index 215 task = progress.add_task("Loading repository index...", total=None) 216 self.index = self.git_store._load_index() 217 if not self.index: 218 raise ValueError("No index found in repository") 219 progress.update(task, completed=True) 220 221 # Load all entries 222 task = progress.add_task("Loading entries...", total=None) 223 for username, user_metadata in self.index.users.items(): 224 user_dir = self.git_store.repo_path / user_metadata.directory 225 if user_dir.exists(): 226 for entry_file in user_dir.glob("*.json"): 227 if entry_file.name not in ["index.json", "duplicates.json"]: 228 try: 229 with open(entry_file) as f: 230 entry_data = json.load(f) 231 entry = AtomEntry(**entry_data) 232 self.entries.append((username, entry)) 233 except Exception as e: 234 console.print( 235 f"[yellow]Warning: Failed to load {entry_file}: {e}[/yellow]" 236 ) 237 progress.update(task, completed=True) 238 239 # Sort entries by date (newest first) - prioritize updated over published 240 self.entries.sort( 241 key=lambda x: x[1].updated or x[1].published or datetime.min, reverse=True 242 ) 243 244 # Load links data 245 task = progress.add_task("Loading links and references...", total=None) 246 links_file = self.git_store.repo_path / "links.json" 247 if links_file.exists(): 248 with open(links_file) as f: 249 self.links_data = json.load(f) 250 progress.update(task, completed=True) 251 252 def build_threads(self) -> None: 253 """Build threaded conversations from references.""" 254 if not self.links_data or "references" not in self.links_data: 255 return 256 257 # Map entry IDs to (username, entry) tuples 258 entry_map: dict[str, tuple[str, AtomEntry]] = {} 259 for username, entry in self.entries: 260 
entry_map[entry.id] = (username, entry) 261 262 # Build adjacency lists for references 263 self.outbound_refs: dict[str, set[str]] = {} 264 self.inbound_refs: dict[str, set[str]] = {} 265 self.reference_details: dict[ 266 str, list[dict[str, Any]] 267 ] = {} # Store full reference info 268 269 for ref in self.links_data["references"]: 270 source_id = ref["source_entry_id"] 271 target_id = ref.get("target_entry_id") 272 273 if target_id and source_id in entry_map and target_id in entry_map: 274 self.outbound_refs.setdefault(source_id, set()).add(target_id) 275 self.inbound_refs.setdefault(target_id, set()).add(source_id) 276 277 # Store reference details for UI 278 self.reference_details.setdefault(source_id, []).append( 279 { 280 "target_id": target_id, 281 "target_username": ref.get("target_username"), 282 "type": "outbound", 283 } 284 ) 285 self.reference_details.setdefault(target_id, []).append( 286 { 287 "source_id": source_id, 288 "source_username": ref.get("source_username"), 289 "type": "inbound", 290 } 291 ) 292 293 # Find conversation threads (multi-post discussions) 294 processed = set() 295 296 for entry_id, (_username, _entry) in entry_map.items(): 297 if entry_id in processed: 298 continue 299 300 # Build thread starting from this entry 301 thread = [] 302 to_visit = [entry_id] 303 thread_ids = set() 304 level_map: dict[str, int] = {} # Track levels for this thread 305 306 # First, traverse up to find the root 307 current = entry_id 308 while current in self.inbound_refs: 309 parents = self.inbound_refs[current] - { 310 current 311 } # Exclude self-references 312 if not parents: 313 break 314 # Take the first parent 315 parent = next(iter(parents)) 316 if parent in thread_ids: # Avoid cycles 317 break 318 current = parent 319 to_visit.insert(0, current) 320 321 # Now traverse down from the root 322 while to_visit: 323 current = to_visit.pop(0) 324 if current in thread_ids or current not in entry_map: 325 continue 326 327 thread_ids.add(current) 328 
username, entry = entry_map[current] 329 330 # Calculate thread level 331 thread_level = self._calculate_thread_level(current, level_map) 332 333 # Add threading metadata 334 thread_entry = { 335 "username": username, 336 "display_name": self.get_display_name(username), 337 "entry": entry, 338 "entry_id": current, 339 "references_to": list(self.outbound_refs.get(current, [])), 340 "referenced_by": list(self.inbound_refs.get(current, [])), 341 "thread_level": thread_level, 342 } 343 thread.append(thread_entry) 344 processed.add(current) 345 346 # Add children 347 if current in self.outbound_refs: 348 children = self.outbound_refs[current] - thread_ids # Avoid cycles 349 to_visit.extend(sorted(children)) 350 351 if len(thread) > 1: # Only keep actual threads 352 # Sort thread by date (newest first) - prioritize updated over published 353 thread.sort(key=lambda x: x["entry"].updated or x["entry"].published or datetime.min, reverse=True) # type: ignore 354 self.threads.append(thread) 355 356 # Sort threads by the date of their most recent entry - prioritize updated over published 357 self.threads.sort( 358 key=lambda t: max( 359 item["entry"].updated or item["entry"].published or datetime.min for item in t 360 ), 361 reverse=True, 362 ) 363 364 def _calculate_thread_level( 365 self, entry_id: str, processed_entries: dict[str, int] 366 ) -> int: 367 """Calculate indentation level for threaded display.""" 368 if entry_id in processed_entries: 369 return processed_entries[entry_id] 370 371 if entry_id not in self.inbound_refs: 372 processed_entries[entry_id] = 0 373 return 0 374 375 parents_in_thread = self.inbound_refs[entry_id] & set(processed_entries.keys()) 376 if not parents_in_thread: 377 processed_entries[entry_id] = 0 378 return 0 379 380 # Find the deepest parent level + 1 381 max_parent_level = 0 382 for parent_id in parents_in_thread: 383 parent_level = self._calculate_thread_level(parent_id, processed_entries) 384 max_parent_level = max(max_parent_level, 
parent_level) 385 386 level = min(max_parent_level + 1, 4) # Cap at level 4 387 processed_entries[entry_id] = level 388 return level 389 390 def get_standalone_references(self) -> list[dict[str, Any]]: 391 """Get posts that have references but aren't part of multi-post threads.""" 392 if not hasattr(self, "reference_details"): 393 return [] 394 395 threaded_entry_ids = set() 396 for thread in self.threads: 397 for item in thread: 398 threaded_entry_ids.add(item["entry_id"]) 399 400 standalone_refs = [] 401 for username, entry in self.entries: 402 if ( 403 entry.id in self.reference_details 404 and entry.id not in threaded_entry_ids 405 ): 406 refs = self.reference_details[entry.id] 407 # Only include if it has meaningful references (not just self-references) 408 meaningful_refs = [ 409 r 410 for r in refs 411 if r.get("target_id") != entry.id and r.get("source_id") != entry.id 412 ] 413 if meaningful_refs: 414 standalone_refs.append( 415 { 416 "username": username, 417 "display_name": self.get_display_name(username), 418 "entry": entry, 419 "references": meaningful_refs, 420 } 421 ) 422 423 return standalone_refs 424 425 def _add_cross_thread_links(self, timeline_items: list[dict[str, Any]]) -> None: 426 """Add cross-thread linking for entries that appear in multiple threads.""" 427 # Map entry IDs to their positions in the timeline 428 entry_positions: dict[str, list[int]] = {} 429 # Map URLs referenced by entries to the entries that reference them 430 url_references: dict[str, list[tuple[str, int]]] = {} # url -> [(entry_id, position)] 431 432 # First pass: collect all entry IDs, their positions, and referenced URLs 433 for i, item in enumerate(timeline_items): 434 if item["type"] == "post": 435 entry_id = item["content"]["entry"].id 436 entry_positions.setdefault(entry_id, []).append(i) 437 # Track URLs this entry references 438 if entry_id in self.reference_details: 439 for ref in self.reference_details[entry_id]: 440 if ref["type"] == "outbound" and 
"target_id" in ref: 441 # Find the target entry's URL if available 442 target_entry = self._find_entry_by_id(ref["target_id"]) 443 if target_entry and target_entry.link: 444 url = str(target_entry.link) 445 url_references.setdefault(url, []).append((entry_id, i)) 446 elif item["type"] == "thread": 447 for thread_item in item["content"]: 448 entry_id = thread_item["entry"].id 449 entry_positions.setdefault(entry_id, []).append(i) 450 # Track URLs this entry references 451 if entry_id in self.reference_details: 452 for ref in self.reference_details[entry_id]: 453 if ref["type"] == "outbound" and "target_id" in ref: 454 target_entry = self._find_entry_by_id(ref["target_id"]) 455 if target_entry and target_entry.link: 456 url = str(target_entry.link) 457 url_references.setdefault(url, []).append((entry_id, i)) 458 459 # Build cross-thread connections - only for entries that actually appear multiple times 460 cross_thread_connections: dict[str, set[int]] = {} # entry_id -> set of timeline positions 461 462 # Add connections ONLY for entries that appear multiple times in the timeline 463 for entry_id, positions in entry_positions.items(): 464 if len(positions) > 1: 465 cross_thread_connections[entry_id] = set(positions) 466 # Debug: uncomment to see which entries have multiple appearances 467 # print(f"Entry {entry_id[:50]}... 
appears at positions: {positions}") 468 469 # Apply cross-thread links to timeline items 470 for entry_id, positions_set in cross_thread_connections.items(): 471 positions_list = list(positions_set) 472 for pos in positions_list: 473 item = timeline_items[pos] 474 other_positions = sorted([p for p in positions_list if p != pos]) 475 476 if item["type"] == "post": 477 # Add cross-thread info to individual posts 478 item["content"]["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items) 479 # Add info about shared references 480 item["content"]["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items) 481 elif item["type"] == "thread": 482 # Add cross-thread info to thread items 483 for thread_item in item["content"]: 484 if thread_item["entry"].id == entry_id: 485 thread_item["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items) 486 thread_item["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items) 487 break 488 489 def _build_cross_thread_link_data(self, entry_id: str, other_positions: list[int], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]: 490 """Build detailed cross-thread link data with anchor information.""" 491 cross_thread_links = [] 492 493 for pos in other_positions: 494 item = timeline_items[pos] 495 if item["type"] == "post": 496 # For individual posts 497 safe_id = safe_anchor_id(entry_id) 498 cross_thread_links.append({ 499 "position": pos, 500 "anchor_id": f"post-{pos}-{safe_id}", 501 "context": "individual post", 502 "title": item["content"]["entry"].title 503 }) 504 elif item["type"] == "thread": 505 # For thread items, find the specific thread item 506 for thread_idx, thread_item in enumerate(item["content"]): 507 if thread_item["entry"].id == entry_id: 508 safe_id = safe_anchor_id(entry_id) 509 cross_thread_links.append({ 510 "position": pos, 511 "anchor_id": 
f"post-{pos}-{thread_idx}-{safe_id}", 512 "context": f"thread (level {thread_item.get('thread_level', 0)})", 513 "title": thread_item["entry"].title 514 }) 515 break 516 517 return cross_thread_links 518 519 def _find_entry_by_id(self, entry_id: str) -> Optional[AtomEntry]: 520 """Find an entry by its ID.""" 521 for _username, entry in self.entries: 522 if entry.id == entry_id: 523 return entry 524 return None 525 526 def _get_shared_references(self, entry_id: str, positions: Union[set[int], list[int]], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]: 527 """Get information about shared references between cross-thread entries.""" 528 shared_refs = [] 529 530 # Collect all referenced URLs from entries at these positions 531 url_counts: dict[str, int] = {} 532 referencing_entries: dict[str, list[str]] = {} # url -> [entry_ids] 533 534 for pos in positions: 535 item = timeline_items[pos] 536 entries_to_check = [] 537 538 if item["type"] == "post": 539 entries_to_check.append(item["content"]["entry"]) 540 elif item["type"] == "thread": 541 entries_to_check.extend([ti["entry"] for ti in item["content"]]) 542 543 for entry in entries_to_check: 544 if entry.id in self.reference_details: 545 for ref in self.reference_details[entry.id]: 546 if ref["type"] == "outbound" and "target_id" in ref: 547 target_entry = self._find_entry_by_id(ref["target_id"]) 548 if target_entry and target_entry.link: 549 url = str(target_entry.link) 550 url_counts[url] = url_counts.get(url, 0) + 1 551 if url not in referencing_entries: 552 referencing_entries[url] = [] 553 if entry.id not in referencing_entries[url]: 554 referencing_entries[url].append(entry.id) 555 556 # Find URLs referenced by multiple entries 557 for url, count in url_counts.items(): 558 if count > 1 and len(referencing_entries[url]) > 1: 559 # Get the target entry info 560 target_entry = None 561 target_username = None 562 for ref in (self.links_data or {}).get("references", []): 563 if ref.get("target_url") == 
url: 564 target_username = ref.get("target_username") 565 if ref.get("target_entry_id"): 566 target_entry = self._find_entry_by_id(ref["target_entry_id"]) 567 break 568 569 shared_refs.append({ 570 "url": url, 571 "count": count, 572 "referencing_entries": referencing_entries[url], 573 "target_username": target_username, 574 "target_title": target_entry.title if target_entry else None 575 }) 576 577 return sorted(shared_refs, key=lambda x: x["count"], reverse=True) 578 579 def generate_site(self) -> None: 580 """Generate the static website.""" 581 # Create output directory 582 self.output_dir.mkdir(parents=True, exist_ok=True) 583 584 # Create static directories 585 (self.output_dir / "css").mkdir(exist_ok=True) 586 (self.output_dir / "js").mkdir(exist_ok=True) 587 588 # Generate CSS 589 css_template = self.env.get_template("style.css") 590 css_content = css_template.render() 591 with open(self.output_dir / "css" / "style.css", "w") as f: 592 f.write(css_content) 593 594 # Generate JavaScript 595 js_template = self.env.get_template("script.js") 596 js_content = js_template.render() 597 with open(self.output_dir / "js" / "script.js", "w") as f: 598 f.write(js_content) 599 600 # Prepare common template data 601 base_data = { 602 "title": "Energy & Environment Group", 603 "generated_at": datetime.now().isoformat(), 604 "get_display_name": self.get_display_name, 605 "get_user_homepage": self.get_user_homepage, 606 "clean_html_summary": self.clean_html_summary, 607 "safe_anchor_id": safe_anchor_id, 608 } 609 610 # Build unified timeline 611 timeline_items = [] 612 613 # Only consider the threads that will actually be displayed 614 displayed_threads = self.threads[:20] # Limit to 20 threads 615 616 # Track which entries are part of displayed threads 617 threaded_entry_ids = set() 618 for thread in displayed_threads: 619 for item in thread: 620 threaded_entry_ids.add(item["entry_id"]) 621 622 # Add threads to timeline (using the date of the most recent post) 623 for 
thread in displayed_threads: 624 most_recent_date = max( 625 item["entry"].updated or item["entry"].published or datetime.min 626 for item in thread 627 ) 628 timeline_items.append({ 629 "type": "thread", 630 "date": most_recent_date, 631 "content": thread 632 }) 633 634 # Add individual posts (not in threads) 635 for username, entry in self.entries[:50]: 636 if entry.id not in threaded_entry_ids: 637 # Check if this entry has references 638 has_refs = ( 639 entry.id in self.reference_details 640 if hasattr(self, "reference_details") 641 else False 642 ) 643 644 refs = [] 645 if has_refs: 646 refs = self.reference_details.get(entry.id, []) 647 refs = [ 648 r for r in refs 649 if r.get("target_id") != entry.id 650 and r.get("source_id") != entry.id 651 ] 652 653 timeline_items.append({ 654 "type": "post", 655 "date": entry.updated or entry.published or datetime.min, 656 "content": { 657 "username": username, 658 "display_name": self.get_display_name(username), 659 "entry": entry, 660 "references": refs if refs else None 661 } 662 }) 663 664 # Sort unified timeline by date (newest first) 665 timeline_items.sort(key=lambda x: x["date"], reverse=True) 666 667 # Limit timeline to what will actually be rendered 668 timeline_items = timeline_items[:50] # Limit to 50 items total 669 670 # Add cross-thread linking for repeat blog references 671 self._add_cross_thread_links(timeline_items) 672 673 # Prepare outgoing links data 674 outgoing_links = [] 675 if self.links_data and "links" in self.links_data: 676 for url, link_info in self.links_data["links"].items(): 677 referencing_entries = [] 678 for entry_id in link_info.get("referencing_entries", []): 679 for username, entry in self.entries: 680 if entry.id == entry_id: 681 referencing_entries.append( 682 (self.get_display_name(username), entry) 683 ) 684 break 685 686 if referencing_entries: 687 # Sort by date - prioritize updated over published 688 referencing_entries.sort( 689 key=lambda x: x[1].updated or x[1].published 
or datetime.min, reverse=True 690 ) 691 outgoing_links.append( 692 { 693 "url": url, 694 "target_username": link_info.get("target_username"), 695 "entries": referencing_entries, 696 } 697 ) 698 699 # Sort links by most recent reference - prioritize updated over published 700 outgoing_links.sort( 701 key=lambda x: x["entries"][0][1].updated 702 or x["entries"][0][1].published or datetime.min, 703 reverse=True, 704 ) 705 706 # Prepare users data 707 users: list[UserData] = [] 708 if self.index: 709 for username, user_metadata in self.index.users.items(): 710 # Get recent entries for this user with display names 711 user_entries = [ 712 (self.get_display_name(u), e) 713 for u, e in self.entries 714 if u == username 715 ][:5] 716 users.append( 717 {"metadata": user_metadata, "recent_entries": user_entries} 718 ) 719 # Sort by entry count 720 users.sort(key=lambda x: x["metadata"].entry_count, reverse=True) 721 722 # Generate timeline page 723 timeline_template = self.env.get_template("timeline.html") 724 timeline_content = timeline_template.render( 725 **base_data, 726 page="timeline", 727 timeline_items=timeline_items, # Already limited above 728 ) 729 with open(self.output_dir / "timeline.html", "w") as f: 730 f.write(timeline_content) 731 732 # Generate links page 733 links_template = self.env.get_template("links.html") 734 links_content = links_template.render( 735 **base_data, 736 page="links", 737 outgoing_links=outgoing_links[:100], 738 ) 739 with open(self.output_dir / "links.html", "w") as f: 740 f.write(links_content) 741 742 # Generate users page 743 users_template = self.env.get_template("users.html") 744 users_content = users_template.render( 745 **base_data, 746 page="users", 747 users=users, 748 ) 749 with open(self.output_dir / "users.html", "w") as f: 750 f.write(users_content) 751 752 # Generate main index page (redirect to timeline) 753 index_template = self.env.get_template("index.html") 754 index_content = index_template.render(**base_data) 755 
with open(self.output_dir / "index.html", "w") as f: 756 f.write(index_content) 757 758 console.print(f"[green]✓[/green] Generated website at {self.output_dir}") 759 console.print(f" - {len(self.entries)} entries") 760 console.print(f" - {len(self.threads)} conversation threads") 761 console.print(f" - {len(outgoing_links)} outgoing links") 762 console.print(f" - {len(users)} users") 763 console.print( 764 " - Generated pages: index.html, timeline.html, links.html, users.html" 765 ) 766 767 768@app.command() 769def generate( 770 output: Path = typer.Option( 771 Path("./thicket-site"), 772 "--output", 773 "-o", 774 help="Output directory for the generated website", 775 ), 776 force: bool = typer.Option( 777 False, "--force", "-f", help="Overwrite existing output directory" 778 ), 779 config_file: Path = typer.Option( 780 Path("thicket.yaml"), "--config", help="Configuration file path" 781 ), 782) -> None: 783 """Generate a static HTML website from thicket data.""" 784 config = load_config(config_file) 785 786 if not config.git_store: 787 console.print("[red]No git store path configured[/red]") 788 raise typer.Exit(1) 789 790 git_store = GitStore(config.git_store) 791 792 # Check if output directory exists 793 if output.exists() and not force: 794 console.print( 795 f"[red]Output directory {output} already exists. 
Use --force to overwrite.[/red]" 796 ) 797 raise typer.Exit(1) 798 799 # Clean output directory if forcing 800 if output.exists() and force: 801 shutil.rmtree(output) 802 803 try: 804 generator = WebsiteGenerator(git_store, output) 805 806 console.print("[bold]Generating static website...[/bold]") 807 generator.load_data() 808 generator.build_threads() 809 generator.generate_site() 810 811 except Exception as e: 812 console.print(f"[red]Error generating website: {e}[/red]") 813 raise typer.Exit(1) from e 814</file> 815 816<file path="src/thicket/templates/base.html"> 817<!DOCTYPE html> 818<html lang="en"> 819<head> 820 <meta charset="UTF-8"> 821 <meta name="viewport" content="width=device-width, initial-scale=1.0"> 822 <title>{% block page_title %}{{ title }}{% endblock %}</title> 823 <link rel="stylesheet" href="css/style.css"> 824</head> 825<body> 826 <header class="site-header"> 827 <div class="header-content"> 828 <h1 class="site-title">{{ title }}</h1> 829 <nav class="site-nav"> 830 <a href="timeline.html" class="nav-link {% if page == 'timeline' %}active{% endif %}">Timeline</a> 831 <a href="links.html" class="nav-link {% if page == 'links' %}active{% endif %}">Links</a> 832 <a href="users.html" class="nav-link {% if page == 'users' %}active{% endif %}">Users</a> 833 </nav> 834 </div> 835 </header> 836 837 <main class="main-content"> 838 {% block content %}{% endblock %} 839 </main> 840 841 <footer class="site-footer"> 842 <p>Generated on {{ generated_at }} by <a href="https://github.com/avsm/thicket">Thicket</a></p> 843 </footer> 844 845 <script src="js/script.js"></script> 846</body> 847</html> 848</file> 849 850<file path="src/thicket/templates/index.html"> 851<!DOCTYPE html> 852<html lang="en"> 853<head> 854 <meta charset="UTF-8"> 855 <meta name="viewport" content="width=device-width, initial-scale=1.0"> 856 <title>{{ title }}</title> 857 <meta http-equiv="refresh" content="0; url=timeline.html"> 858 <link rel="canonical" href="timeline.html"> 859</head> 
860<body> 861 <p>Redirecting to <a href="timeline.html">Timeline</a>...</p> 862</body> 863</html> 864</file> 865 866<file path="src/thicket/templates/links.html"> 867{% extends "base.html" %} 868 869{% block page_title %}Outgoing Links - {{ title }}{% endblock %} 870 871{% block content %} 872<div class="page-content"> 873 <h2>Outgoing Links</h2> 874 <p class="page-description">External links referenced in blog posts, ordered by most recent reference.</p> 875 876 {% for link in outgoing_links %} 877 <article class="link-group"> 878 <h3 class="link-url"> 879 <a href="{{ link.url }}" target="_blank">{{ link.url|truncate(80) }}</a> 880 {% if link.target_username %} 881 <span class="target-user">({{ link.target_username }})</span> 882 {% endif %} 883 </h3> 884 <div class="referencing-entries"> 885 <span class="ref-count">Referenced in {{ link.entries|length }} post(s):</span> 886 <ul> 887 {% for display_name, entry in link.entries[:5] %} 888 <li> 889 <span class="author">{{ display_name }}</span> - 890 <a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a> 891 <time datetime="{{ entry.updated or entry.published }}"> 892 ({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }}) 893 </time> 894 </li> 895 {% endfor %} 896 {% if link.entries|length > 5 %} 897 <li class="more">... 
and {{ link.entries|length - 5 }} more</li> 898 {% endif %} 899 </ul> 900 </div> 901 </article> 902 {% endfor %} 903</div> 904{% endblock %} 905</file> 906 907<file path="src/thicket/templates/script.js"> 908// Enhanced functionality for thicket website 909document.addEventListener('DOMContentLoaded', function() { 910 911 // Enhance thread collapsing (optional feature) 912 const threadHeaders = document.querySelectorAll('.thread-header'); 913 threadHeaders.forEach(header => { 914 header.style.cursor = 'pointer'; 915 header.addEventListener('click', function() { 916 const thread = this.parentElement; 917 const entries = thread.querySelectorAll('.thread-entry'); 918 919 // Toggle visibility of all but the first entry 920 for (let i = 1; i < entries.length; i++) { 921 entries[i].style.display = entries[i].style.display === 'none' ? 'block' : 'none'; 922 } 923 924 // Update thread count text 925 const count = this.querySelector('.thread-count'); 926 if (entries[1] && entries[1].style.display === 'none') { 927 count.textContent = count.textContent.replace('posts', 'posts (collapsed)'); 928 } else { 929 count.textContent = count.textContent.replace(' (collapsed)', ''); 930 } 931 }); 932 }); 933 934 // Add relative time display 935 const timeElements = document.querySelectorAll('time'); 936 timeElements.forEach(timeEl => { 937 const datetime = new Date(timeEl.getAttribute('datetime')); 938 const now = new Date(); 939 const diffMs = now - datetime; 940 const diffDays = Math.floor(diffMs / (1000 * 60 * 60 * 24)); 941 942 let relativeTime; 943 if (diffDays === 0) { 944 const diffHours = Math.floor(diffMs / (1000 * 60 * 60)); 945 if (diffHours === 0) { 946 const diffMinutes = Math.floor(diffMs / (1000 * 60)); 947 relativeTime = diffMinutes === 0 ? 
'just now' : `${diffMinutes}m ago`; 948 } else { 949 relativeTime = `${diffHours}h ago`; 950 } 951 } else if (diffDays === 1) { 952 relativeTime = 'yesterday'; 953 } else if (diffDays < 7) { 954 relativeTime = `${diffDays}d ago`; 955 } else if (diffDays < 30) { 956 const weeks = Math.floor(diffDays / 7); 957 relativeTime = weeks === 1 ? '1w ago' : `${weeks}w ago`; 958 } else if (diffDays < 365) { 959 const months = Math.floor(diffDays / 30); 960 relativeTime = months === 1 ? '1mo ago' : `${months}mo ago`; 961 } else { 962 const years = Math.floor(diffDays / 365); 963 relativeTime = years === 1 ? '1y ago' : `${years}y ago`; 964 } 965 966 // Add relative time as title attribute 967 timeEl.setAttribute('title', timeEl.textContent); 968 timeEl.textContent = relativeTime; 969 }); 970 971 // Enhanced anchor link scrolling for shared references 972 document.querySelectorAll('a[href^="#"]').forEach(anchor => { 973 anchor.addEventListener('click', function (e) { 974 e.preventDefault(); 975 const target = document.querySelector(this.getAttribute('href')); 976 if (target) { 977 target.scrollIntoView({ 978 behavior: 'smooth', 979 block: 'center' 980 }); 981 982 // Highlight the target briefly 983 const timelineEntry = target.closest('.timeline-entry'); 984 if (timelineEntry) { 985 timelineEntry.style.outline = '2px solid var(--primary-color)'; 986 timelineEntry.style.borderRadius = '8px'; 987 setTimeout(() => { 988 timelineEntry.style.outline = ''; 989 timelineEntry.style.borderRadius = ''; 990 }, 2000); 991 } 992 } 993 }); 994 }); 995}); 996</file> 997 998<file path="src/thicket/templates/style.css"> 999/* Modern, clean design with high-density text and readable theme */ 1000 1001:root { 1002 --primary-color: #2c3e50; 1003 --secondary-color: #3498db; 1004 --accent-color: #e74c3c; 1005 --background: #ffffff; 1006 --surface: #f8f9fa; 1007 --text-primary: #2c3e50; 1008 --text-secondary: #7f8c8d; 1009 --border-color: #e0e0e0; 1010 --thread-indent: 20px; 1011 --max-width: 1200px; 
1012} 1013 1014* { 1015 margin: 0; 1016 padding: 0; 1017 box-sizing: border-box; 1018} 1019 1020body { 1021 font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif; 1022 font-size: 14px; 1023 line-height: 1.6; 1024 color: var(--text-primary); 1025 background-color: var(--background); 1026} 1027 1028/* Header */ 1029.site-header { 1030 background-color: var(--surface); 1031 border-bottom: 1px solid var(--border-color); 1032 padding: 0.75rem 0; 1033 position: sticky; 1034 top: 0; 1035 z-index: 100; 1036} 1037 1038.header-content { 1039 max-width: var(--max-width); 1040 margin: 0 auto; 1041 padding: 0 2rem; 1042 display: flex; 1043 justify-content: space-between; 1044 align-items: center; 1045} 1046 1047.site-title { 1048 font-size: 1.5rem; 1049 font-weight: 600; 1050 color: var(--primary-color); 1051 margin: 0; 1052} 1053 1054/* Navigation */ 1055.site-nav { 1056 display: flex; 1057 gap: 1.5rem; 1058} 1059 1060.nav-link { 1061 text-decoration: none; 1062 color: var(--text-secondary); 1063 font-weight: 500; 1064 font-size: 0.95rem; 1065 padding: 0.5rem 0.75rem; 1066 border-radius: 4px; 1067 transition: all 0.2s ease; 1068} 1069 1070.nav-link:hover { 1071 color: var(--primary-color); 1072 background-color: var(--background); 1073} 1074 1075.nav-link.active { 1076 color: var(--secondary-color); 1077 background-color: var(--background); 1078 font-weight: 600; 1079} 1080 1081/* Main Content */ 1082.main-content { 1083 max-width: var(--max-width); 1084 margin: 2rem auto; 1085 padding: 0 2rem; 1086} 1087 1088.page-content { 1089 margin: 0; 1090} 1091 1092.page-description { 1093 color: var(--text-secondary); 1094 margin-bottom: 1.5rem; 1095 font-style: italic; 1096} 1097 1098/* Sections */ 1099section { 1100 margin-bottom: 2rem; 1101} 1102 1103h2 { 1104 font-size: 1.3rem; 1105 font-weight: 600; 1106 margin-bottom: 0.75rem; 1107 color: var(--primary-color); 1108} 1109 1110h3 { 1111 font-size: 1.1rem; 1112 font-weight: 600; 
1113 margin-bottom: 0.75rem; 1114 color: var(--primary-color); 1115} 1116 1117/* Entries and Threads */ 1118article { 1119 margin-bottom: 1.5rem; 1120 padding: 1rem; 1121 background-color: var(--surface); 1122 border-radius: 4px; 1123 border: 1px solid var(--border-color); 1124} 1125 1126/* Timeline-style entries */ 1127.timeline-entry { 1128 margin-bottom: 0.5rem; 1129 padding: 0.5rem 0.75rem; 1130 border: none; 1131 background: transparent; 1132 transition: background-color 0.2s ease; 1133} 1134 1135.timeline-entry:hover { 1136 background-color: var(--surface); 1137} 1138 1139.timeline-meta { 1140 display: inline-flex; 1141 gap: 0.5rem; 1142 align-items: center; 1143 font-size: 0.75rem; 1144 color: var(--text-secondary); 1145 margin-bottom: 0.25rem; 1146} 1147 1148.timeline-time { 1149 font-family: 'SF Mono', Monaco, Consolas, 'Courier New', monospace; 1150 font-size: 0.75rem; 1151 color: var(--text-secondary); 1152} 1153 1154.timeline-author { 1155 font-weight: 600; 1156 color: var(--primary-color); 1157 font-size: 0.8rem; 1158 text-decoration: none; 1159} 1160 1161.timeline-author:hover { 1162 color: var(--secondary-color); 1163 text-decoration: underline; 1164} 1165 1166.timeline-content { 1167 line-height: 1.4; 1168} 1169 1170.timeline-title { 1171 font-size: 0.95rem; 1172 font-weight: 600; 1173} 1174 1175.timeline-title a { 1176 color: var(--primary-color); 1177 text-decoration: none; 1178} 1179 1180.timeline-title a:hover { 1181 color: var(--secondary-color); 1182 text-decoration: underline; 1183} 1184 1185.timeline-summary { 1186 color: var(--text-secondary); 1187 font-size: 0.9rem; 1188 line-height: 1.4; 1189} 1190 1191/* Legacy styles for other sections */ 1192.entry-meta, .thread-header { 1193 display: flex; 1194 gap: 1rem; 1195 align-items: center; 1196 margin-bottom: 0.5rem; 1197 font-size: 0.85rem; 1198 color: var(--text-secondary); 1199} 1200 1201.author { 1202 font-weight: 600; 1203 color: var(--primary-color); 1204} 1205 1206time { 1207 font-size: 
0.85rem; 1208} 1209 1210h4 { 1211 font-size: 1.1rem; 1212 font-weight: 600; 1213 margin-bottom: 0.5rem; 1214} 1215 1216h4 a { 1217 color: var(--primary-color); 1218 text-decoration: none; 1219} 1220 1221h4 a:hover { 1222 color: var(--secondary-color); 1223 text-decoration: underline; 1224} 1225 1226.entry-summary { 1227 color: var(--text-primary); 1228 line-height: 1.5; 1229 margin-top: 0.5rem; 1230} 1231 1232/* Enhanced Threading Styles */ 1233 1234/* Conversation Clusters */ 1235.conversation-cluster { 1236 background-color: var(--background); 1237 border: 2px solid var(--border-color); 1238 border-radius: 8px; 1239 margin-bottom: 2rem; 1240 overflow: hidden; 1241 box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05); 1242} 1243 1244.conversation-header { 1245 background: linear-gradient(135deg, var(--surface) 0%, #f1f3f4 100%); 1246 padding: 0.75rem 1rem; 1247 border-bottom: 1px solid var(--border-color); 1248} 1249 1250.conversation-meta { 1251 display: flex; 1252 justify-content: space-between; 1253 align-items: center; 1254 flex-wrap: wrap; 1255 gap: 0.5rem; 1256} 1257 1258.conversation-count { 1259 font-weight: 600; 1260 color: var(--secondary-color); 1261 font-size: 0.9rem; 1262} 1263 1264.conversation-participants { 1265 font-size: 0.8rem; 1266 color: var(--text-secondary); 1267 flex: 1; 1268 text-align: right; 1269} 1270 1271.conversation-flow { 1272 padding: 0.5rem; 1273} 1274 1275/* Threaded Conversation Entries */ 1276.conversation-entry { 1277 position: relative; 1278 margin-bottom: 0.75rem; 1279 display: flex; 1280 align-items: flex-start; 1281} 1282 1283.conversation-entry.level-0 { 1284 margin-left: 0; 1285} 1286 1287.conversation-entry.level-1 { 1288 margin-left: 1.5rem; 1289} 1290 1291.conversation-entry.level-2 { 1292 margin-left: 3rem; 1293} 1294 1295.conversation-entry.level-3 { 1296 margin-left: 4.5rem; 1297} 1298 1299.conversation-entry.level-4 { 1300 margin-left: 6rem; 1301} 1302 1303.entry-connector { 1304 width: 3px; 1305 background-color: 
var(--secondary-color); 1306 margin-right: 0.75rem; 1307 margin-top: 0.25rem; 1308 min-height: 2rem; 1309 border-radius: 2px; 1310 opacity: 0.6; 1311} 1312 1313.conversation-entry.level-0 .entry-connector { 1314 background-color: var(--accent-color); 1315 opacity: 0.8; 1316} 1317 1318.entry-content { 1319 flex: 1; 1320 background-color: var(--surface); 1321 padding: 0.75rem; 1322 border-radius: 6px; 1323 border: 1px solid var(--border-color); 1324 transition: all 0.2s ease; 1325} 1326 1327.entry-content:hover { 1328 border-color: var(--secondary-color); 1329 box-shadow: 0 2px 8px rgba(52, 152, 219, 0.1); 1330} 1331 1332/* Reference Indicators */ 1333.reference-indicators { 1334 display: inline-flex; 1335 gap: 0.25rem; 1336 margin-left: 0.5rem; 1337} 1338 1339.ref-out, .ref-in { 1340 display: inline-block; 1341 width: 1rem; 1342 height: 1rem; 1343 border-radius: 50%; 1344 text-align: center; 1345 line-height: 1rem; 1346 font-size: 0.7rem; 1347 font-weight: bold; 1348} 1349 1350.ref-out { 1351 background-color: #e8f5e8; 1352 color: #2d8f2d; 1353} 1354 1355.ref-in { 1356 background-color: #e8f0ff; 1357 color: #1f5fbf; 1358} 1359 1360/* Reference Badges for Individual Posts */ 1361.timeline-entry.with-references { 1362 background-color: var(--surface); 1363} 1364 1365/* Conversation posts in unified timeline */ 1366.timeline-entry.conversation-post { 1367 background: transparent; 1368 border: none; 1369 margin-bottom: 0.5rem; 1370 padding: 0.5rem 0.75rem; 1371} 1372 1373.timeline-entry.conversation-post.level-0 { 1374 margin-left: 0; 1375 border-left: 2px solid var(--accent-color); 1376 padding-left: 0.75rem; 1377} 1378 1379.timeline-entry.conversation-post.level-1 { 1380 margin-left: 1.5rem; 1381 border-left: 2px solid var(--secondary-color); 1382 padding-left: 0.75rem; 1383} 1384 1385.timeline-entry.conversation-post.level-2 { 1386 margin-left: 3rem; 1387 border-left: 2px solid var(--text-secondary); 1388 padding-left: 0.75rem; 1389} 1390 
1391.timeline-entry.conversation-post.level-3 { 1392 margin-left: 4.5rem; 1393 border-left: 2px solid var(--text-secondary); 1394 padding-left: 0.75rem; 1395} 1396 1397.timeline-entry.conversation-post.level-4 { 1398 margin-left: 6rem; 1399 border-left: 2px solid var(--text-secondary); 1400 padding-left: 0.75rem; 1401} 1402 1403/* Cross-thread linking */ 1404.cross-thread-links { 1405 margin-top: 0.5rem; 1406 padding-top: 0.5rem; 1407 border-top: 1px solid var(--border-color); 1408} 1409 1410.cross-thread-indicator { 1411 font-size: 0.75rem; 1412 color: var(--text-secondary); 1413 background-color: var(--surface); 1414 padding: 0.25rem 0.5rem; 1415 border-radius: 12px; 1416 border: 1px solid var(--border-color); 1417 display: inline-block; 1418} 1419 1420/* Inline shared references styling */ 1421.inline-shared-refs { 1422 margin-left: 0.5rem; 1423 font-size: 0.85rem; 1424 color: var(--text-secondary); 1425} 1426 1427.shared-ref-link { 1428 color: var(--primary-color); 1429 text-decoration: none; 1430 font-weight: 500; 1431 transition: color 0.2s ease; 1432} 1433 1434.shared-ref-link:hover { 1435 color: var(--secondary-color); 1436 text-decoration: underline; 1437} 1438 1439.shared-ref-more { 1440 font-style: italic; 1441 color: var(--text-secondary); 1442 font-size: 0.8rem; 1443 margin-left: 0.25rem; 1444} 1445 1446.user-anchor, .post-anchor { 1447 position: absolute; 1448 margin-top: -60px; /* Offset for fixed header */ 1449 pointer-events: none; 1450} 1451 1452.cross-thread-link { 1453 color: var(--primary-color); 1454 text-decoration: none; 1455 font-weight: 500; 1456 transition: color 0.2s ease; 1457} 1458 1459.cross-thread-link:hover { 1460 color: var(--secondary-color); 1461 text-decoration: underline; 1462} 1463 1464.reference-badges { 1465 display: flex; 1466 gap: 0.25rem; 1467 margin-left: 0.5rem; 1468 flex-wrap: wrap; 1469} 1470 1471.ref-badge { 1472 display: inline-block; 1473 padding: 0.1rem 0.4rem; 1474 border-radius: 12px; 1475 font-size: 0.7rem; 
1476 font-weight: 600; 1477 text-transform: uppercase; 1478 letter-spacing: 0.05em; 1479} 1480 1481.ref-badge.ref-outbound { 1482 background-color: #e8f5e8; 1483 color: #2d8f2d; 1484 border: 1px solid #c3e6c3; 1485} 1486 1487.ref-badge.ref-inbound { 1488 background-color: #e8f0ff; 1489 color: #1f5fbf; 1490 border: 1px solid #b3d9ff; 1491} 1492 1493/* Author Color Coding */ 1494.timeline-author { 1495 position: relative; 1496} 1497 1498.timeline-author::before { 1499 content: ''; 1500 display: inline-block; 1501 width: 8px; 1502 height: 8px; 1503 border-radius: 50%; 1504 margin-right: 0.5rem; 1505 background-color: var(--secondary-color); 1506} 1507 1508/* Generate consistent colors for authors */ 1509.author-avsm::before { background-color: #e74c3c; } 1510.author-mort::before { background-color: #3498db; } 1511.author-mte::before { background-color: #2ecc71; } 1512.author-ryan::before { background-color: #f39c12; } 1513.author-mwd::before { background-color: #9b59b6; } 1514.author-dra::before { background-color: #1abc9c; } 1515.author-pf341::before { background-color: #34495e; } 1516.author-sadiqj::before { background-color: #e67e22; } 1517.author-martinkl::before { background-color: #8e44ad; } 1518.author-jonsterling::before { background-color: #27ae60; } 1519.author-jon::before { background-color: #f1c40f; } 1520.author-onkar::before { background-color: #e91e63; } 1521.author-gabriel::before { background-color: #00bcd4; } 1522.author-jess::before { background-color: #ff5722; } 1523.author-ibrahim::before { background-color: #607d8b; } 1524.author-andres::before { background-color: #795548; } 1525.author-eeg::before { background-color: #ff9800; } 1526 1527/* Section Headers */ 1528.conversations-section h3, 1529.referenced-posts-section h3, 1530.individual-posts-section h3 { 1531 border-bottom: 2px solid var(--border-color); 1532 padding-bottom: 0.5rem; 1533 margin-bottom: 1.5rem; 1534 position: relative; 1535} 1536 1537.conversations-section h3::before { 1538 
content: "💬"; 1539 margin-right: 0.5rem; 1540} 1541 1542.referenced-posts-section h3::before { 1543 content: "🔗"; 1544 margin-right: 0.5rem; 1545} 1546 1547.individual-posts-section h3::before { 1548 content: "📝"; 1549 margin-right: 0.5rem; 1550} 1551 1552/* Legacy thread styles (for backward compatibility) */ 1553.thread { 1554 background-color: var(--background); 1555 border: 1px solid var(--border-color); 1556 padding: 0; 1557 overflow: hidden; 1558 margin-bottom: 1rem; 1559} 1560 1561.thread-header { 1562 background-color: var(--surface); 1563 padding: 0.5rem 0.75rem; 1564 border-bottom: 1px solid var(--border-color); 1565} 1566 1567.thread-count { 1568 font-weight: 600; 1569 color: var(--secondary-color); 1570} 1571 1572.thread-entry { 1573 padding: 0.5rem 0.75rem; 1574 border-bottom: 1px solid var(--border-color); 1575} 1576 1577.thread-entry:last-child { 1578 border-bottom: none; 1579} 1580 1581.thread-entry.reply { 1582 margin-left: var(--thread-indent); 1583 border-left: 3px solid var(--secondary-color); 1584 background-color: var(--surface); 1585} 1586 1587/* Links Section */ 1588.link-group { 1589 background-color: var(--background); 1590} 1591 1592.link-url { 1593 font-size: 1rem; 1594 word-break: break-word; 1595} 1596 1597.link-url a { 1598 color: var(--secondary-color); 1599 text-decoration: none; 1600} 1601 1602.link-url a:hover { 1603 text-decoration: underline; 1604} 1605 1606.target-user { 1607 font-size: 0.9rem; 1608 color: var(--text-secondary); 1609 font-weight: normal; 1610} 1611 1612.referencing-entries { 1613 margin-top: 0.75rem; 1614} 1615 1616.ref-count { 1617 font-weight: 600; 1618 color: var(--text-secondary); 1619 font-size: 0.9rem; 1620} 1621 1622.referencing-entries ul { 1623 list-style: none; 1624 margin-top: 0.5rem; 1625 padding-left: 1rem; 1626} 1627 1628.referencing-entries li { 1629 margin-bottom: 0.25rem; 1630 font-size: 0.9rem; 1631} 1632 1633.referencing-entries .more { 1634 font-style: italic; 1635 color: 
var(--text-secondary); 1636} 1637 1638/* Users Section */ 1639.user-card { 1640 background-color: var(--background); 1641} 1642 1643.user-header { 1644 display: flex; 1645 gap: 1rem; 1646 align-items: start; 1647 margin-bottom: 1rem; 1648} 1649 1650.user-icon { 1651 width: 48px; 1652 height: 48px; 1653 border-radius: 50%; 1654 object-fit: cover; 1655} 1656 1657.user-info h3 { 1658 margin-bottom: 0.25rem; 1659} 1660 1661.username { 1662 font-size: 0.9rem; 1663 color: var(--text-secondary); 1664 font-weight: normal; 1665} 1666 1667.user-meta { 1668 font-size: 0.9rem; 1669 color: var(--text-secondary); 1670} 1671 1672.user-meta a { 1673 color: var(--secondary-color); 1674 text-decoration: none; 1675} 1676 1677.user-meta a:hover { 1678 text-decoration: underline; 1679} 1680 1681.separator { 1682 margin: 0 0.5rem; 1683} 1684 1685.post-count { 1686 font-weight: 600; 1687} 1688 1689.user-recent h4 { 1690 font-size: 0.95rem; 1691 margin-bottom: 0.5rem; 1692 color: var(--text-secondary); 1693} 1694 1695.user-recent ul { 1696 list-style: none; 1697 padding-left: 0; 1698} 1699 1700.user-recent li { 1701 margin-bottom: 0.25rem; 1702 font-size: 0.9rem; 1703} 1704 1705/* Footer */ 1706.site-footer { 1707 max-width: var(--max-width); 1708 margin: 3rem auto 2rem; 1709 padding: 1rem 2rem; 1710 text-align: center; 1711 color: var(--text-secondary); 1712 font-size: 0.85rem; 1713 border-top: 1px solid var(--border-color); 1714} 1715 1716.site-footer a { 1717 color: var(--secondary-color); 1718 text-decoration: none; 1719} 1720 1721.site-footer a:hover { 1722 text-decoration: underline; 1723} 1724 1725/* Responsive */ 1726@media (max-width: 768px) { 1727 .site-title { 1728 font-size: 1.3rem; 1729 } 1730 1731 .header-content { 1732 flex-direction: column; 1733 gap: 0.75rem; 1734 align-items: flex-start; 1735 } 1736 1737 .site-nav { 1738 gap: 1rem; 1739 } 1740 1741 .main-content { 1742 padding: 0 1rem; 1743 } 1744 1745 .thread-entry.reply { 1746 margin-left: calc(var(--thread-indent) / 
2); 1747 } 1748 1749 .user-header { 1750 flex-direction: column; 1751 } 1752} 1753</file> 1754 1755<file path="src/thicket/templates/timeline.html"> 1756{% extends "base.html" %} 1757 1758{% block page_title %}Timeline - {{ title }}{% endblock %} 1759 1760{% block content %} 1761{% set seen_users = [] %} 1762<div class="page-content"> 1763 <h2>Recent Posts & Conversations</h2> 1764 1765 <section class="unified-timeline"> 1766 {% for item in timeline_items %} 1767 {% if item.type == "post" %} 1768 <!-- Individual Post --> 1769 <article class="timeline-entry {% if item.content.references %}with-references{% endif %}"> 1770 <div class="timeline-meta"> 1771 <time datetime="{{ item.content.entry.updated or item.content.entry.published }}" class="timeline-time"> 1772 {{ (item.content.entry.updated or item.content.entry.published).strftime('%Y-%m-%d %H:%M') }} 1773 </time> 1774 {% set homepage = get_user_homepage(item.content.username) %} 1775 {% if item.content.username not in seen_users %} 1776 <a id="{{ item.content.username }}" class="user-anchor"></a> 1777 {% set _ = seen_users.append(item.content.username) %} 1778 {% endif %} 1779 <a id="post-{{ loop.index0 }}-{{ safe_anchor_id(item.content.entry.id) }}" class="post-anchor"></a> 1780 {% if homepage %} 1781 <a href="{{ homepage }}" target="_blank" class="timeline-author">{{ item.content.display_name }}</a> 1782 {% else %} 1783 <span class="timeline-author">{{ item.content.display_name }}</span> 1784 {% endif %} 1785 {% if item.content.references %} 1786 <div class="reference-badges"> 1787 {% for ref in item.content.references %} 1788 {% if ref.type == 'outbound' %} 1789 <span class="ref-badge ref-outbound" title="References {{ ref.target_username or 'external post' }}"> 1790 → {{ ref.target_username or 'ext' }} 1791 </span> 1792 {% elif ref.type == 'inbound' %} 1793 <span class="ref-badge ref-inbound" title="Referenced by {{ ref.source_username or 'external post' }}"> 1794 ← {{ ref.source_username or 'ext' }} 1795 
</span> 1796 {% endif %} 1797 {% endfor %} 1798 </div> 1799 {% endif %} 1800 </div> 1801 <div class="timeline-content"> 1802 <strong class="timeline-title"> 1803 <a href="{{ item.content.entry.link }}" target="_blank">{{ item.content.entry.title }}</a> 1804 </strong> 1805 {% if item.content.entry.summary %} 1806 <span class="timeline-summary">— {{ clean_html_summary(item.content.entry.summary, 250) }}</span> 1807 {% endif %} 1808 {% if item.content.shared_references %} 1809 <span class="inline-shared-refs"> 1810 {% for ref in item.content.shared_references[:3] %} 1811 {% if ref.target_username %} 1812 <a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %} 1813 {% endif %} 1814 {% endfor %} 1815 {% if item.content.shared_references|length > 3 %} 1816 <span class="shared-ref-more">+{{ item.content.shared_references|length - 3 }} more</span> 1817 {% endif %} 1818 </span> 1819 {% endif %} 1820 {% if item.content.cross_thread_links %} 1821 <div class="cross-thread-links"> 1822 <span class="cross-thread-indicator">🔗 Also appears: </span> 1823 {% for link in item.content.cross_thread_links %} 1824 <a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %} 1825 {% endfor %} 1826 </div> 1827 {% endif %} 1828 </div> 1829 </article> 1830 1831 {% elif item.type == "thread" %} 1832 <!-- Conversation Thread --> 1833 {% set outer_loop_index = loop.index0 %} 1834 {% for thread_item in item.content %} 1835 <article class="timeline-entry conversation-post level-{{ thread_item.thread_level }}"> 1836 <div class="timeline-meta"> 1837 <time datetime="{{ thread_item.entry.updated or thread_item.entry.published }}" class="timeline-time"> 1838 {{ (thread_item.entry.updated or thread_item.entry.published).strftime('%Y-%m-%d %H:%M') }} 1839 </time> 1840 {% set homepage = 
get_user_homepage(thread_item.username) %} 1841 {% if thread_item.username not in seen_users %} 1842 <a id="{{ thread_item.username }}" class="user-anchor"></a> 1843 {% set _ = seen_users.append(thread_item.username) %} 1844 {% endif %} 1845 <a id="post-{{ outer_loop_index }}-{{ loop.index0 }}-{{ safe_anchor_id(thread_item.entry.id) }}" class="post-anchor"></a> 1846 {% if homepage %} 1847 <a href="{{ homepage }}" target="_blank" class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</a> 1848 {% else %} 1849 <span class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</span> 1850 {% endif %} 1851 {% if thread_item.references_to or thread_item.referenced_by %} 1852 <span class="reference-indicators"> 1853 {% if thread_item.references_to %} 1854 <span class="ref-out" title="References other posts"></span> 1855 {% endif %} 1856 {% if thread_item.referenced_by %} 1857 <span class="ref-in" title="Referenced by other posts"></span> 1858 {% endif %} 1859 </span> 1860 {% endif %} 1861 </div> 1862 <div class="timeline-content"> 1863 <strong class="timeline-title"> 1864 <a href="{{ thread_item.entry.link }}" target="_blank">{{ thread_item.entry.title }}</a> 1865 </strong> 1866 {% if thread_item.entry.summary %} 1867 <span class="timeline-summary">— {{ clean_html_summary(thread_item.entry.summary, 300) }}</span> 1868 {% endif %} 1869 {% if thread_item.shared_references %} 1870 <span class="inline-shared-refs"> 1871 {% for ref in thread_item.shared_references[:3] %} 1872 {% if ref.target_username %} 1873 <a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %} 1874 {% endif %} 1875 {% endfor %} 1876 {% if thread_item.shared_references|length > 3 %} 1877 <span class="shared-ref-more">+{{ thread_item.shared_references|length - 3 }} more</span> 1878 {% endif %} 1879 </span> 1880 {% endif %} 1881 {% 
if thread_item.cross_thread_links %} 1882 <div class="cross-thread-links"> 1883 <span class="cross-thread-indicator">🔗 Also appears: </span> 1884 {% for link in thread_item.cross_thread_links %} 1885 <a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %} 1886 {% endfor %} 1887 </div> 1888 {% endif %} 1889 </div> 1890 </article> 1891 {% endfor %} 1892 {% endif %} 1893 {% endfor %} 1894 </section> 1895</div> 1896{% endblock %} 1897</file> 1898 1899<file path="src/thicket/templates/users.html"> 1900{% extends "base.html" %} 1901 1902{% block page_title %}Users - {{ title }}{% endblock %} 1903 1904{% block content %} 1905<div class="page-content"> 1906 <h2>Users</h2> 1907 <p class="page-description">All users contributing to this thicket, ordered by post count.</p> 1908 1909 {% for user_info in users %} 1910 <article class="user-card"> 1911 <div class="user-header"> 1912 {% if user_info.metadata.icon and user_info.metadata.icon != "None" %} 1913 <img src="{{ user_info.metadata.icon }}" alt="{{ user_info.metadata.username }}" class="user-icon"> 1914 {% endif %} 1915 <div class="user-info"> 1916 <h3> 1917 {% if user_info.metadata.display_name %} 1918 {{ user_info.metadata.display_name }} 1919 <span class="username">({{ user_info.metadata.username }})</span> 1920 {% else %} 1921 {{ user_info.metadata.username }} 1922 {% endif %} 1923 </h3> 1924 <div class="user-meta"> 1925 {% if user_info.metadata.homepage %} 1926 <a href="{{ user_info.metadata.homepage }}" target="_blank">{{ user_info.metadata.homepage }}</a> 1927 {% endif %} 1928 {% if user_info.metadata.email %} 1929 <span class="separator"></span> 1930 <a href="mailto:{{ user_info.metadata.email }}">{{ user_info.metadata.email }}</a> 1931 {% endif %} 1932 <span class="separator"></span> 1933 <span class="post-count">{{ user_info.metadata.entry_count }} posts</span> 1934 </div> 1935 </div> 1936 </div> 1937 1938 {% if 
user_info.recent_entries %} 1939 <div class="user-recent"> 1940 <h4>Recent posts:</h4> 1941 <ul> 1942 {% for display_name, entry in user_info.recent_entries %} 1943 <li> 1944 <a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a> 1945 <time datetime="{{ entry.updated or entry.published }}"> 1946 ({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }}) 1947 </time> 1948 </li> 1949 {% endfor %} 1950 </ul> 1951 </div> 1952 {% endif %} 1953 </article> 1954 {% endfor %} 1955</div> 1956{% endblock %} 1957</file> 1958 1959<file path="README.md"> 1960# Thicket 1961 1962A modern CLI tool for persisting Atom/RSS feeds in Git repositories, designed to enable distributed webblog comment structures. 1963 1964## Features 1965 1966- **Feed Auto-Discovery**: Automatically extracts user metadata from Atom/RSS feeds 1967- **Git Storage**: Stores feed entries in a Git repository with full history 1968- **Duplicate Management**: Manual curation of duplicate entries across feeds 1969- **Modern CLI**: Built with Typer and Rich for beautiful terminal output 1970- **Comprehensive Parsing**: Supports RSS 0.9x, RSS 1.0, RSS 2.0, and Atom feeds 1971- **Cron-Friendly**: Designed for scheduled execution 1972 1973## Installation 1974 1975```bash 1976# Install from source 1977pip install -e . 1978 1979# Or install with dev dependencies 1980pip install -e .[dev] 1981``` 1982 1983## Quick Start 1984 19851. **Initialize a new thicket repository:** 1986```bash 1987thicket init ./my-feeds 1988``` 1989 19902. **Add a user with their feed:** 1991```bash 1992thicket add user "alice" --feed "https://alice.example.com/feed.xml" 1993``` 1994 19953. **Sync feeds to download entries:** 1996```bash 1997thicket sync --all 1998``` 1999 20004. 
**List users and feeds:** 2001```bash 2002thicket list users 2003thicket list feeds 2004thicket list entries 2005``` 2006 2007## Commands 2008 2009### Initialize 2010```bash 2011thicket init <git-store-path> [--cache-dir <path>] [--config <config-file>] 2012``` 2013 2014### Add Users and Feeds 2015```bash 2016# Add user with auto-discovery 2017thicket add user "username" --feed "https://example.com/feed.xml" 2018 2019# Add user with manual metadata 2020thicket add user "username" \ 2021 --feed "https://example.com/feed.xml" \ 2022 --email "user@example.com" \ 2023 --homepage "https://example.com" \ 2024 --display-name "User Name" 2025 2026# Add additional feed to existing user 2027thicket add feed "username" "https://example.com/other-feed.xml" 2028``` 2029 2030### Sync Feeds 2031```bash 2032# Sync all users 2033thicket sync --all 2034 2035# Sync specific user 2036thicket sync --user "username" 2037 2038# Dry run (preview changes) 2039thicket sync --all --dry-run 2040``` 2041 2042### List Information 2043```bash 2044# List all users 2045thicket list users 2046 2047# List all feeds 2048thicket list feeds 2049 2050# List feeds for specific user 2051thicket list feeds --user "username" 2052 2053# List recent entries 2054thicket list entries --limit 20 2055 2056# List entries for specific user 2057thicket list entries --user "username" 2058``` 2059 2060### Manage Duplicates 2061```bash 2062# List duplicate mappings 2063thicket duplicates list 2064 2065# Mark entries as duplicates 2066thicket duplicates add "https://example.com/dup" "https://example.com/canonical" 2067 2068# Remove duplicate mapping 2069thicket duplicates remove "https://example.com/dup" 2070``` 2071 2072## Configuration 2073 2074Thicket uses a YAML configuration file (default: `thicket.yaml`): 2075 2076```yaml 2077git_store: ./feeds-repo 2078cache_dir: ~/.cache/thicket 2079users: 2080 - username: alice 2081 feeds: 2082 - https://alice.example.com/feed.xml 2083 email: alice@example.com 2084 homepage: 
https://alice.example.com 2085 display_name: Alice 2086``` 2087 2088## Git Repository Structure 2089 2090``` 2091feeds-repo/ 2092├── index.json # User directory index 2093├── duplicates.json # Duplicate entry mappings 2094├── alice/ 2095│ ├── metadata.json # User metadata 2096│ ├── entry_id_1.json # Feed entries 2097│ └── entry_id_2.json 2098└── bob/ 2099 └── ... 2100``` 2101 2102## Development 2103 2104### Setup 2105```bash 2106# Install in development mode 2107pip install -e .[dev] 2108 2109# Run tests 2110pytest 2111 2112# Run linting 2113ruff check src/ 2114black --check src/ 2115 2116# Run type checking 2117mypy src/ 2118``` 2119 2120### Architecture 2121 2122- **CLI**: Modern interface with Typer and Rich 2123- **Feed Processing**: Universal parsing with feedparser 2124- **Git Storage**: Structured storage with GitPython 2125- **Data Models**: Pydantic for validation and serialization 2126- **Async HTTP**: httpx for efficient feed fetching 2127 2128## Use Cases 2129 2130- **Blog Aggregation**: Collect and archive blog posts from multiple sources 2131- **Comment Networks**: Enable distributed commenting systems 2132- **Feed Archival**: Preserve feed history beyond typical feed depth limits 2133- **Content Curation**: Manage and deduplicate content across feeds 2134 2135## License 2136 2137MIT License - see LICENSE file for details. 
"""CLI command for building reference index from blog entries."""

import json
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse

import typer
from rich.console import Console
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TextColumn,
)
from rich.table import Table

from ...core.git_store import GitStore
from ...core.reference_parser import ReferenceIndex, ReferenceParser
from ..main import app
from ..utils import get_tsv_mode, load_config

console = Console()


@app.command()
def index(
    config_file: Optional[Path] = typer.Option(
        None,
        "--config",
        "-c",
        help="Path to configuration file",
    ),
    output_file: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Path to output index file (default: updates links.json in git store)",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Show detailed progress information",
    ),
) -> None:
    """Build a reference index showing which blog entries reference others.

    This command analyzes all blog entries to detect cross-references between
    different blogs, creating an index that can be used to build threaded
    views of related content.

    Updates the unified links.json file with reference data.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Initialize Git store
        git_store = GitStore(config.git_store)

        # Initialize reference parser
        parser = ReferenceParser()

        # Build user domain mapping (username -> set of domains that user posts on)
        if verbose:
            console.print("Building user domain mapping...")
        user_domains = parser.build_user_domain_mapping(git_store)

        if verbose:
            console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")

        # Initialize reference index
        ref_index = ReferenceIndex()
        ref_index.user_domains = user_domains

        # Get all users (store_index: avoid shadowing this command's own name)
        store_index = git_store._load_index()
        users = list(store_index.users.keys())

        if not users:
            console.print("[yellow]No users found in Git store[/yellow]")
            # Propagates cleanly: typer.Exit is re-raised below, not swallowed.
            raise typer.Exit(0)

        # Process all entries
        total_entries = 0
        total_references = 0
        all_references = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:

            # Count total entries first so the extraction bar has a real total
            counting_task = progress.add_task("Counting entries...", total=len(users))
            for username in users:
                total_entries += len(git_store.list_entries(username))
                progress.advance(counting_task)

            progress.remove_task(counting_task)

            # Process entries - extract references
            processing_task = progress.add_task(
                f"Extracting references from {total_entries} entries...",
                total=total_entries
            )

            for username in users:
                entries = git_store.list_entries(username)

                for entry in entries:
                    # Extract references from this entry
                    references = parser.extract_references(entry, username, user_domains)

                    all_references.extend(references)

                    progress.advance(processing_task)

                    if verbose and references:
                        console.print(f"  Found {len(references)} references in {username}:{entry.title[:50]}...")

            progress.remove_task(processing_task)

            # Resolve target_entry_ids for references
            if all_references:
                resolve_task = progress.add_task(
                    f"Resolving {len(all_references)} references...",
                    total=len(all_references)
                )

                if verbose:
                    console.print(f"Resolving target entry IDs for {len(all_references)} references...")

                resolved_references = parser.resolve_target_entry_ids(all_references, git_store)

                # Count resolved references
                resolved_count = sum(1 for ref in resolved_references if ref.target_entry_id is not None)
                if verbose:
                    console.print(f"Resolved {resolved_count} out of {len(all_references)} references")

                # Add resolved references to index
                for ref in resolved_references:
                    ref_index.add_reference(ref)
                    total_references += 1
                    progress.advance(resolve_task)

                progress.remove_task(resolve_task)

        # Determine output path
        if output_file:
            output_path = output_file
        else:
            output_path = config.git_store / "links.json"

        # Load existing links data or create new structure.
        # An explicit --output always starts from a fresh structure.
        if output_path.exists() and not output_file:
            # Load existing unified structure
            with open(output_path) as f:
                existing_data = json.load(f)
        else:
            # Create new structure
            existing_data = {
                "links": {},
                "reverse_mapping": {},
                "user_domains": {}
            }

        # Update with reference data
        existing_data["references"] = ref_index.to_dict()["references"]
        existing_data["user_domains"] = {k: list(v) for k, v in user_domains.items()}

        # Save updated structure
        with open(output_path, "w") as f:
            json.dump(existing_data, f, indent=2, default=str)

        # Show summary
        if not get_tsv_mode():
            console.print("\n[green]✓ Reference index built successfully[/green]")

        # Create summary table or TSV output
        if get_tsv_mode():
            print("Metric\tCount")
            print(f"Total Users\t{len(users)}")
            print(f"Total Entries\t{total_entries}")
            print(f"Total References\t{total_references}")
            print(f"Outbound Refs\t{len(ref_index.outbound_refs)}")
            print(f"Inbound Refs\t{len(ref_index.inbound_refs)}")
            print(f"Output File\t{output_path}")
        else:
            table = Table(title="Reference Index Summary")
            table.add_column("Metric", style="cyan")
            table.add_column("Count", style="green")

            table.add_row("Total Users", str(len(users)))
            table.add_row("Total Entries", str(total_entries))
            table.add_row("Total References", str(total_references))
            table.add_row("Outbound Refs", str(len(ref_index.outbound_refs)))
            table.add_row("Inbound Refs", str(len(ref_index.inbound_refs)))
            table.add_row("Output File", str(output_path))

            console.print(table)

        # Show some interesting statistics
        if total_references > 0:
            if not get_tsv_mode():
                console.print("\n[bold]Reference Statistics:[/bold]")

            # Most referenced users
            target_counts = {}
            unresolved_domains = set()

            for ref in ref_index.references:
                if ref.target_username:
                    target_counts[ref.target_username] = target_counts.get(ref.target_username, 0) + 1
                else:
                    # Track unresolved domains
                    domain = urlparse(ref.target_url).netloc.lower()
                    unresolved_domains.add(domain)

            if target_counts:
                if get_tsv_mode():
                    print("Referenced User\tReference Count")
                    for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
                        print(f"{username}\t{count}")
                else:
                    console.print("\nMost referenced users:")
                    for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
                        console.print(f"  {username}: {count} references")

            if unresolved_domains and verbose:
                # Sort first, then slice, so we show the first 10 alphabetically
                # rather than an arbitrary subset of 10.
                if get_tsv_mode():
                    print("Unresolved Domain\tCount")
                    for domain in sorted(unresolved_domains)[:10]:
                        print(f"{domain}\t1")
                    if len(unresolved_domains) > 10:
                        print(f"... and {len(unresolved_domains) - 10} more\t...")
                else:
                    console.print(f"\nUnresolved domains: {len(unresolved_domains)}")
                    for domain in sorted(unresolved_domains)[:10]:
                        console.print(f"  {domain}")
                    if len(unresolved_domains) > 10:
                        console.print(f"  ... and {len(unresolved_domains) - 10} more")

    except typer.Exit:
        # typer.Exit derives from RuntimeError; without this clause the broad
        # handler below would repackage an intentional exit as an error.
        raise
    except Exception as e:
        console.print(f"[red]Error building reference index: {e}[/red]")
        if verbose:
            console.print_exception()
        raise typer.Exit(1)


@app.command()
def threads(
    config_file: Optional[Path] = typer.Option(
        None,
        "--config",
        "-c",
        help="Path to configuration file",
    ),
    index_file: Optional[Path] = typer.Option(
        None,
        "--index",
        "-i",
        help="Path to reference index file (default: links.json in git store)",
    ),
    username: Optional[str] = typer.Option(
        None,
        "--username",
        "-u",
        help="Show threads for specific username only",
    ),
    entry_id: Optional[str] = typer.Option(
        None,
        "--entry",
        "-e",
        help="Show thread for specific entry ID",
    ),
    min_size: int = typer.Option(
        2,
        "--min-size",
        "-m",
        help="Minimum thread size to display",
    ),
) -> None:
    """Show threaded view of related blog entries.

    This command uses the reference index to show which blog entries
    are connected through cross-references, creating an email-style
    threaded view of the conversation.

    Reads reference data from the unified links.json file.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Determine index file path
        if index_file:
            index_path = index_file
        else:
            index_path = config.git_store / "links.json"

        if not index_path.exists():
            console.print(f"[red]Links file not found: {index_path}[/red]")
            console.print("Run 'thicket links' and 'thicket index' first to build the reference index")
            raise typer.Exit(1)

        # Load unified data
        with open(index_path) as f:
            unified_data = json.load(f)

        # Check if references exist in the unified structure
        if "references" not in unified_data:
            console.print(f"[red]No references found in {index_path}[/red]")
            console.print("Run 'thicket index' first to build the reference index")
            raise typer.Exit(1)

        # Extract reference data and reconstruct ReferenceIndex
        ref_index = ReferenceIndex.from_dict({
            "references": unified_data["references"],
            "user_domains": unified_data.get("user_domains", {})
        })

        # Initialize Git store to get entry details
        git_store = GitStore(config.git_store)

        if entry_id and username:
            # Show specific thread
            thread_members = ref_index.get_thread_members(username, entry_id)
            _display_thread(thread_members, ref_index, git_store, f"Thread for {username}:{entry_id}")

        elif username:
            # Show all threads involving this user
            user_index = git_store._load_index()
            user = user_index.get_user(username)
            if not user:
                console.print(f"[red]User not found: {username}[/red]")
                raise typer.Exit(1)

            entries = git_store.list_entries(username)
            threads_found = set()

            console.print(f"[bold]Threads involving {username}:[/bold]\n")

            for entry in entries:
                thread_members = ref_index.get_thread_members(username, entry.id)
                if len(thread_members) >= min_size:
                    # Deduplicate threads by their sorted member tuple
                    thread_key = tuple(sorted(thread_members))
                    if thread_key not in threads_found:
                        threads_found.add(thread_key)
                        _display_thread(thread_members, ref_index, git_store, f"Thread #{len(threads_found)}")

        else:
            # Show all threads
            console.print("[bold]All conversation threads:[/bold]\n")

            all_threads = set()
            processed_entries = set()

            # Get all entries ("owner" avoids shadowing the --username option)
            user_index = git_store._load_index()
            for owner in user_index.users.keys():
                entries = git_store.list_entries(owner)
                for entry in entries:
                    entry_key = (owner, entry.id)
                    if entry_key in processed_entries:
                        continue

                    thread_members = ref_index.get_thread_members(owner, entry.id)
                    if len(thread_members) >= min_size:
                        thread_key = tuple(sorted(thread_members))
                        if thread_key not in all_threads:
                            all_threads.add(thread_key)
                            _display_thread(thread_members, ref_index, git_store, f"Thread #{len(all_threads)}")

                    # Mark all members as processed
                    for member in thread_members:
                        processed_entries.add(member)

            if not all_threads:
                console.print("[yellow]No conversation threads found[/yellow]")
                console.print(f"(minimum thread size: {min_size})")

    except typer.Exit:
        # Let intentional exits (including "not found" above) pass through.
        raise
    except Exception as e:
        console.print(f"[red]Error showing threads: {e}[/red]")
        raise typer.Exit(1)


def _display_thread(thread_members, ref_index, git_store, title):
    """Display a single conversation thread, oldest entry first."""
    console.print(f"[bold cyan]{title}[/bold cyan]")
    console.print(f"Thread size: {len(thread_members)} entries")

    # Get entry details for each member (skip members we can't load)
    thread_entries = []
    for username, entry_id in thread_members:
        entry = git_store.get_entry(username, entry_id)
        if entry:
            thread_entries.append((username, entry))

    # Sort by publication date, falling back to the updated timestamp
    thread_entries.sort(key=lambda x: x[1].published or x[1].updated)

    # Display entries
    for i, (username, entry) in enumerate(thread_entries):
        # Tree connector: "└─" (not "") for the last entry keeps alignment.
        prefix = "├─" if i < len(thread_entries) - 1 else "└─"

        # Get references for this entry
        outbound = ref_index.get_outbound_refs(username, entry.id)
        inbound = ref_index.get_inbound_refs(username, entry.id)

        ref_info = ""
        if outbound or inbound:
            ref_info = f" ({len(outbound)} out, {len(inbound)} in)"

        console.print(f"  {prefix} [{username}] {entry.title[:60]}...{ref_info}")

        if entry.published:
            console.print(f"      Published: {entry.published.strftime('%Y-%m-%d')}")

    console.print()  # Empty line after each thread
@app.command()
def info(
    identifier: str = typer.Argument(
        ...,
        help="The atom ID or URL of the entry to display information about"
    ),
    username: Optional[str] = typer.Option(
        None,
        "--username",
        "-u",
        help="Username to search for the entry (if not provided, searches all users)"
    ),
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"),
        "--config",
        "-c",
        help="Path to configuration file",
    ),
    show_content: bool = typer.Option(
        False,
        "--content",
        help="Include the full content of the entry in the output"
    ),
) -> None:
    """Display detailed information about a specific atom entry.

    You can specify the entry using either its atom ID or URL.
    Shows all metadata for the given entry, including title, dates, categories,
    and summarizes all inbound and outbound links to/from other posts.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Initialize Git store
        git_store = GitStore(config.git_store)

        # Find the entry
        entry = None
        found_username = None

        # Check if identifier looks like a URL
        is_url = identifier.startswith(('http://', 'https://'))

        if username:
            # Search specific username
            if is_url:
                # Search by URL (linear scan over the user's entries)
                entries = git_store.list_entries(username)
                for e in entries:
                    if str(e.link) == identifier:
                        entry = e
                        found_username = username
                        break
            else:
                # Search by atom ID
                entry = git_store.get_entry(username, identifier)
                if entry:
                    found_username = username
        else:
            # Search all users
            index = git_store._load_index()
            for user in index.users.keys():
                if is_url:
                    # Search by URL
                    entries = git_store.list_entries(user)
                    for e in entries:
                        if str(e.link) == identifier:
                            entry = e
                            found_username = user
                            break
                    if entry:
                        break
                else:
                    # Search by atom ID
                    entry = git_store.get_entry(user, identifier)
                    if entry:
                        found_username = user
                        break

        if not entry or not found_username:
            if username:
                console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found for user '{username}'[/red]")
            else:
                console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found in any user's entries[/red]")
            raise typer.Exit(1)

        # Load reference index if available
        links_path = config.git_store / "links.json"
        ref_index = None
        if links_path.exists():
            with open(links_path) as f:
                unified_data = json.load(f)

            # Check if references exist in the unified structure
            if "references" in unified_data:
                ref_index = ReferenceIndex.from_dict({
                    "references": unified_data["references"],
                    "user_domains": unified_data.get("user_domains", {})
                })

        # Display information
        if get_tsv_mode():
            _display_entry_info_tsv(entry, found_username, ref_index, show_content)
        else:
            _display_entry_info(entry, found_username)

            if ref_index:
                _display_link_info(entry, found_username, ref_index)
            else:
                console.print("\n[yellow]No reference index found. Run 'thicket links' and 'thicket index' to build cross-reference data.[/yellow]")

            # Optionally display content
            if show_content and entry.content:
                _display_content(entry.content)

    except typer.Exit:
        # typer.Exit derives from RuntimeError; without this clause the
        # not-found exit above would be reported as a spurious error.
        raise
    except Exception as e:
        console.print(f"[red]Error displaying entry info: {e}[/red]")
        raise typer.Exit(1)


def _display_entry_info(entry, username: str) -> None:
    """Display basic entry information in a structured panel."""

    # Create main info panel
    info_table = Table.grid(padding=(0, 2))
    info_table.add_column("Field", style="cyan bold", width=15)
    info_table.add_column("Value", style="white")

    info_table.add_row("User", f"[green]{username}[/green]")
    info_table.add_row("Atom ID", f"[blue]{entry.id}[/blue]")
    info_table.add_row("Title", entry.title)
    info_table.add_row("Link", str(entry.link))

    if entry.published:
        info_table.add_row("Published", entry.published.strftime("%Y-%m-%d %H:%M:%S UTC"))

    info_table.add_row("Updated", entry.updated.strftime("%Y-%m-%d %H:%M:%S UTC"))

    if entry.summary:
        # Truncate long summaries
        summary = entry.summary[:200] + "..." if len(entry.summary) > 200 else entry.summary
        info_table.add_row("Summary", summary)

    if entry.categories:
        categories_text = ", ".join(entry.categories)
        info_table.add_row("Categories", categories_text)

    if entry.author:
        author_info = []
        if "name" in entry.author:
            author_info.append(entry.author["name"])
        if "email" in entry.author:
            author_info.append(f"<{entry.author['email']}>")
        if author_info:
            info_table.add_row("Author", " ".join(author_info))

    if entry.content_type:
        info_table.add_row("Content Type", entry.content_type)

    if entry.rights:
        info_table.add_row("Rights", entry.rights)

    if entry.source:
        info_table.add_row("Source Feed", entry.source)

    panel = Panel(
        info_table,
        title="[bold]Entry Information[/bold]",  # plain string; no placeholders needed
        border_style="blue"
    )

    console.print(panel)


def _display_link_info(entry, username: str, ref_index: ReferenceIndex) -> None:
    """Display inbound and outbound link information."""

    # Get links
    outbound_refs = ref_index.get_outbound_refs(username, entry.id)
    inbound_refs = ref_index.get_inbound_refs(username, entry.id)

    if not outbound_refs and not inbound_refs:
        console.print("\n[dim]No cross-references found for this entry.[/dim]")
        return

    # Create links table
    links_table = Table(title="Cross-References")
    links_table.add_column("Direction", style="cyan", width=10)
    links_table.add_column("Target/Source", style="green", width=20)
    links_table.add_column("URL", style="blue", width=50)

    # Add outbound references
    for ref in outbound_refs:
        target_info = f"{ref.target_username}:{ref.target_entry_id}" if ref.target_username and ref.target_entry_id else "External"
        links_table.add_row("→ Out", target_info, ref.target_url)

    # Add inbound references
    for ref in inbound_refs:
        source_info = f"{ref.source_username}:{ref.source_entry_id}"
        links_table.add_row("← In", source_info, ref.target_url)

    console.print()
    console.print(links_table)

    # Summary
    console.print(f"\n[bold]Summary:[/bold] {len(outbound_refs)} outbound, {len(inbound_refs)} inbound references")


def _display_content(content: str) -> None:
    """Display the full content of the entry."""

    # Truncate very long content
    display_content = content
    if len(content) > 5000:
        display_content = content[:5000] + "\n\n[... content truncated ...]"

    panel = Panel(
        display_content,
        title="[bold]Entry Content[/bold]",
        border_style="green",
        expand=False
    )

    console.print()
    console.print(panel)


def _display_entry_info_tsv(entry, username: str, ref_index: Optional[ReferenceIndex], show_content: bool) -> None:
    """Display entry information in TSV format (tabs/newlines escaped)."""

    # Basic info
    print("Field\tValue")
    print(f"User\t{username}")
    print(f"Atom ID\t{entry.id}")
    print(f"Title\t{entry.title.replace(chr(9), ' ').replace(chr(10), ' ').replace(chr(13), ' ')}")
    print(f"Link\t{entry.link}")

    if entry.published:
        print(f"Published\t{entry.published.strftime('%Y-%m-%d %H:%M:%S UTC')}")

    print(f"Updated\t{entry.updated.strftime('%Y-%m-%d %H:%M:%S UTC')}")

    if entry.summary:
        # Escape tabs and newlines in summary
        summary = entry.summary.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
        print(f"Summary\t{summary}")

    if entry.categories:
        print(f"Categories\t{', '.join(entry.categories)}")

    if entry.author:
        author_info = []
        if "name" in entry.author:
            author_info.append(entry.author["name"])
        if "email" in entry.author:
            author_info.append(f"<{entry.author['email']}>")
        if author_info:
            print(f"Author\t{' '.join(author_info)}")

    if entry.content_type:
        print(f"Content Type\t{entry.content_type}")

    if entry.rights:
        print(f"Rights\t{entry.rights}")

    if entry.source:
        print(f"Source Feed\t{entry.source}")

    # Add reference info if available
    if ref_index:
        outbound_refs = ref_index.get_outbound_refs(username, entry.id)
        inbound_refs = ref_index.get_inbound_refs(username, entry.id)

        print(f"Outbound References\t{len(outbound_refs)}")
        print(f"Inbound References\t{len(inbound_refs)}")

        # Show each reference
        for ref in outbound_refs:
            target_info = f"{ref.target_username}:{ref.target_entry_id}" if ref.target_username and ref.target_entry_id else "External"
            print(f"Outbound Reference\t{target_info}\t{ref.target_url}")

        for ref in inbound_refs:
            source_info = f"{ref.source_username}:{ref.source_entry_id}"
            print(f"Inbound Reference\t{source_info}\t{ref.target_url}")

    # Show content if requested
    if show_content and entry.content:
        # Escape tabs and newlines in content
        content = entry.content.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
        print(f"Content\t{content}")
force: bool = typer.Option( 2909 False, "--force", "-f", help="Overwrite existing configuration" 2910 ), 2911) -> None: 2912 """Initialize a new thicket configuration and Git store.""" 2913 2914 # Set default paths 2915 if cache_dir is None: 2916 from platformdirs import user_cache_dir 2917 cache_dir = Path(user_cache_dir("thicket")) 2918 2919 if config_file is None: 2920 config_file = Path("thicket.yaml") 2921 2922 # Check if config already exists 2923 if config_file.exists() and not force: 2924 print_error(f"Configuration file already exists: {config_file}") 2925 print_error("Use --force to overwrite") 2926 raise typer.Exit(1) 2927 2928 # Create cache directory 2929 cache_dir.mkdir(parents=True, exist_ok=True) 2930 2931 # Create Git store 2932 try: 2933 GitStore(git_store) 2934 print_success(f"Initialized Git store at: {git_store}") 2935 except Exception as e: 2936 print_error(f"Failed to initialize Git store: {e}") 2937 raise typer.Exit(1) from e 2938 2939 # Create configuration 2940 try: 2941 config = ThicketConfig( 2942 git_store=git_store, 2943 cache_dir=cache_dir, 2944 users=[] 2945 ) 2946 2947 save_config(config, config_file) 2948 print_success(f"Created configuration file: {config_file}") 2949 2950 except ValidationError as e: 2951 print_error(f"Invalid configuration: {e}") 2952 raise typer.Exit(1) from e 2953 except Exception as e: 2954 print_error(f"Failed to create configuration: {e}") 2955 raise typer.Exit(1) from e 2956 2957 print_success("Thicket initialized successfully!") 2958 print_success(f"Git store: {git_store}") 2959 print_success(f"Cache directory: {cache_dir}") 2960 print_success(f"Configuration: {config_file}") 2961 print_success("Run 'thicket add user' to add your first user and feed.") 2962</file> 2963 2964<file path="src/thicket/cli/__init__.py"> 2965"""CLI interface for thicket.""" 2966 2967from .main import app 2968 2969__all__ = ["app"] 2970</file> 2971 2972<file path="src/thicket/core/__init__.py"> 2973"""Core business logic for 
class FeedParser:
    """Parser for RSS/Atom feeds with normalization and auto-discovery.

    Fetches feeds over HTTP(S), normalizes RSS/Atom entries into
    ``AtomEntry`` objects, sanitizes embedded HTML with bleach, and
    extracts feed-level metadata for auto-discovery.
    """

    def __init__(self, user_agent: str = "thicket/0.1.0"):
        """Initialize the feed parser.

        Args:
            user_agent: User-Agent header sent with feed requests.
        """
        self.user_agent = user_agent
        # Whitelists passed to bleach.clean(); anything else is stripped.
        self.allowed_tags = [
            "a", "abbr", "acronym", "b", "blockquote", "br", "code", "em",
            "i", "li", "ol", "p", "pre", "strong", "ul", "h1", "h2", "h3",
            "h4", "h5", "h6", "img", "div", "span",
        ]
        self.allowed_attributes = {
            "a": ["href", "title"],
            "abbr": ["title"],
            "acronym": ["title"],
            "img": ["src", "alt", "title", "width", "height"],
            "blockquote": ["cite"],
        }

    async def fetch_feed(self, url: HttpUrl) -> str:
        """Fetch feed content from URL.

        Follows redirects with a 30s timeout; raises httpx.HTTPStatusError
        on non-2xx responses.
        """
        async with httpx.AsyncClient() as client:
            response = await client.get(
                str(url),
                headers={"User-Agent": self.user_agent},
                timeout=30.0,
                follow_redirects=True,
            )
            response.raise_for_status()
            return response.text

    def parse_feed(self, content: str, source_url: Optional[HttpUrl] = None) -> tuple[FeedMetadata, list[AtomEntry]]:
        """Parse feed content and return metadata and entries.

        Malformed ("bozo") feeds are processed best-effort; entries that
        fail normalization are skipped with a diagnostic message.
        """
        parsed = feedparser.parse(content)

        if parsed.bozo and parsed.bozo_exception:
            # Try to continue with potentially malformed feed
            pass

        # Extract feed metadata
        feed_meta = self._extract_feed_metadata(parsed.feed)

        # Extract and normalize entries
        entries = []
        for entry in parsed.entries:
            try:
                atom_entry = self._normalize_entry(entry, source_url)
                entries.append(atom_entry)
            except Exception as e:
                # Log error but continue processing other entries
                print(f"Error processing entry {getattr(entry, 'id', 'unknown')}: {e}")
                continue

        return feed_meta, entries

    def _extract_feed_metadata(self, feed: feedparser.FeedParserDict) -> FeedMetadata:
        """Extract metadata from feed for auto-discovery.

        All URL fields are validated defensively: a malformed URL in the
        feed yields None for that field instead of raising.
        """
        # Parse author information
        author_name = None
        author_email = None
        author_uri = None

        if hasattr(feed, 'author_detail'):
            author_name = feed.author_detail.get('name')
            author_email = feed.author_detail.get('email')
            author_uri = feed.author_detail.get('href')
        elif hasattr(feed, 'author'):
            author_name = feed.author

        # Parse managing editor for RSS feeds
        if not author_email and hasattr(feed, 'managingEditor'):
            author_email = feed.managingEditor

        # Validate the author URI like every other URL here: a bad value
        # must not abort metadata extraction for the whole feed.
        author_uri_url = None
        if author_uri:
            try:
                author_uri_url = HttpUrl(author_uri)
            except ValidationError:
                author_uri_url = None

        # Parse feed link
        feed_link = None
        if hasattr(feed, 'link'):
            try:
                feed_link = HttpUrl(feed.link)
            except ValidationError:
                pass

        # Parse image/icon/logo
        logo = None
        icon = None
        image_url = None

        if hasattr(feed, 'image'):
            try:
                image_url = HttpUrl(feed.image.get('href', feed.image.get('url', '')))
            except (ValidationError, AttributeError):
                pass

        if hasattr(feed, 'icon'):
            try:
                icon = HttpUrl(feed.icon)
            except ValidationError:
                pass

        if hasattr(feed, 'logo'):
            try:
                logo = HttpUrl(feed.logo)
            except ValidationError:
                pass

        return FeedMetadata(
            title=getattr(feed, 'title', None),
            author_name=author_name,
            author_email=author_email,
            author_uri=author_uri_url,
            link=feed_link,
            logo=logo,
            icon=icon,
            image_url=image_url,
            description=getattr(feed, 'description', None),
        )

    def _normalize_entry(self, entry: feedparser.FeedParserDict, source_url: Optional[HttpUrl] = None) -> AtomEntry:
        """Normalize an entry to Atom format.

        Raises pydantic.ValidationError if the entry has no usable link;
        parse_feed() catches and skips such entries.
        """
        # Parse timestamps
        updated = self._parse_timestamp(entry.get('updated_parsed') or entry.get('published_parsed'))
        published = self._parse_timestamp(entry.get('published_parsed'))

        # Parse content
        content = self._extract_content(entry)
        content_type = self._extract_content_type(entry)

        # Parse author
        author = self._extract_author(entry)

        # Parse categories/tags
        categories = []
        if hasattr(entry, 'tags'):
            categories = [tag.get('term', '') for tag in entry.tags if tag.get('term')]

        # Sanitize HTML content
        if content:
            content = self._sanitize_html(content)

        summary = entry.get('summary', '')
        if summary:
            summary = self._sanitize_html(summary)

        return AtomEntry(
            id=entry.get('id', entry.get('link', '')),
            title=entry.get('title', ''),
            link=HttpUrl(entry.get('link', '')),
            updated=updated,
            published=published,
            summary=summary or None,
            content=content or None,
            content_type=content_type,
            author=author,
            categories=categories,
            rights=entry.get('rights', None),
            source=str(source_url) if source_url else None,
        )

    def _parse_timestamp(self, time_struct) -> datetime:
        """Parse feedparser time struct to datetime.

        NOTE(review): produces naive datetimes and falls back to "now"
        when the feed omits the timestamp — confirm downstream consumers
        never mix these with timezone-aware values.
        """
        if time_struct:
            return datetime(*time_struct[:6])
        return datetime.now()

    def _extract_content(self, entry: feedparser.FeedParserDict) -> Optional[str]:
        """Extract the best content from an entry."""
        # Prefer content over summary
        if hasattr(entry, 'content') and entry.content:
            # Find the best content (prefer text/html, then text/plain)
            for content_item in entry.content:
                if content_item.get('type') in ['text/html', 'html']:
                    return content_item.get('value', '')
                elif content_item.get('type') in ['text/plain', 'text']:
                    return content_item.get('value', '')
            # Fallback to first content item
            return entry.content[0].get('value', '')

        # Fallback to summary
        return entry.get('summary', '')

    def _extract_content_type(self, entry: feedparser.FeedParserDict) -> str:
        """Extract content type from entry, normalized to html/text/xhtml."""
        if hasattr(entry, 'content') and entry.content:
            content_type = entry.content[0].get('type', 'html')
            # Normalize content type
            if content_type in ['text/html', 'html']:
                return 'html'
            elif content_type in ['text/plain', 'text']:
                return 'text'
            elif content_type == 'xhtml':
                return 'xhtml'
        return 'html'

    def _extract_author(self, entry: feedparser.FeedParserDict) -> Optional[dict]:
        """Extract author information from entry.

        Only fields that are actually present are included; a dict of
        None values would break consumers that join the author parts.
        """
        author = {}

        if hasattr(entry, 'author_detail'):
            for key, value in (
                ('name', entry.author_detail.get('name')),
                ('email', entry.author_detail.get('email')),
                ('uri', entry.author_detail.get('href')),
            ):
                if value is not None:
                    author[key] = value
        elif hasattr(entry, 'author'):
            author['name'] = entry.author

        return author if author else None

    def _sanitize_html(self, html: str) -> str:
        """Sanitize HTML content to prevent XSS."""
        return bleach.clean(
            html,
            tags=self.allowed_tags,
            attributes=self.allowed_attributes,
            strip=True,
        )

    def sanitize_entry_id(self, entry_id: str) -> str:
        """Sanitize entry ID to be a safe filename.

        Keeps alphanumerics and '-_.'; everything else becomes '_'.
        Result is non-empty and at most 200 characters.
        """
        # Parse URL to get meaningful parts
        parsed = urlparse(entry_id)

        # Start with the path component
        if parsed.path:
            # Remove leading slash and replace problematic characters
            safe_id = parsed.path.lstrip('/').replace('/', '_').replace('\\', '_')
        else:
            # Use the entire ID as fallback
            safe_id = entry_id

        # Replace problematic characters
        safe_chars = []
        for char in safe_id:
            if char.isalnum() or char in '-_.':
                safe_chars.append(char)
            else:
                safe_chars.append('_')

        safe_id = ''.join(safe_chars)

        # Ensure it's not too long (max 200 chars)
        if len(safe_id) > 200:
            safe_id = safe_id[:200]

        # Ensure it's not empty
        if not safe_id:
            safe_id = "entry"

        return safe_id
target_url=data["target_url"], 3296 target_username=data.get("target_username"), 3297 target_entry_id=data.get("target_entry_id"), 3298 ) 3299 3300 3301class ReferenceIndex: 3302 """Index of blog-to-blog references for creating threaded views.""" 3303 3304 def __init__(self): 3305 self.references: list[BlogReference] = [] 3306 self.outbound_refs: dict[ 3307 str, list[BlogReference] 3308 ] = {} # entry_id -> outbound refs 3309 self.inbound_refs: dict[ 3310 str, list[BlogReference] 3311 ] = {} # entry_id -> inbound refs 3312 self.user_domains: dict[str, set[str]] = {} # username -> set of domains 3313 3314 def add_reference(self, ref: BlogReference) -> None: 3315 """Add a reference to the index.""" 3316 self.references.append(ref) 3317 3318 # Update outbound references 3319 source_key = f"{ref.source_username}:{ref.source_entry_id}" 3320 if source_key not in self.outbound_refs: 3321 self.outbound_refs[source_key] = [] 3322 self.outbound_refs[source_key].append(ref) 3323 3324 # Update inbound references if we can identify the target 3325 if ref.target_username and ref.target_entry_id: 3326 target_key = f"{ref.target_username}:{ref.target_entry_id}" 3327 if target_key not in self.inbound_refs: 3328 self.inbound_refs[target_key] = [] 3329 self.inbound_refs[target_key].append(ref) 3330 3331 def get_outbound_refs(self, username: str, entry_id: str) -> list[BlogReference]: 3332 """Get all outbound references from an entry.""" 3333 key = f"{username}:{entry_id}" 3334 return self.outbound_refs.get(key, []) 3335 3336 def get_inbound_refs(self, username: str, entry_id: str) -> list[BlogReference]: 3337 """Get all inbound references to an entry.""" 3338 key = f"{username}:{entry_id}" 3339 return self.inbound_refs.get(key, []) 3340 3341 def get_thread_members(self, username: str, entry_id: str) -> set[tuple[str, str]]: 3342 """Get all entries that are part of the same thread.""" 3343 visited = set() 3344 to_visit = [(username, entry_id)] 3345 thread_members = set() 3346 3347 
while to_visit: 3348 current_user, current_entry = to_visit.pop() 3349 if (current_user, current_entry) in visited: 3350 continue 3351 3352 visited.add((current_user, current_entry)) 3353 thread_members.add((current_user, current_entry)) 3354 3355 # Add outbound references 3356 for ref in self.get_outbound_refs(current_user, current_entry): 3357 if ref.target_username and ref.target_entry_id: 3358 to_visit.append((ref.target_username, ref.target_entry_id)) 3359 3360 # Add inbound references 3361 for ref in self.get_inbound_refs(current_user, current_entry): 3362 to_visit.append((ref.source_username, ref.source_entry_id)) 3363 3364 return thread_members 3365 3366 def to_dict(self) -> dict: 3367 """Convert to dictionary for JSON serialization.""" 3368 return { 3369 "references": [ref.to_dict() for ref in self.references], 3370 "user_domains": {k: list(v) for k, v in self.user_domains.items()}, 3371 } 3372 3373 @classmethod 3374 def from_dict(cls, data: dict) -> "ReferenceIndex": 3375 """Create from dictionary.""" 3376 index = cls() 3377 for ref_data in data.get("references", []): 3378 ref = BlogReference.from_dict(ref_data) 3379 index.add_reference(ref) 3380 3381 for username, domains in data.get("user_domains", {}).items(): 3382 index.user_domains[username] = set(domains) 3383 3384 return index 3385 3386 3387class ReferenceParser: 3388 """Parses blog entries to detect references to other blogs.""" 3389 3390 def __init__(self): 3391 # Common blog platforms and patterns 3392 self.blog_patterns = [ 3393 r"https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*", # Common blog domains 3394 r"https?://[^/]+\.github\.io/.*", # GitHub Pages 3395 r"https?://[^/]+\.substack\.com/.*", # Substack 3396 r"https?://medium\.com/.*", # Medium 3397 r"https?://[^/]+\.wordpress\.com/.*", # WordPress.com 3398 r"https?://[^/]+\.blogspot\.com/.*", # Blogger 3399 ] 3400 3401 # Compile regex patterns 3402 self.link_pattern = re.compile( 3403 r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', 
def extract_links_from_html(self, html_content: str) -> list[tuple[str, str]]:
    """Return (url, plain-text label) pairs for every <a href="..."> anchor in the HTML."""
    results: list[tuple[str, str]] = []
    for anchor in self.link_pattern.finditer(html_content):
        href = anchor.group(1)
        # Strip any nested markup from the anchor's inner text.
        label = re.sub(r"<[^>]+>", "", anchor.group(2)).strip()
        results.append((href, label))
    return results

def is_blog_url(self, url: str) -> bool:
    """Heuristically decide whether *url* looks like a blog post (domain pattern match)."""
    return any(re.match(pattern, url) for pattern in self.blog_patterns)
def resolve_target_user(
    self, url: str, user_domains: dict[str, set[str]]
) -> Optional[str]:
    """Map *url*'s host to a known username via the domain table, or None if unowned."""
    host = urlparse(url).netloc.lower()
    # First user (in dict order) whose domain set contains the host wins.
    matches = (name for name, domains in user_domains.items() if host in domains)
    return next(matches, None)
target_url=url, 3524 target_username=target_username, 3525 target_entry_id=None, # Will be resolved later if possible 3526 ) 3527 3528 references.append(ref) 3529 3530 return references 3531 3532 def build_user_domain_mapping(self, git_store: "GitStore") -> dict[str, set[str]]: 3533 """Build mapping of usernames to their known domains.""" 3534 user_domains = {} 3535 index = git_store._load_index() 3536 3537 for username, user_metadata in index.users.items(): 3538 domains = set() 3539 3540 # Add domains from feeds 3541 for feed_url in user_metadata.feeds: 3542 domain = urlparse(feed_url).netloc.lower() 3543 if domain: 3544 domains.add(domain) 3545 3546 # Add domain from homepage 3547 if user_metadata.homepage: 3548 domain = urlparse(str(user_metadata.homepage)).netloc.lower() 3549 if domain: 3550 domains.add(domain) 3551 3552 user_domains[username] = domains 3553 3554 return user_domains 3555 3556 def _build_url_to_entry_mapping(self, git_store: "GitStore") -> dict[str, str]: 3557 """Build a comprehensive mapping from URLs to entry IDs using git store data. 
3558 3559 This creates a bidirectional mapping that handles: 3560 - Entry link URLs -> Entry IDs 3561 - URL variations (with/without www, http/https) 3562 - Multiple URLs pointing to the same entry 3563 """ 3564 url_to_entry: dict[str, str] = {} 3565 3566 # Load index to get all users 3567 index = git_store._load_index() 3568 3569 for username in index.users.keys(): 3570 entries = git_store.list_entries(username) 3571 3572 for entry in entries: 3573 if entry.link: 3574 link_url = str(entry.link) 3575 entry_id = entry.id 3576 3577 # Map the canonical link URL 3578 url_to_entry[link_url] = entry_id 3579 3580 # Handle common URL variations 3581 parsed = urlparse(link_url) 3582 if parsed.netloc and parsed.path: 3583 # Add version without www 3584 if parsed.netloc.startswith('www.'): 3585 no_www_url = f"{parsed.scheme}://{parsed.netloc[4:]}{parsed.path}" 3586 if parsed.query: 3587 no_www_url += f"?{parsed.query}" 3588 if parsed.fragment: 3589 no_www_url += f"#{parsed.fragment}" 3590 url_to_entry[no_www_url] = entry_id 3591 3592 # Add version with www if not present 3593 elif not parsed.netloc.startswith('www.'): 3594 www_url = f"{parsed.scheme}://www.{parsed.netloc}{parsed.path}" 3595 if parsed.query: 3596 www_url += f"?{parsed.query}" 3597 if parsed.fragment: 3598 www_url += f"#{parsed.fragment}" 3599 url_to_entry[www_url] = entry_id 3600 3601 # Add http/https variations 3602 if parsed.scheme == 'https': 3603 http_url = link_url.replace('https://', 'http://', 1) 3604 url_to_entry[http_url] = entry_id 3605 elif parsed.scheme == 'http': 3606 https_url = link_url.replace('http://', 'https://', 1) 3607 url_to_entry[https_url] = entry_id 3608 3609 return url_to_entry 3610 3611 def _normalize_url(self, url: str) -> str: 3612 """Normalize URL for consistent matching. 3613 3614 Handles common variations like trailing slashes, fragments, etc. 
3615 """ 3616 parsed = urlparse(url) 3617 3618 # Remove trailing slash from path 3619 path = parsed.path.rstrip('/') if parsed.path != '/' else parsed.path 3620 3621 # Reconstruct without fragment for consistent matching 3622 normalized = f"{parsed.scheme}://{parsed.netloc}{path}" 3623 if parsed.query: 3624 normalized += f"?{parsed.query}" 3625 3626 return normalized 3627 3628 def resolve_target_entry_ids( 3629 self, references: list[BlogReference], git_store: "GitStore" 3630 ) -> list[BlogReference]: 3631 """Resolve target_entry_id for references using comprehensive URL mapping.""" 3632 resolved_refs = [] 3633 3634 # Build comprehensive URL to entry ID mapping 3635 url_to_entry = self._build_url_to_entry_mapping(git_store) 3636 3637 for ref in references: 3638 # If we already have a target_entry_id, keep the reference as-is 3639 if ref.target_entry_id is not None: 3640 resolved_refs.append(ref) 3641 continue 3642 3643 # If we don't have a target_username, we can't resolve it 3644 if ref.target_username is None: 3645 resolved_refs.append(ref) 3646 continue 3647 3648 # Try to resolve using URL mapping 3649 resolved_entry_id = None 3650 3651 # First, try exact match 3652 if ref.target_url in url_to_entry: 3653 resolved_entry_id = url_to_entry[ref.target_url] 3654 else: 3655 # Try normalized URL matching 3656 normalized_target = self._normalize_url(ref.target_url) 3657 if normalized_target in url_to_entry: 3658 resolved_entry_id = url_to_entry[normalized_target] 3659 else: 3660 # Try URL variations 3661 for mapped_url, entry_id in url_to_entry.items(): 3662 if self._normalize_url(mapped_url) == normalized_target: 3663 resolved_entry_id = entry_id 3664 break 3665 3666 # Verify the resolved entry belongs to the target username 3667 if resolved_entry_id: 3668 # Double-check by loading the actual entry 3669 entries = git_store.list_entries(ref.target_username) 3670 entry_found = any(entry.id == resolved_entry_id for entry in entries) 3671 if not entry_found: 3672 
resolved_entry_id = None 3673 3674 # Create a new reference with the resolved target_entry_id 3675 resolved_ref = BlogReference( 3676 source_entry_id=ref.source_entry_id, 3677 source_username=ref.source_username, 3678 target_url=ref.target_url, 3679 target_username=ref.target_username, 3680 target_entry_id=resolved_entry_id, 3681 ) 3682 resolved_refs.append(resolved_ref) 3683 3684 return resolved_refs 3685</file> 3686 3687<file path="src/thicket/models/__init__.py"> 3688"""Data models for thicket.""" 3689 3690from .config import ThicketConfig, UserConfig 3691from .feed import AtomEntry, DuplicateMap, FeedMetadata 3692from .user import GitStoreIndex, UserMetadata 3693 3694__all__ = [ 3695 "ThicketConfig", 3696 "UserConfig", 3697 "AtomEntry", 3698 "DuplicateMap", 3699 "FeedMetadata", 3700 "GitStoreIndex", 3701 "UserMetadata", 3702] 3703</file> 3704 3705<file path="src/thicket/models/feed.py"> 3706"""Feed and entry models for thicket.""" 3707 3708from datetime import datetime 3709from typing import TYPE_CHECKING, Optional 3710 3711from pydantic import BaseModel, ConfigDict, EmailStr, HttpUrl 3712 3713if TYPE_CHECKING: 3714 from .config import UserConfig 3715 3716 3717class AtomEntry(BaseModel): 3718 """Represents an Atom feed entry stored in the Git repository.""" 3719 3720 model_config = ConfigDict( 3721 json_encoders={datetime: lambda v: v.isoformat()}, 3722 str_strip_whitespace=True, 3723 ) 3724 3725 id: str # Original Atom ID 3726 title: str 3727 link: HttpUrl 3728 updated: datetime 3729 published: Optional[datetime] = None 3730 summary: Optional[str] = None 3731 content: Optional[str] = None # Full body content from Atom entry 3732 content_type: Optional[str] = "html" # text, html, xhtml 3733 author: Optional[dict] = None 3734 categories: list[str] = [] 3735 rights: Optional[str] = None # Copyright info 3736 source: Optional[str] = None # Source feed URL 3737 3738 3739class FeedMetadata(BaseModel): 3740 """Metadata extracted from a feed for auto-discovery.""" 
3741 3742 title: Optional[str] = None 3743 author_name: Optional[str] = None 3744 author_email: Optional[EmailStr] = None 3745 author_uri: Optional[HttpUrl] = None 3746 link: Optional[HttpUrl] = None 3747 logo: Optional[HttpUrl] = None 3748 icon: Optional[HttpUrl] = None 3749 image_url: Optional[HttpUrl] = None 3750 description: Optional[str] = None 3751 3752 def to_user_config(self, username: str, feed_url: HttpUrl) -> "UserConfig": 3753 """Convert discovered metadata to UserConfig with fallbacks.""" 3754 from .config import UserConfig 3755 3756 return UserConfig( 3757 username=username, 3758 feeds=[feed_url], 3759 display_name=self.author_name or self.title, 3760 email=self.author_email, 3761 homepage=self.author_uri or self.link, 3762 icon=self.logo or self.icon or self.image_url, 3763 ) 3764 3765 3766class DuplicateMap(BaseModel): 3767 """Maps duplicate entry IDs to canonical entry IDs.""" 3768 3769 duplicates: dict[str, str] = {} # duplicate_id -> canonical_id 3770 comment: str = "Entry IDs that map to the same canonical content" 3771 3772 def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None: 3773 """Add a duplicate mapping.""" 3774 self.duplicates[duplicate_id] = canonical_id 3775 3776 def remove_duplicate(self, duplicate_id: str) -> bool: 3777 """Remove a duplicate mapping. 
Returns True if existed.""" 3778 return self.duplicates.pop(duplicate_id, None) is not None 3779 3780 def get_canonical(self, entry_id: str) -> str: 3781 """Get canonical ID for an entry (returns original if not duplicate).""" 3782 return self.duplicates.get(entry_id, entry_id) 3783 3784 def is_duplicate(self, entry_id: str) -> bool: 3785 """Check if entry ID is marked as duplicate.""" 3786 return entry_id in self.duplicates 3787 3788 def get_duplicates_for_canonical(self, canonical_id: str) -> list[str]: 3789 """Get all duplicate IDs that map to a canonical ID.""" 3790 return [ 3791 duplicate_id 3792 for duplicate_id, canonical in self.duplicates.items() 3793 if canonical == canonical_id 3794 ] 3795</file> 3796 3797<file path="src/thicket/models/user.py"> 3798"""User metadata models for thicket.""" 3799 3800from datetime import datetime 3801from typing import Optional 3802 3803from pydantic import BaseModel, ConfigDict 3804 3805 3806class UserMetadata(BaseModel): 3807 """Metadata about a user stored in the Git repository.""" 3808 3809 model_config = ConfigDict( 3810 json_encoders={datetime: lambda v: v.isoformat()}, 3811 str_strip_whitespace=True, 3812 ) 3813 3814 username: str 3815 display_name: Optional[str] = None 3816 email: Optional[str] = None 3817 homepage: Optional[str] = None 3818 icon: Optional[str] = None 3819 feeds: list[str] = [] 3820 directory: str # Directory name in Git store 3821 created: datetime 3822 last_updated: datetime 3823 entry_count: int = 0 3824 3825 def update_timestamp(self) -> None: 3826 """Update the last_updated timestamp to now.""" 3827 self.last_updated = datetime.now() 3828 3829 def increment_entry_count(self, count: int = 1) -> None: 3830 """Increment the entry count by the given amount.""" 3831 self.entry_count += count 3832 self.update_timestamp() 3833 3834 3835class GitStoreIndex(BaseModel): 3836 """Index of all users and their directories in the Git store.""" 3837 3838 model_config = ConfigDict( 3839 json_encoders={datetime: 
lambda v: v.isoformat()} 3840 ) 3841 3842 users: dict[str, UserMetadata] = {} # username -> UserMetadata 3843 created: datetime 3844 last_updated: datetime 3845 total_entries: int = 0 3846 3847 def add_user(self, user_metadata: UserMetadata) -> None: 3848 """Add or update a user in the index.""" 3849 self.users[user_metadata.username] = user_metadata 3850 self.last_updated = datetime.now() 3851 3852 def remove_user(self, username: str) -> bool: 3853 """Remove a user from the index. Returns True if user existed.""" 3854 if username in self.users: 3855 del self.users[username] 3856 self.last_updated = datetime.now() 3857 return True 3858 return False 3859 3860 def get_user(self, username: str) -> Optional[UserMetadata]: 3861 """Get user metadata by username.""" 3862 return self.users.get(username) 3863 3864 def update_entry_count(self, username: str, count: int) -> None: 3865 """Update entry count for a user and total.""" 3866 user = self.get_user(username) 3867 if user: 3868 user.increment_entry_count(count) 3869 self.total_entries += count 3870 self.last_updated = datetime.now() 3871 3872 def recalculate_totals(self) -> None: 3873 """Recalculate total entries from all users.""" 3874 self.total_entries = sum(user.entry_count for user in self.users.values()) 3875 self.last_updated = datetime.now() 3876</file> 3877 3878<file path="src/thicket/utils/__init__.py"> 3879"""Utility modules for thicket.""" 3880 3881# This module will contain shared utilities 3882# For now, it's empty but can be expanded with common functions 3883</file> 3884 3885<file path="src/thicket/__init__.py"> 3886"""Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories.""" 3887 3888__version__ = "0.1.0" 3889__author__ = "thicket" 3890__email__ = "thicket@example.com" 3891</file> 3892 3893<file path="src/thicket/__main__.py"> 3894"""Entry point for running thicket as a module.""" 3895 3896from .cli.main import app 3897 3898if __name__ == "__main__": 3899 app() 3900</file> 3901 
3902<file path=".gitignore"> 3903# Byte-compiled / optimized / DLL files 3904__pycache__/ 3905*.py[codz] 3906*$py.class 3907 3908# C extensions 3909*.so 3910 3911# Distribution / packaging 3912.Python 3913build/ 3914develop-eggs/ 3915dist/ 3916downloads/ 3917eggs/ 3918.eggs/ 3919lib/ 3920lib64/ 3921parts/ 3922sdist/ 3923var/ 3924wheels/ 3925share/python-wheels/ 3926*.egg-info/ 3927.installed.cfg 3928*.egg 3929MANIFEST 3930 3931# PyInstaller 3932# Usually these files are written by a python script from a template 3933# before PyInstaller builds the exe, so as to inject date/other infos into it. 3934*.manifest 3935*.spec 3936 3937# Installer logs 3938pip-log.txt 3939pip-delete-this-directory.txt 3940 3941# Unit test / coverage reports 3942htmlcov/ 3943.tox/ 3944.nox/ 3945.coverage 3946.coverage.* 3947.cache 3948nosetests.xml 3949coverage.xml 3950*.cover 3951*.py.cover 3952.hypothesis/ 3953.pytest_cache/ 3954cover/ 3955 3956# Translations 3957*.mo 3958*.pot 3959 3960# Django stuff: 3961*.log 3962local_settings.py 3963db.sqlite3 3964db.sqlite3-journal 3965 3966# Flask stuff: 3967instance/ 3968.webassets-cache 3969 3970# Scrapy stuff: 3971.scrapy 3972 3973# Sphinx documentation 3974docs/_build/ 3975 3976# PyBuilder 3977.pybuilder/ 3978target/ 3979 3980# Jupyter Notebook 3981.ipynb_checkpoints 3982 3983# IPython 3984profile_default/ 3985ipython_config.py 3986 3987# pyenv 3988# For a library or package, you might want to ignore these files since the code is 3989# intended to run in multiple environments; otherwise, check them in: 3990# .python-version 3991 3992# pipenv 3993# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 3994# However, in case of collaboration, if having platform-specific dependencies or dependencies 3995# having no cross-platform support, pipenv may install dependencies that don't work, or not 3996# install all needed dependencies. 
3997#Pipfile.lock 3998 3999# UV 4000# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 4001# This is especially recommended for binary packages to ensure reproducibility, and is more 4002# commonly ignored for libraries. 4003#uv.lock 4004 4005# poetry 4006# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 4007# This is especially recommended for binary packages to ensure reproducibility, and is more 4008# commonly ignored for libraries. 4009# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 4010#poetry.lock 4011#poetry.toml 4012 4013# pdm 4014# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 4015# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 4016# https://pdm-project.org/en/latest/usage/project/#working-with-version-control 4017#pdm.lock 4018#pdm.toml 4019.pdm-python 4020.pdm-build/ 4021 4022# pixi 4023# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 4024#pixi.lock 4025# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 4026# in the .venv directory. It is recommended not to include this directory in version control. 4027.pixi 4028 4029# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 4030__pypackages__/ 4031 4032# Celery stuff 4033celerybeat-schedule 4034celerybeat.pid 4035 4036# SageMath parsed files 4037*.sage.py 4038 4039# Environments 4040.env 4041.envrc 4042.venv 4043env/ 4044venv/ 4045ENV/ 4046env.bak/ 4047venv.bak/ 4048 4049# Spyder project settings 4050.spyderproject 4051.spyproject 4052 4053# Rope project settings 4054.ropeproject 4055 4056# mkdocs documentation 4057/site 4058 4059# mypy 4060.mypy_cache/ 4061.dmypy.json 4062dmypy.json 4063 4064# Pyre type checker 4065.pyre/ 4066 4067# pytype static type analyzer 4068.pytype/ 4069 4070# Cython debug symbols 4071cython_debug/ 4072 4073# PyCharm 4074# JetBrains specific template is maintained in a separate JetBrains.gitignore that can 4075# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 4076# and can be added to the global gitignore or merged into this file. For a more nuclear 4077# option (not recommended) you can uncomment the following to ignore the entire idea folder. 4078#.idea/ 4079 4080# Abstra 4081# Abstra is an AI-powered process automation framework. 4082# Ignore directories containing user credentials, local state, and settings. 4083# Learn more at https://abstra.io/docs 4084.abstra/ 4085 4086# Visual Studio Code 4087# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 4088# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 4089# and can be added to the global gitignore or merged into this file. 
However, if you prefer, 4090# you could uncomment the following to ignore the entire vscode folder 4091# .vscode/ 4092 4093# Ruff stuff: 4094.ruff_cache/ 4095 4096# PyPI configuration file 4097.pypirc 4098 4099# Marimo 4100marimo/_static/ 4101marimo/_lsp/ 4102__marimo__/ 4103 4104# Streamlit 4105.streamlit/secrets.toml 4106 4107thicket.yaml 4108</file> 4109 4110<file path="CLAUDE.md"> 4111My goal is to build a CLI tool called thicket in Python that maintains a Git repository within which Atom feeds can be persisted, including their contents. 4112 4113# Python Environment and Package Management 4114 4115This project uses `uv` for Python package management and virtual environment handling. 4116 4117## Running Commands 4118 4119ALWAYS use `uv run` to execute Python commands: 4120 4121- Run the CLI: `uv run -m thicket` 4122- Run tests: `uv run pytest` 4123- Type checking: `uv run mypy src/` 4124- Linting: `uv run ruff check src/` 4125- Format code: `uv run ruff format src/` 4126- Compile check: `uv run python -m py_compile <file>` 4127 4128## Package Management 4129 4130- Add dependencies: `uv add <package>` 4131- Add dev dependencies: `uv add --dev <package>` 4132- Install dependencies: `uv sync` 4133- Update dependencies: `uv lock --upgrade` 4134 4135# Project Structure 4136 4137The configuration file specifies: 4138- the location of a git store 4139- a list of usernames and target Atom/RSS feed(s) and optional metadata about the username such as their email, homepage, icon and display name 4140- a cache directory to store temporary results such as feed downloads and their last modification date that speed up operations across runs of the tool 4141 4142The Git data store should: 4143- have a subdirectory per user 4144- within that directory, an entry per Atom entry indexed by the Atom id for that entry. The id should be sanitised consistently to be a safe filename. RSS feed should be normalized to Atom before storing it. 
4145- within each entry file, the metadata of the Atom feed converted into a JSON format that preserves as much metadata as possible. 4146- have a JSON file in the Git repository that indexes the users, their associated directories within the Git repository, and any other metadata about that user from the config file 4147The CLI should be modern and use cool progress bars and any otfrom ecosystem libraries. 4148 4149The intention behind the Git repository is that it can be queried by other websites in order to build a webblog structure of comments that link to other blogs. 4150</file> 4151 4152<file path="pyproject.toml"> 4153[build-system] 4154requires = ["hatchling"] 4155build-backend = "hatchling.build" 4156 4157[project] 4158name = "thicket" 4159dynamic = ["version"] 4160description = "A CLI tool for persisting Atom/RSS feeds in Git repositories" 4161readme = "README.md" 4162license = "MIT" 4163requires-python = ">=3.9" 4164authors = [ 4165 {name = "thicket", email = "thicket@example.com"}, 4166] 4167classifiers = [ 4168 "Development Status :: 3 - Alpha", 4169 "Intended Audience :: Developers", 4170 "License :: OSI Approved :: MIT License", 4171 "Operating System :: OS Independent", 4172 "Programming Language :: Python :: 3", 4173 "Programming Language :: Python :: 3.9", 4174 "Programming Language :: Python :: 3.10", 4175 "Programming Language :: Python :: 3.11", 4176 "Programming Language :: Python :: 3.12", 4177 "Programming Language :: Python :: 3.13", 4178 "Topic :: Internet :: WWW/HTTP :: Dynamic Content :: News/Diary", 4179 "Topic :: Software Development :: Version Control :: Git", 4180 "Topic :: Text Processing :: Markup :: XML", 4181] 4182dependencies = [ 4183 "typer>=0.15.0", 4184 "rich>=13.0.0", 4185 "GitPython>=3.1.40", 4186 "feedparser>=6.0.11", 4187 "pydantic>=2.11.0", 4188 "pydantic-settings>=2.10.0", 4189 "httpx>=0.28.0", 4190 "pendulum>=3.0.0", 4191 "bleach>=6.0.0", 4192 "platformdirs>=4.0.0", 4193 "pyyaml>=6.0.0", 4194 "email_validator", 4195 
"jinja2>=3.1.6", 4196] 4197 4198[project.optional-dependencies] 4199dev = [ 4200 "pytest>=8.0.0", 4201 "pytest-asyncio>=0.24.0", 4202 "pytest-cov>=6.0.0", 4203 "black>=24.0.0", 4204 "ruff>=0.8.0", 4205 "mypy>=1.13.0", 4206 "types-PyYAML>=6.0.0", 4207] 4208 4209[project.urls] 4210Homepage = "https://github.com/example/thicket" 4211Documentation = "https://github.com/example/thicket" 4212Repository = "https://github.com/example/thicket" 4213"Bug Tracker" = "https://github.com/example/thicket/issues" 4214 4215[project.scripts] 4216thicket = "thicket.cli.main:app" 4217 4218[tool.hatch.version] 4219path = "src/thicket/__init__.py" 4220 4221[tool.hatch.build.targets.wheel] 4222packages = ["src/thicket"] 4223 4224[tool.black] 4225line-length = 88 4226target-version = ['py39'] 4227include = '\.pyi?$' 4228extend-exclude = ''' 4229/( 4230 # directories 4231 \.eggs 4232 | \.git 4233 | \.hg 4234 | \.mypy_cache 4235 | \.tox 4236 | \.venv 4237 | build 4238 | dist 4239)/ 4240''' 4241 4242[tool.ruff] 4243target-version = "py39" 4244line-length = 88 4245 4246[tool.ruff.lint] 4247select = [ 4248 "E", # pycodestyle errors 4249 "W", # pycodestyle warnings 4250 "F", # pyflakes 4251 "I", # isort 4252 "B", # flake8-bugbear 4253 "C4", # flake8-comprehensions 4254 "UP", # pyupgrade 4255] 4256ignore = [ 4257 "E501", # line too long, handled by black 4258 "B008", # do not perform function calls in argument defaults 4259 "C901", # too complex 4260] 4261 4262[tool.ruff.lint.per-file-ignores] 4263"__init__.py" = ["F401"] 4264 4265[tool.mypy] 4266python_version = "3.9" 4267check_untyped_defs = true 4268disallow_any_generics = true 4269disallow_incomplete_defs = true 4270disallow_untyped_defs = true 4271no_implicit_optional = true 4272warn_redundant_casts = true 4273warn_unused_ignores = true 4274warn_return_any = true 4275strict_optional = true 4276 4277[[tool.mypy.overrides]] 4278module = [ 4279 "feedparser", 4280 "git", 4281 "bleach", 4282] 4283ignore_missing_imports = true 4284 
4285[tool.pytest.ini_options] 4286testpaths = ["tests"] 4287python_files = ["test_*.py"] 4288python_classes = ["Test*"] 4289python_functions = ["test_*"] 4290addopts = [ 4291 "-ra", 4292 "--strict-markers", 4293 "--strict-config", 4294 "--cov=src/thicket", 4295 "--cov-report=term-missing", 4296 "--cov-report=html", 4297 "--cov-report=xml", 4298] 4299filterwarnings = [ 4300 "error", 4301 "ignore::UserWarning", 4302 "ignore::DeprecationWarning", 4303] 4304markers = [ 4305 "slow: marks tests as slow (deselect with '-m \"not slow\"')", 4306 "integration: marks tests as integration tests", 4307] 4308 4309[tool.coverage.run] 4310source = ["src"] 4311branch = true 4312 4313[tool.coverage.report] 4314exclude_lines = [ 4315 "pragma: no cover", 4316 "def __repr__", 4317 "if self.debug:", 4318 "if settings.DEBUG", 4319 "raise AssertionError", 4320 "raise NotImplementedError", 4321 "if 0:", 4322 "if __name__ == .__main__.:", 4323 "class .*\\bProtocol\\):", 4324 "@(abc\\.)?abstractmethod", 4325] 4326</file> 4327 4328<file path="src/thicket/cli/commands/__init__.py"> 4329"""CLI commands for thicket.""" 4330 4331# Import all commands to register them with the main app 4332from . 
import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync 4333 4334__all__ = ["add", "duplicates", "generate", "index_cmd", "info_cmd", "init", "links_cmd", "list_cmd", "sync"] 4335</file> 4336 4337<file path="src/thicket/cli/commands/add.py"> 4338"""Add command for thicket.""" 4339 4340import asyncio 4341from pathlib import Path 4342from typing import Optional 4343 4344import typer 4345from pydantic import HttpUrl, ValidationError 4346 4347from ...core.feed_parser import FeedParser 4348from ...core.git_store import GitStore 4349from ..main import app 4350from ..utils import ( 4351 create_progress, 4352 load_config, 4353 print_error, 4354 print_info, 4355 print_success, 4356) 4357 4358 4359@app.command("add") 4360def add_command( 4361 subcommand: str = typer.Argument(..., help="Subcommand: 'user' or 'feed'"), 4362 username: str = typer.Argument(..., help="Username"), 4363 feed_url: Optional[str] = typer.Argument(None, help="Feed URL (required for 'user' command)"), 4364 email: Optional[str] = typer.Option(None, "--email", "-e", help="User email"), 4365 homepage: Optional[str] = typer.Option(None, "--homepage", "-h", help="User homepage"), 4366 icon: Optional[str] = typer.Option(None, "--icon", "-i", help="User icon URL"), 4367 display_name: Optional[str] = typer.Option(None, "--display-name", "-d", help="User display name"), 4368 config_file: Optional[Path] = typer.Option( 4369 Path("thicket.yaml"), "--config", help="Configuration file path" 4370 ), 4371 auto_discover: bool = typer.Option( 4372 True, "--auto-discover/--no-auto-discover", help="Auto-discover user metadata from feed" 4373 ), 4374) -> None: 4375 """Add a user or feed to thicket.""" 4376 4377 if subcommand == "user": 4378 add_user(username, feed_url, email, homepage, icon, display_name, config_file, auto_discover) 4379 elif subcommand == "feed": 4380 add_feed(username, feed_url, config_file) 4381 else: 4382 print_error(f"Unknown subcommand: {subcommand}") 4383 
print_error("Use 'user' or 'feed'") 4384 raise typer.Exit(1) 4385 4386 4387def add_user( 4388 username: str, 4389 feed_url: Optional[str], 4390 email: Optional[str], 4391 homepage: Optional[str], 4392 icon: Optional[str], 4393 display_name: Optional[str], 4394 config_file: Path, 4395 auto_discover: bool, 4396) -> None: 4397 """Add a new user with feed.""" 4398 4399 if not feed_url: 4400 print_error("Feed URL is required when adding a user") 4401 raise typer.Exit(1) 4402 4403 # Validate feed URL 4404 try: 4405 validated_feed_url = HttpUrl(feed_url) 4406 except ValidationError: 4407 print_error(f"Invalid feed URL: {feed_url}") 4408 raise typer.Exit(1) from None 4409 4410 # Load configuration 4411 config = load_config(config_file) 4412 4413 # Initialize Git store 4414 git_store = GitStore(config.git_store) 4415 4416 # Check if user already exists 4417 existing_user = git_store.get_user(username) 4418 if existing_user: 4419 print_error(f"User '{username}' already exists") 4420 print_error("Use 'thicket add feed' to add additional feeds") 4421 raise typer.Exit(1) 4422 4423 # Auto-discover metadata if enabled 4424 discovered_metadata = None 4425 if auto_discover: 4426 discovered_metadata = asyncio.run(discover_feed_metadata(validated_feed_url)) 4427 4428 # Prepare user data with manual overrides taking precedence 4429 user_display_name = display_name or (discovered_metadata.author_name or discovered_metadata.title if discovered_metadata else None) 4430 user_email = email or (discovered_metadata.author_email if discovered_metadata else None) 4431 user_homepage = homepage or (str(discovered_metadata.author_uri or discovered_metadata.link) if discovered_metadata else None) 4432 user_icon = icon or (str(discovered_metadata.logo or discovered_metadata.icon or discovered_metadata.image_url) if discovered_metadata else None) 4433 4434 # Add user to Git store 4435 git_store.add_user( 4436 username=username, 4437 display_name=user_display_name, 4438 email=user_email, 4439 
homepage=user_homepage, 4440 icon=user_icon, 4441 feeds=[str(validated_feed_url)], 4442 ) 4443 4444 # Commit changes 4445 git_store.commit_changes(f"Add user: {username}") 4446 4447 print_success(f"Added user '{username}' with feed: {feed_url}") 4448 4449 if discovered_metadata and auto_discover: 4450 print_info("Auto-discovered metadata:") 4451 if user_display_name: 4452 print_info(f" Display name: {user_display_name}") 4453 if user_email: 4454 print_info(f" Email: {user_email}") 4455 if user_homepage: 4456 print_info(f" Homepage: {user_homepage}") 4457 if user_icon: 4458 print_info(f" Icon: {user_icon}") 4459 4460 4461def add_feed(username: str, feed_url: Optional[str], config_file: Path) -> None: 4462 """Add a feed to an existing user.""" 4463 4464 if not feed_url: 4465 print_error("Feed URL is required") 4466 raise typer.Exit(1) 4467 4468 # Validate feed URL 4469 try: 4470 validated_feed_url = HttpUrl(feed_url) 4471 except ValidationError: 4472 print_error(f"Invalid feed URL: {feed_url}") 4473 raise typer.Exit(1) from None 4474 4475 # Load configuration 4476 config = load_config(config_file) 4477 4478 # Initialize Git store 4479 git_store = GitStore(config.git_store) 4480 4481 # Check if user exists 4482 user = git_store.get_user(username) 4483 if not user: 4484 print_error(f"User '{username}' not found") 4485 print_error("Use 'thicket add user' to add a new user") 4486 raise typer.Exit(1) 4487 4488 # Check if feed already exists 4489 if str(validated_feed_url) in user.feeds: 4490 print_error(f"Feed already exists for user '{username}': {feed_url}") 4491 raise typer.Exit(1) 4492 4493 # Add feed to user 4494 updated_feeds = user.feeds + [str(validated_feed_url)] 4495 if git_store.update_user(username, feeds=updated_feeds): 4496 git_store.commit_changes(f"Add feed to user {username}: {feed_url}") 4497 print_success(f"Added feed to user '{username}': {feed_url}") 4498 else: 4499 print_error(f"Failed to add feed to user '{username}'") 4500 raise typer.Exit(1) 4501 
4502 4503async def discover_feed_metadata(feed_url: HttpUrl): 4504 """Discover metadata from a feed URL.""" 4505 try: 4506 with create_progress() as progress: 4507 task = progress.add_task("Discovering feed metadata...", total=None) 4508 4509 parser = FeedParser() 4510 content = await parser.fetch_feed(feed_url) 4511 metadata, _ = parser.parse_feed(content, feed_url) 4512 4513 progress.update(task, completed=True) 4514 return metadata 4515 4516 except Exception as e: 4517 print_error(f"Failed to discover feed metadata: {e}") 4518 return None 4519</file> 4520 4521<file path="src/thicket/cli/commands/duplicates.py"> 4522"""Duplicates command for thicket.""" 4523 4524from pathlib import Path 4525from typing import Optional 4526 4527import typer 4528from rich.table import Table 4529 4530from ...core.git_store import GitStore 4531from ..main import app 4532from ..utils import ( 4533 console, 4534 load_config, 4535 print_error, 4536 print_info, 4537 print_success, 4538 get_tsv_mode, 4539) 4540 4541 4542@app.command("duplicates") 4543def duplicates_command( 4544 action: str = typer.Argument(..., help="Action: 'list', 'add', 'remove'"), 4545 duplicate_id: Optional[str] = typer.Argument(None, help="Duplicate entry ID"), 4546 canonical_id: Optional[str] = typer.Argument(None, help="Canonical entry ID"), 4547 config_file: Optional[Path] = typer.Option( 4548 Path("thicket.yaml"), "--config", help="Configuration file path" 4549 ), 4550) -> None: 4551 """Manage duplicate entry mappings.""" 4552 4553 # Load configuration 4554 config = load_config(config_file) 4555 4556 # Initialize Git store 4557 git_store = GitStore(config.git_store) 4558 4559 if action == "list": 4560 list_duplicates(git_store) 4561 elif action == "add": 4562 add_duplicate(git_store, duplicate_id, canonical_id) 4563 elif action == "remove": 4564 remove_duplicate(git_store, duplicate_id) 4565 else: 4566 print_error(f"Unknown action: {action}") 4567 print_error("Use 'list', 'add', or 'remove'") 4568 raise 
typer.Exit(1) 4569 4570 4571def list_duplicates(git_store: GitStore) -> None: 4572 """List all duplicate mappings.""" 4573 duplicates = git_store.get_duplicates() 4574 4575 if not duplicates.duplicates: 4576 if get_tsv_mode(): 4577 print("No duplicate mappings found") 4578 else: 4579 print_info("No duplicate mappings found") 4580 return 4581 4582 if get_tsv_mode(): 4583 print("Duplicate ID\tCanonical ID") 4584 for duplicate_id, canonical_id in duplicates.duplicates.items(): 4585 print(f"{duplicate_id}\t{canonical_id}") 4586 print(f"Total duplicates: {len(duplicates.duplicates)}") 4587 else: 4588 table = Table(title="Duplicate Entry Mappings") 4589 table.add_column("Duplicate ID", style="red") 4590 table.add_column("Canonical ID", style="green") 4591 4592 for duplicate_id, canonical_id in duplicates.duplicates.items(): 4593 table.add_row(duplicate_id, canonical_id) 4594 4595 console.print(table) 4596 print_info(f"Total duplicates: {len(duplicates.duplicates)}") 4597 4598 4599def add_duplicate(git_store: GitStore, duplicate_id: Optional[str], canonical_id: Optional[str]) -> None: 4600 """Add a duplicate mapping.""" 4601 if not duplicate_id: 4602 print_error("Duplicate ID is required") 4603 raise typer.Exit(1) 4604 4605 if not canonical_id: 4606 print_error("Canonical ID is required") 4607 raise typer.Exit(1) 4608 4609 # Check if duplicate_id already exists 4610 duplicates = git_store.get_duplicates() 4611 if duplicates.is_duplicate(duplicate_id): 4612 existing_canonical = duplicates.get_canonical(duplicate_id) 4613 print_error(f"Duplicate ID already mapped to: {existing_canonical}") 4614 print_error("Use 'remove' first to change the mapping") 4615 raise typer.Exit(1) 4616 4617 # Check if we're trying to make a canonical ID point to itself 4618 if duplicate_id == canonical_id: 4619 print_error("Duplicate ID cannot be the same as canonical ID") 4620 raise typer.Exit(1) 4621 4622 # Add the mapping 4623 git_store.add_duplicate(duplicate_id, canonical_id) 4624 4625 # 
Commit changes 4626 git_store.commit_changes(f"Add duplicate mapping: {duplicate_id} -> {canonical_id}") 4627 4628 print_success(f"Added duplicate mapping: {duplicate_id} -> {canonical_id}") 4629 4630 4631def remove_duplicate(git_store: GitStore, duplicate_id: Optional[str]) -> None: 4632 """Remove a duplicate mapping.""" 4633 if not duplicate_id: 4634 print_error("Duplicate ID is required") 4635 raise typer.Exit(1) 4636 4637 # Check if mapping exists 4638 duplicates = git_store.get_duplicates() 4639 if not duplicates.is_duplicate(duplicate_id): 4640 print_error(f"No duplicate mapping found for: {duplicate_id}") 4641 raise typer.Exit(1) 4642 4643 canonical_id = duplicates.get_canonical(duplicate_id) 4644 4645 # Remove the mapping 4646 if git_store.remove_duplicate(duplicate_id): 4647 # Commit changes 4648 git_store.commit_changes(f"Remove duplicate mapping: {duplicate_id} -> {canonical_id}") 4649 print_success(f"Removed duplicate mapping: {duplicate_id} -> {canonical_id}") 4650 else: 4651 print_error(f"Failed to remove duplicate mapping: {duplicate_id}") 4652 raise typer.Exit(1) 4653</file> 4654 4655<file path="src/thicket/cli/commands/sync.py"> 4656"""Sync command for thicket.""" 4657 4658import asyncio 4659from pathlib import Path 4660from typing import Optional 4661 4662import typer 4663from rich.progress import track 4664 4665from ...core.feed_parser import FeedParser 4666from ...core.git_store import GitStore 4667from ..main import app 4668from ..utils import ( 4669 load_config, 4670 print_error, 4671 print_info, 4672 print_success, 4673) 4674 4675 4676@app.command() 4677def sync( 4678 all_users: bool = typer.Option( 4679 False, "--all", "-a", help="Sync all users and feeds" 4680 ), 4681 user: Optional[str] = typer.Option( 4682 None, "--user", "-u", help="Sync specific user only" 4683 ), 4684 config_file: Optional[Path] = typer.Option( 4685 Path("thicket.yaml"), "--config", help="Configuration file path" 4686 ), 4687 dry_run: bool = typer.Option( 4688 False, 
"--dry-run", help="Show what would be synced without making changes" 4689 ), 4690) -> None: 4691 """Sync feeds and store entries in Git repository.""" 4692 4693 # Load configuration 4694 config = load_config(config_file) 4695 4696 # Initialize Git store 4697 git_store = GitStore(config.git_store) 4698 4699 # Determine which users to sync from git repository 4700 users_to_sync = [] 4701 if all_users: 4702 index = git_store._load_index() 4703 users_to_sync = list(index.users.values()) 4704 elif user: 4705 user_metadata = git_store.get_user(user) 4706 if not user_metadata: 4707 print_error(f"User '{user}' not found in git repository") 4708 raise typer.Exit(1) 4709 users_to_sync = [user_metadata] 4710 else: 4711 print_error("Specify --all to sync all users or --user to sync a specific user") 4712 raise typer.Exit(1) 4713 4714 if not users_to_sync: 4715 print_info("No users configured to sync") 4716 return 4717 4718 # Sync each user 4719 total_new_entries = 0 4720 total_updated_entries = 0 4721 4722 for user_metadata in users_to_sync: 4723 print_info(f"Syncing user: {user_metadata.username}") 4724 4725 user_new_entries = 0 4726 user_updated_entries = 0 4727 4728 # Sync each feed for the user 4729 for feed_url in track(user_metadata.feeds, description=f"Syncing {user_metadata.username}'s feeds"): 4730 try: 4731 new_entries, updated_entries = asyncio.run( 4732 sync_feed(git_store, user_metadata.username, feed_url, dry_run) 4733 ) 4734 user_new_entries += new_entries 4735 user_updated_entries += updated_entries 4736 4737 except Exception as e: 4738 print_error(f"Failed to sync feed {feed_url}: {e}") 4739 continue 4740 4741 print_info(f"User {user_metadata.username}: {user_new_entries} new, {user_updated_entries} updated") 4742 total_new_entries += user_new_entries 4743 total_updated_entries += user_updated_entries 4744 4745 # Commit changes if not dry run 4746 if not dry_run and (total_new_entries > 0 or total_updated_entries > 0): 4747 commit_message = f"Sync feeds: 
{total_new_entries} new entries, {total_updated_entries} updated" 4748 git_store.commit_changes(commit_message) 4749 print_success(f"Committed changes: {commit_message}") 4750 4751 # Summary 4752 if dry_run: 4753 print_info(f"Dry run complete: would sync {total_new_entries} new entries, {total_updated_entries} updated") 4754 else: 4755 print_success(f"Sync complete: {total_new_entries} new entries, {total_updated_entries} updated") 4756 4757 4758async def sync_feed(git_store: GitStore, username: str, feed_url, dry_run: bool) -> tuple[int, int]: 4759 """Sync a single feed for a user.""" 4760 4761 parser = FeedParser() 4762 4763 try: 4764 # Fetch and parse feed 4765 content = await parser.fetch_feed(feed_url) 4766 metadata, entries = parser.parse_feed(content, feed_url) 4767 4768 new_entries = 0 4769 updated_entries = 0 4770 4771 # Process each entry 4772 for entry in entries: 4773 try: 4774 # Check if entry already exists 4775 existing_entry = git_store.get_entry(username, entry.id) 4776 4777 if existing_entry: 4778 # Check if entry has been updated 4779 if existing_entry.updated != entry.updated: 4780 if not dry_run: 4781 git_store.store_entry(username, entry) 4782 updated_entries += 1 4783 else: 4784 # New entry 4785 if not dry_run: 4786 git_store.store_entry(username, entry) 4787 new_entries += 1 4788 4789 except Exception as e: 4790 print_error(f"Failed to process entry {entry.id}: {e}") 4791 continue 4792 4793 return new_entries, updated_entries 4794 4795 except Exception as e: 4796 print_error(f"Failed to sync feed {feed_url}: {e}") 4797 return 0, 0 4798</file> 4799 4800<file path="src/thicket/models/config.py"> 4801"""Configuration models for thicket.""" 4802 4803from pathlib import Path 4804from typing import Optional 4805 4806from pydantic import BaseModel, EmailStr, HttpUrl 4807from pydantic_settings import BaseSettings, SettingsConfigDict 4808 4809 4810class UserConfig(BaseModel): 4811 """Configuration for a single user and their feeds.""" 4812 4813 
    username: str
    # One or more Atom/RSS feed URLs associated with this user.
    feeds: list[HttpUrl]
    email: Optional[EmailStr] = None
    homepage: Optional[HttpUrl] = None
    icon: Optional[HttpUrl] = None
    display_name: Optional[str] = None


class ThicketConfig(BaseSettings):
    """Main configuration for thicket.

    Per model_config below, values can be supplied via environment
    variables (prefix ``THICKET_``) or a ``.env`` file.
    """

    model_config = SettingsConfigDict(
        env_prefix="THICKET_",
        env_file=".env",
        # NOTE(review): pydantic-settings does not consume yaml_file unless a
        # YamlConfigSettingsSource is wired in via settings_customise_sources;
        # confirm the CLI's load_config performs the YAML parsing itself.
        yaml_file="thicket.yaml",
        case_sensitive=False,
    )

    # Path to the Git repository used as the entry store.
    git_store: Path
    cache_dir: Path
    users: list[UserConfig] = []
</file>

<file path="src/thicket/cli/commands/links_cmd.py">
"""CLI command for extracting and categorizing all outbound links from blog entries."""

import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import typer
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from rich.table import Table

from ...core.git_store import GitStore
from ..main import app
from ..utils import load_config, get_tsv_mode

console = Console()


class LinkData:
    """Represents a single outbound link found in a blog entry.

    Attributes:
        url: URL of the link (resolved against the feed's base URL upstream).
        entry_id: Atom ID of the entry the link was found in.
        username: Owner of the entry the link was found in.
    """

    def __init__(self, url: str, entry_id: str, username: str):
        self.url = url
        self.entry_id = entry_id
        self.username = username

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "url": self.url,
            "entry_id": self.entry_id,
            "username": self.username
        }

    @classmethod
    def from_dict(cls, data: dict) -> "LinkData":
        """Create a LinkData from a dictionary produced by to_dict()."""
        return cls(
            url=data["url"],
            entry_id=data["entry_id"],
            username=data["username"]
        )


class LinkCategorizer:
    """Categorizes links as internal, user, or unknown."""

    def __init__(self, user_domains: Dict[str, 
Set[str]]): 4887 self.user_domains = user_domains 4888 # Create reverse mapping of domain -> username 4889 self.domain_to_user = {} 4890 for username, domains in user_domains.items(): 4891 for domain in domains: 4892 self.domain_to_user[domain] = username 4893 4894 def categorize_url(self, url: str, source_username: str) -> tuple[str, Optional[str]]: 4895 """ 4896 Categorize a URL as 'internal', 'user', or 'unknown'. 4897 Returns (category, target_username). 4898 """ 4899 try: 4900 parsed = urlparse(url) 4901 domain = parsed.netloc.lower() 4902 4903 # Check if it's a link to the same user's domain (internal) 4904 if domain in self.user_domains.get(source_username, set()): 4905 return "internal", source_username 4906 4907 # Check if it's a link to another user's domain 4908 if domain in self.domain_to_user: 4909 return "user", self.domain_to_user[domain] 4910 4911 # Everything else is unknown 4912 return "unknown", None 4913 4914 except Exception: 4915 return "unknown", None 4916 4917 4918class LinkExtractor: 4919 """Extracts and resolves links from blog entries.""" 4920 4921 def __init__(self): 4922 # Pattern for extracting links from HTML 4923 self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL) 4924 self.url_pattern = re.compile(r'https?://[^\s<>"]+') 4925 4926 def extract_links_from_html(self, html_content: str, base_url: str) -> List[tuple[str, str]]: 4927 """Extract all links from HTML content and resolve them against base URL.""" 4928 links = [] 4929 4930 # Extract links from <a> tags 4931 for match in self.link_pattern.finditer(html_content): 4932 url = match.group(1) 4933 text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags from link text 4934 4935 # Resolve relative URLs against base URL 4936 resolved_url = urljoin(base_url, url) 4937 links.append((resolved_url, text)) 4938 4939 return links 4940 4941 4942 def extract_links_from_entry(self, entry, username: str, base_url: str) -> 
List[LinkData]: 4943 """Extract all links from a blog entry.""" 4944 links = [] 4945 4946 # Combine all text content for analysis 4947 content_to_search = [] 4948 if entry.content: 4949 content_to_search.append(entry.content) 4950 if entry.summary: 4951 content_to_search.append(entry.summary) 4952 4953 for content in content_to_search: 4954 extracted_links = self.extract_links_from_html(content, base_url) 4955 4956 for url, link_text in extracted_links: 4957 # Skip empty URLs 4958 if not url or url.startswith('#'): 4959 continue 4960 4961 link_data = LinkData( 4962 url=url, 4963 entry_id=entry.id, 4964 username=username 4965 ) 4966 4967 links.append(link_data) 4968 4969 return links 4970 4971 4972@app.command() 4973def links( 4974 config_file: Optional[Path] = typer.Option( 4975 Path("thicket.yaml"), 4976 "--config", 4977 "-c", 4978 help="Path to configuration file", 4979 ), 4980 output_file: Optional[Path] = typer.Option( 4981 None, 4982 "--output", 4983 "-o", 4984 help="Path to output unified links file (default: links.json in git store)", 4985 ), 4986 verbose: bool = typer.Option( 4987 False, 4988 "--verbose", 4989 "-v", 4990 help="Show detailed progress information", 4991 ), 4992) -> None: 4993 """Extract and categorize all outbound links from blog entries. 4994 4995 This command analyzes all blog entries to extract outbound links, 4996 resolve them properly with respect to the feed's base URL, and 4997 categorize them as internal, user, or unknown links. 4998 4999 Creates a unified links.json file containing all link data. 
5000 """ 5001 try: 5002 # Load configuration 5003 config = load_config(config_file) 5004 5005 # Initialize Git store 5006 git_store = GitStore(config.git_store) 5007 5008 # Build user domain mapping 5009 if verbose: 5010 console.print("Building user domain mapping...") 5011 5012 index = git_store._load_index() 5013 user_domains = {} 5014 5015 for username, user_metadata in index.users.items(): 5016 domains = set() 5017 5018 # Add domains from feeds 5019 for feed_url in user_metadata.feeds: 5020 domain = urlparse(feed_url).netloc.lower() 5021 if domain: 5022 domains.add(domain) 5023 5024 # Add domain from homepage 5025 if user_metadata.homepage: 5026 domain = urlparse(str(user_metadata.homepage)).netloc.lower() 5027 if domain: 5028 domains.add(domain) 5029 5030 user_domains[username] = domains 5031 5032 if verbose: 5033 console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains") 5034 5035 # Initialize components 5036 link_extractor = LinkExtractor() 5037 categorizer = LinkCategorizer(user_domains) 5038 5039 # Get all users 5040 users = list(index.users.keys()) 5041 5042 if not users: 5043 console.print("[yellow]No users found in Git store[/yellow]") 5044 raise typer.Exit(0) 5045 5046 # Process all entries 5047 all_links = [] 5048 link_categories = {"internal": [], "user": [], "unknown": []} 5049 link_dict = {} # Dictionary with link URL as key, maps to list of atom IDs 5050 reverse_dict = {} # Dictionary with atom ID as key, maps to list of URLs 5051 5052 with Progress( 5053 SpinnerColumn(), 5054 TextColumn("[progress.description]{task.description}"), 5055 BarColumn(), 5056 TaskProgressColumn(), 5057 console=console, 5058 ) as progress: 5059 5060 # Count total entries first 5061 counting_task = progress.add_task("Counting entries...", total=len(users)) 5062 total_entries = 0 5063 5064 for username in users: 5065 entries = git_store.list_entries(username) 5066 total_entries += len(entries) 5067 
progress.advance(counting_task) 5068 5069 progress.remove_task(counting_task) 5070 5071 # Process entries 5072 processing_task = progress.add_task( 5073 f"Processing {total_entries} entries...", 5074 total=total_entries 5075 ) 5076 5077 for username in users: 5078 entries = git_store.list_entries(username) 5079 user_metadata = index.users[username] 5080 5081 # Get base URL for this user (use first feed URL) 5082 base_url = str(user_metadata.feeds[0]) if user_metadata.feeds else "https://example.com" 5083 5084 for entry in entries: 5085 # Extract links from this entry 5086 entry_links = link_extractor.extract_links_from_entry(entry, username, base_url) 5087 5088 # Track unique links per entry 5089 entry_urls_seen = set() 5090 5091 # Categorize each link 5092 for link_data in entry_links: 5093 # Skip if we've already seen this URL in this entry 5094 if link_data.url in entry_urls_seen: 5095 continue 5096 entry_urls_seen.add(link_data.url) 5097 5098 category, target_username = categorizer.categorize_url(link_data.url, username) 5099 5100 # Add to link dictionary (URL as key, maps to list of atom IDs) 5101 if link_data.url not in link_dict: 5102 link_dict[link_data.url] = [] 5103 if link_data.entry_id not in link_dict[link_data.url]: 5104 link_dict[link_data.url].append(link_data.entry_id) 5105 5106 # Also add to reverse mapping (atom ID -> list of URLs) 5107 if link_data.entry_id not in reverse_dict: 5108 reverse_dict[link_data.entry_id] = [] 5109 if link_data.url not in reverse_dict[link_data.entry_id]: 5110 reverse_dict[link_data.entry_id].append(link_data.url) 5111 5112 # Add category info to link data for categories tracking 5113 link_info = link_data.to_dict() 5114 link_info["category"] = category 5115 link_info["target_username"] = target_username 5116 5117 all_links.append(link_info) 5118 link_categories[category].append(link_info) 5119 5120 progress.advance(processing_task) 5121 5122 if verbose and entry_links: 5123 console.print(f" Found {len(entry_links)} 
links in {username}:{entry.title[:50]}...") 5124 5125 # Determine output path 5126 if output_file: 5127 output_path = output_file 5128 else: 5129 output_path = config.git_store / "links.json" 5130 5131 # Save all extracted links (not just filtered ones) 5132 if verbose: 5133 console.print("Preparing output data...") 5134 5135 # Build a set of all URLs that correspond to posts in the git database 5136 registered_urls = set() 5137 5138 # Get all entries from all users and build URL mappings 5139 for username in users: 5140 entries = git_store.list_entries(username) 5141 user_metadata = index.users[username] 5142 5143 for entry in entries: 5144 # Try to match entry URLs with extracted links 5145 if hasattr(entry, 'link') and entry.link: 5146 registered_urls.add(str(entry.link)) 5147 5148 # Also check entry alternate links if they exist 5149 if hasattr(entry, 'links') and entry.links: 5150 for link in entry.links: 5151 if hasattr(link, 'href') and link.href: 5152 registered_urls.add(str(link.href)) 5153 5154 # Build unified structure with metadata 5155 unified_links = {} 5156 reverse_mapping = {} 5157 5158 for url, entry_ids in link_dict.items(): 5159 unified_links[url] = { 5160 "referencing_entries": entry_ids 5161 } 5162 5163 # Find target username if this is a tracked post 5164 if url in registered_urls: 5165 for username in users: 5166 user_domains_set = {domain for domain in user_domains.get(username, [])} 5167 if any(domain in url for domain in user_domains_set): 5168 unified_links[url]["target_username"] = username 5169 break 5170 5171 # Build reverse mapping 5172 for entry_id in entry_ids: 5173 if entry_id not in reverse_mapping: 5174 reverse_mapping[entry_id] = [] 5175 if url not in reverse_mapping[entry_id]: 5176 reverse_mapping[entry_id].append(url) 5177 5178 # Create unified output data 5179 output_data = { 5180 "links": unified_links, 5181 "reverse_mapping": reverse_mapping, 5182 "user_domains": {k: list(v) for k, v in user_domains.items()} 5183 } 5184 
5185 if verbose: 5186 console.print(f"Found {len(registered_urls)} registered post URLs") 5187 console.print(f"Found {len(link_dict)} total links, {sum(1 for link in unified_links.values() if 'target_username' in link)} tracked posts") 5188 5189 # Save unified data 5190 with open(output_path, "w") as f: 5191 json.dump(output_data, f, indent=2, default=str) 5192 5193 # Show summary 5194 if not get_tsv_mode(): 5195 console.print("\n[green]✓ Links extraction completed successfully[/green]") 5196 5197 # Create summary table or TSV output 5198 if get_tsv_mode(): 5199 print("Category\tCount\tDescription") 5200 print(f"Internal\t{len(link_categories['internal'])}\tLinks to same user's domain") 5201 print(f"User\t{len(link_categories['user'])}\tLinks to other tracked users") 5202 print(f"Unknown\t{len(link_categories['unknown'])}\tLinks to external sites") 5203 print(f"Total Extracted\t{len(all_links)}\tAll extracted links") 5204 print(f"Saved to Output\t{len(output_data['links'])}\tLinks saved to output file") 5205 print(f"Cross-references\t{sum(1 for link in unified_links.values() if 'target_username' in link)}\tLinks to registered posts only") 5206 else: 5207 table = Table(title="Links Summary") 5208 table.add_column("Category", style="cyan") 5209 table.add_column("Count", style="green") 5210 table.add_column("Description", style="white") 5211 5212 table.add_row("Internal", str(len(link_categories["internal"])), "Links to same user's domain") 5213 table.add_row("User", str(len(link_categories["user"])), "Links to other tracked users") 5214 table.add_row("Unknown", str(len(link_categories["unknown"])), "Links to external sites") 5215 table.add_row("Total Extracted", str(len(all_links)), "All extracted links") 5216 table.add_row("Saved to Output", str(len(output_data['links'])), "Links saved to output file") 5217 table.add_row("Cross-references", str(sum(1 for link in unified_links.values() if 'target_username' in link)), "Links to registered posts only") 5218 5219 
console.print(table) 5220 5221 # Show user links if verbose 5222 if verbose and link_categories["user"]: 5223 if get_tsv_mode(): 5224 print("User Link Source\tUser Link Target\tLink Count") 5225 user_link_counts = {} 5226 5227 for link in link_categories["user"]: 5228 key = f"{link['username']} -> {link['target_username']}" 5229 user_link_counts[key] = user_link_counts.get(key, 0) + 1 5230 5231 for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]: 5232 source, target = link_pair.split(" -> ") 5233 print(f"{source}\t{target}\t{count}") 5234 else: 5235 console.print("\n[bold]User-to-user links:[/bold]") 5236 user_link_counts = {} 5237 5238 for link in link_categories["user"]: 5239 key = f"{link['username']} -> {link['target_username']}" 5240 user_link_counts[key] = user_link_counts.get(key, 0) + 1 5241 5242 for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]: 5243 console.print(f" {link_pair}: {count} links") 5244 5245 if not get_tsv_mode(): 5246 console.print(f"\nUnified links data saved to: {output_path}") 5247 5248 except Exception as e: 5249 console.print(f"[red]Error extracting links: {e}[/red]") 5250 if verbose: 5251 console.print_exception() 5252 raise typer.Exit(1) 5253</file> 5254 5255<file path="src/thicket/cli/commands/list_cmd.py"> 5256"""List command for thicket.""" 5257 5258import re 5259from pathlib import Path 5260from typing import Optional 5261 5262import typer 5263from rich.table import Table 5264 5265from ...core.git_store import GitStore 5266from ..main import app 5267from ..utils import ( 5268 console, 5269 load_config, 5270 print_error, 5271 print_feeds_table, 5272 print_feeds_table_from_git, 5273 print_info, 5274 print_users_table, 5275 print_users_table_from_git, 5276 print_entries_tsv, 5277 get_tsv_mode, 5278) 5279 5280 5281@app.command("list") 5282def list_command( 5283 what: str = typer.Argument(..., help="What to list: 'users', 'feeds', 'entries'"), 
5284 user: Optional[str] = typer.Option( 5285 None, "--user", "-u", help="Filter by specific user" 5286 ), 5287 limit: Optional[int] = typer.Option( 5288 None, "--limit", "-l", help="Limit number of results" 5289 ), 5290 config_file: Optional[Path] = typer.Option( 5291 Path("thicket.yaml"), "--config", help="Configuration file path" 5292 ), 5293) -> None: 5294 """List users, feeds, or entries.""" 5295 5296 # Load configuration 5297 config = load_config(config_file) 5298 5299 # Initialize Git store 5300 git_store = GitStore(config.git_store) 5301 5302 if what == "users": 5303 list_users(git_store) 5304 elif what == "feeds": 5305 list_feeds(git_store, user) 5306 elif what == "entries": 5307 list_entries(git_store, user, limit) 5308 else: 5309 print_error(f"Unknown list type: {what}") 5310 print_error("Use 'users', 'feeds', or 'entries'") 5311 raise typer.Exit(1) 5312 5313 5314def list_users(git_store: GitStore) -> None: 5315 """List all users.""" 5316 index = git_store._load_index() 5317 users = list(index.users.values()) 5318 5319 if not users: 5320 print_info("No users configured") 5321 return 5322 5323 print_users_table_from_git(users) 5324 5325 5326def list_feeds(git_store: GitStore, username: Optional[str] = None) -> None: 5327 """List feeds, optionally filtered by user.""" 5328 if username: 5329 user = git_store.get_user(username) 5330 if not user: 5331 print_error(f"User '{username}' not found") 5332 raise typer.Exit(1) 5333 5334 if not user.feeds: 5335 print_info(f"No feeds configured for user '{username}'") 5336 return 5337 5338 print_feeds_table_from_git(git_store, username) 5339 5340 5341def list_entries(git_store: GitStore, username: Optional[str] = None, limit: Optional[int] = None) -> None: 5342 """List entries, optionally filtered by user.""" 5343 5344 if username: 5345 # List entries for specific user 5346 user = git_store.get_user(username) 5347 if not user: 5348 print_error(f"User '{username}' not found") 5349 raise typer.Exit(1) 5350 5351 entries = 
git_store.list_entries(username, limit) 5352 if not entries: 5353 print_info(f"No entries found for user '{username}'") 5354 return 5355 5356 print_entries_table([entries], [username]) 5357 5358 else: 5359 # List entries for all users 5360 all_entries = [] 5361 all_usernames = [] 5362 5363 index = git_store._load_index() 5364 for user in index.users.values(): 5365 entries = git_store.list_entries(user.username, limit) 5366 if entries: 5367 all_entries.append(entries) 5368 all_usernames.append(user.username) 5369 5370 if not all_entries: 5371 print_info("No entries found") 5372 return 5373 5374 print_entries_table(all_entries, all_usernames) 5375 5376 5377def _clean_html_content(content: Optional[str]) -> str: 5378 """Clean HTML content for display in table.""" 5379 if not content: 5380 return "" 5381 5382 # Remove HTML tags 5383 clean_text = re.sub(r'<[^>]+>', ' ', content) 5384 # Replace multiple whitespace with single space 5385 clean_text = re.sub(r'\s+', ' ', clean_text) 5386 # Strip and limit length 5387 clean_text = clean_text.strip() 5388 if len(clean_text) > 100: 5389 clean_text = clean_text[:97] + "..." 
5390 5391 return clean_text 5392 5393 5394def print_entries_table(entries_by_user: list[list], usernames: list[str]) -> None: 5395 """Print a table of entries.""" 5396 if get_tsv_mode(): 5397 print_entries_tsv(entries_by_user, usernames) 5398 return 5399 5400 table = Table(title="Feed Entries") 5401 table.add_column("User", style="cyan", no_wrap=True) 5402 table.add_column("Title", style="bold") 5403 table.add_column("Updated", style="blue") 5404 table.add_column("URL", style="green") 5405 5406 # Combine all entries with usernames 5407 all_entries = [] 5408 for entries, username in zip(entries_by_user, usernames): 5409 for entry in entries: 5410 all_entries.append((username, entry)) 5411 5412 # Sort by updated time (newest first) 5413 all_entries.sort(key=lambda x: x[1].updated, reverse=True) 5414 5415 for username, entry in all_entries: 5416 # Format updated time 5417 updated_str = entry.updated.strftime("%Y-%m-%d %H:%M") 5418 5419 # Truncate title if too long 5420 title = entry.title 5421 if len(title) > 50: 5422 title = title[:47] + "..." 5423 5424 table.add_row( 5425 username, 5426 title, 5427 updated_str, 5428 str(entry.link), 5429 ) 5430 5431 console.print(table) 5432</file> 5433 5434<file path="src/thicket/cli/main.py"> 5435"""Main CLI application using Typer.""" 5436 5437import typer 5438from rich.console import Console 5439 5440from .. 
from .. import __version__

# Top-level Typer application. Individual command modules register their
# commands against this object when imported (see bottom of this file).
app = typer.Typer(
    name="thicket",
    help="A CLI tool for persisting Atom/RSS feeds in Git repositories",
    no_args_is_help=True,
    rich_markup_mode="rich",
)

console = Console()

# Global state for TSV output mode. Written once per invocation by the
# `main` callback below; read by cli.utils.get_tsv_mode().
tsv_mode = False


def version_callback(value: bool) -> None:
    """Show version and exit.

    Typer option callback: invoked eagerly when --version/-v is passed;
    prints the package version and terminates via typer.Exit.
    """
    if value:
        console.print(f"thicket version {__version__}")
        raise typer.Exit()


@app.callback()
def main(
    version: bool = typer.Option(
        None,
        "--version",
        "-v",
        help="Show the version and exit",
        callback=version_callback,
        # is_eager makes the version check run before any command logic.
        is_eager=True,
    ),
    tsv: bool = typer.Option(
        False,
        "--tsv",
        help="Output in tab-separated values format without truncation",
    ),
) -> None:
    """Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories."""
    # Stash the flag in a module-level global so the table-printing helpers
    # can switch to TSV output without threading the flag through every call.
    global tsv_mode
    tsv_mode = tsv


# Import commands to register them
# NOTE: deliberately placed after `app` is defined — the command modules
# import `app` from here, so a top-of-file import would be circular.
from .commands import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync

if __name__ == "__main__":
    app()
exist_ok=True) 5517 5518 try: 5519 self.repo = Repo(self.repo_path) 5520 except git.InvalidGitRepositoryError: 5521 # Initialize new repository 5522 self.repo = Repo.init(self.repo_path) 5523 self._create_initial_structure() 5524 5525 def _create_initial_structure(self) -> None: 5526 """Create initial Git store structure.""" 5527 # Create index.json 5528 index = GitStoreIndex( 5529 created=datetime.now(), 5530 last_updated=datetime.now(), 5531 ) 5532 self._save_index(index) 5533 5534 # Create duplicates.json 5535 duplicates = DuplicateMap() 5536 self._save_duplicates(duplicates) 5537 5538 # Create initial commit 5539 self.repo.index.add(["index.json", "duplicates.json"]) 5540 self.repo.index.commit("Initial thicket repository structure") 5541 5542 def _save_index(self, index: GitStoreIndex) -> None: 5543 """Save the index to index.json.""" 5544 index_path = self.repo_path / "index.json" 5545 with open(index_path, "w") as f: 5546 json.dump(index.model_dump(mode="json", exclude_none=True), f, indent=2, default=str) 5547 5548 def _load_index(self) -> GitStoreIndex: 5549 """Load the index from index.json.""" 5550 index_path = self.repo_path / "index.json" 5551 if not index_path.exists(): 5552 return GitStoreIndex( 5553 created=datetime.now(), 5554 last_updated=datetime.now(), 5555 ) 5556 5557 with open(index_path) as f: 5558 data = json.load(f) 5559 5560 return GitStoreIndex(**data) 5561 5562 def _save_duplicates(self, duplicates: DuplicateMap) -> None: 5563 """Save duplicates map to duplicates.json.""" 5564 duplicates_path = self.repo_path / "duplicates.json" 5565 with open(duplicates_path, "w") as f: 5566 json.dump(duplicates.model_dump(exclude_none=True), f, indent=2) 5567 5568 def _load_duplicates(self) -> DuplicateMap: 5569 """Load duplicates map from duplicates.json.""" 5570 duplicates_path = self.repo_path / "duplicates.json" 5571 if not duplicates_path.exists(): 5572 return DuplicateMap() 5573 5574 with open(duplicates_path) as f: 5575 data = json.load(f) 5576 
5577 return DuplicateMap(**data) 5578 5579 def add_user(self, username: str, display_name: Optional[str] = None, 5580 email: Optional[str] = None, homepage: Optional[str] = None, 5581 icon: Optional[str] = None, feeds: Optional[list[str]] = None) -> UserMetadata: 5582 """Add a new user to the Git store.""" 5583 index = self._load_index() 5584 5585 # Create user directory 5586 user_dir = self.repo_path / username 5587 user_dir.mkdir(exist_ok=True) 5588 5589 # Create user metadata 5590 user_metadata = UserMetadata( 5591 username=username, 5592 display_name=display_name, 5593 email=email, 5594 homepage=homepage, 5595 icon=icon, 5596 feeds=feeds or [], 5597 directory=username, 5598 created=datetime.now(), 5599 last_updated=datetime.now(), 5600 ) 5601 5602 5603 # Update index 5604 index.add_user(user_metadata) 5605 self._save_index(index) 5606 5607 return user_metadata 5608 5609 def get_user(self, username: str) -> Optional[UserMetadata]: 5610 """Get user metadata by username.""" 5611 index = self._load_index() 5612 return index.get_user(username) 5613 5614 def update_user(self, username: str, **kwargs) -> bool: 5615 """Update user metadata.""" 5616 index = self._load_index() 5617 user = index.get_user(username) 5618 5619 if not user: 5620 return False 5621 5622 # Update user metadata 5623 for key, value in kwargs.items(): 5624 if hasattr(user, key) and value is not None: 5625 setattr(user, key, value) 5626 5627 user.update_timestamp() 5628 5629 5630 # Update index 5631 index.add_user(user) 5632 self._save_index(index) 5633 5634 return True 5635 5636 def store_entry(self, username: str, entry: AtomEntry) -> bool: 5637 """Store an entry in the user's directory.""" 5638 user = self.get_user(username) 5639 if not user: 5640 return False 5641 5642 # Sanitize entry ID for filename 5643 from .feed_parser import FeedParser 5644 parser = FeedParser() 5645 safe_id = parser.sanitize_entry_id(entry.id) 5646 5647 # Create entry file 5648 user_dir = self.repo_path / user.directory 
5649 entry_path = user_dir / f"{safe_id}.json" 5650 5651 # Check if entry already exists 5652 entry_exists = entry_path.exists() 5653 5654 # Save entry 5655 with open(entry_path, "w") as f: 5656 json.dump(entry.model_dump(mode="json", exclude_none=True), f, indent=2, default=str) 5657 5658 # Update user metadata if new entry 5659 if not entry_exists: 5660 index = self._load_index() 5661 index.update_entry_count(username, 1) 5662 self._save_index(index) 5663 5664 return True 5665 5666 def get_entry(self, username: str, entry_id: str) -> Optional[AtomEntry]: 5667 """Get an entry by username and entry ID.""" 5668 user = self.get_user(username) 5669 if not user: 5670 return None 5671 5672 # Sanitize entry ID 5673 from .feed_parser import FeedParser 5674 parser = FeedParser() 5675 safe_id = parser.sanitize_entry_id(entry_id) 5676 5677 entry_path = self.repo_path / user.directory / f"{safe_id}.json" 5678 if not entry_path.exists(): 5679 return None 5680 5681 with open(entry_path) as f: 5682 data = json.load(f) 5683 5684 return AtomEntry(**data) 5685 5686 def list_entries(self, username: str, limit: Optional[int] = None) -> list[AtomEntry]: 5687 """List entries for a user.""" 5688 user = self.get_user(username) 5689 if not user: 5690 return [] 5691 5692 user_dir = self.repo_path / user.directory 5693 if not user_dir.exists(): 5694 return [] 5695 5696 entries = [] 5697 entry_files = sorted(user_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True) 5698 5699 5700 if limit: 5701 entry_files = entry_files[:limit] 5702 5703 for entry_file in entry_files: 5704 try: 5705 with open(entry_file) as f: 5706 data = json.load(f) 5707 entries.append(AtomEntry(**data)) 5708 except Exception: 5709 # Skip invalid entries 5710 continue 5711 5712 return entries 5713 5714 def get_duplicates(self) -> DuplicateMap: 5715 """Get the duplicates map.""" 5716 return self._load_duplicates() 5717 5718 def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None: 5719 """Add a 
duplicate mapping.""" 5720 duplicates = self._load_duplicates() 5721 duplicates.add_duplicate(duplicate_id, canonical_id) 5722 self._save_duplicates(duplicates) 5723 5724 def remove_duplicate(self, duplicate_id: str) -> bool: 5725 """Remove a duplicate mapping.""" 5726 duplicates = self._load_duplicates() 5727 result = duplicates.remove_duplicate(duplicate_id) 5728 self._save_duplicates(duplicates) 5729 return result 5730 5731 def commit_changes(self, message: str) -> None: 5732 """Commit all changes to the Git repository.""" 5733 if not self.repo: 5734 return 5735 5736 # Add all changes 5737 self.repo.git.add(A=True) 5738 5739 # Check if there are changes to commit 5740 if self.repo.index.diff("HEAD"): 5741 self.repo.index.commit(message) 5742 5743 def get_stats(self) -> dict: 5744 """Get statistics about the Git store.""" 5745 index = self._load_index() 5746 duplicates = self._load_duplicates() 5747 5748 return { 5749 "total_users": len(index.users), 5750 "total_entries": index.total_entries, 5751 "total_duplicates": len(duplicates.duplicates), 5752 "last_updated": index.last_updated, 5753 "repository_size": sum(f.stat().st_size for f in self.repo_path.rglob("*") if f.is_file()), 5754 } 5755 5756 def search_entries(self, query: str, username: Optional[str] = None, 5757 limit: Optional[int] = None) -> list[tuple[str, AtomEntry]]: 5758 """Search entries by content.""" 5759 results = [] 5760 5761 # Get users to search 5762 index = self._load_index() 5763 users = [index.get_user(username)] if username else list(index.users.values()) 5764 users = [u for u in users if u is not None] 5765 5766 for user in users: 5767 user_dir = self.repo_path / user.directory 5768 if not user_dir.exists(): 5769 continue 5770 5771 entry_files = user_dir.glob("*.json") 5772 5773 for entry_file in entry_files: 5774 try: 5775 with open(entry_file) as f: 5776 data = json.load(f) 5777 5778 entry = AtomEntry(**data) 5779 5780 # Simple text search in title, summary, and content 5781 
searchable_text = " ".join(filter(None, [ 5782 entry.title, 5783 entry.summary or "", 5784 entry.content or "", 5785 ])).lower() 5786 5787 if query.lower() in searchable_text: 5788 results.append((user.username, entry)) 5789 5790 if limit and len(results) >= limit: 5791 return results 5792 5793 except Exception: 5794 # Skip invalid entries 5795 continue 5796 5797 # Sort by updated time (newest first) 5798 results.sort(key=lambda x: x[1].updated, reverse=True) 5799 5800 return results[:limit] if limit else results 5801</file> 5802 5803<file path="ARCH.md"> 5804# Thicket Architecture Design 5805 5806## Overview 5807Thicket is a modern CLI tool for persisting Atom/RSS feeds in a Git repository, designed to enable distributed webblog comment structures. 5808 5809## Technology Stack 5810 5811### Core Libraries 5812 5813#### CLI Framework 5814- **Typer** (0.15.x) - Modern CLI framework with type hints 5815- **Rich** (13.x) - Beautiful terminal output, progress bars, and tables 5816- **prompt-toolkit** - Interactive prompts when needed 5817 5818#### Feed Processing 5819- **feedparser** (6.0.11) - Universal feed parser supporting RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 5820 - Alternative: **atoma** for stricter Atom/RSS parsing with JSON feed support 5821 - Alternative: **fastfeedparser** for high-performance parsing (10x faster) 5822 5823#### Git Integration 5824- **GitPython** (3.1.44) - High-level git operations, requires git CLI 5825 - Alternative: **pygit2** (1.18.0) - Direct libgit2 bindings, better for authentication 5826 5827#### HTTP Client 5828- **httpx** (0.28.x) - Modern async/sync HTTP client with connection pooling 5829- **aiohttp** (3.11.x) - For async-only operations if needed 5830 5831#### Configuration & Data Models 5832- **pydantic** (2.11.x) - Data validation and settings management 5833- **pydantic-settings** (2.10.x) - Configuration file handling with env var support 5834 5835#### Utilities 5836- **pendulum** (3.x) - Better datetime 
handling 5837- **bleach** (6.x) - HTML sanitization for feed content 5838- **platformdirs** (4.x) - Cross-platform directory paths 5839 5840## Project Structure 5841 5842``` 5843thicket/ 5844├── pyproject.toml # Modern Python packaging 5845├── README.md # Project documentation 5846├── ARCH.md # This file 5847├── CLAUDE.md # Project instructions 5848├── .gitignore 5849├── src/ 5850│ └── thicket/ 5851│ ├── __init__.py 5852│ ├── __main__.py # Entry point for `python -m thicket` 5853│ ├── cli/ # CLI commands and interface 5854│ │ ├── __init__.py 5855│ │ ├── main.py # Main CLI app with Typer 5856│ │ ├── commands/ # Subcommands 5857│ │ │ ├── __init__.py 5858│ │ │ ├── init.py # Initialize git store 5859│ │ │ ├── add.py # Add users and feeds 5860│ │ │ ├── sync.py # Sync feeds 5861│ │ │ ├── list_cmd.py # List users/feeds 5862│ │ │ ├── duplicates.py # Manage duplicate entries 5863│ │ │ ├── links_cmd.py # Extract and categorize links 5864│ │ │ └── index_cmd.py # Build reference index and show threads 5865│ │ └── utils.py # CLI utilities (progress, formatting) 5866│ ├── core/ # Core business logic 5867│ │ ├── __init__.py 5868│ │ ├── feed_parser.py # Feed parsing and normalization 5869│ │ ├── git_store.py # Git repository operations 5870│ │ └── reference_parser.py # Link extraction and threading 5871│ ├── models/ # Pydantic data models 5872│ │ ├── __init__.py 5873│ │ ├── config.py # Configuration models 5874│ │ ├── feed.py # Feed/Entry models 5875│ │ └── user.py # User metadata models 5876│ └── utils/ # Shared utilities 5877│ └── __init__.py 5878├── tests/ 5879│ ├── __init__.py 5880│ ├── conftest.py # pytest configuration 5881│ ├── test_feed_parser.py 5882│ ├── test_git_store.py 5883│ └── fixtures/ # Test data 5884│ └── feeds/ 5885└── docs/ 5886 └── examples/ # Example configurations 5887``` 5888 5889## Data Models 5890 5891### Configuration File (YAML/TOML) 5892```python 5893class ThicketConfig(BaseSettings): 5894 git_store: Path # Git repository location 5895 cache_dir: Path 
# Cache directory 5896 users: list[UserConfig] 5897 5898 model_config = SettingsConfigDict( 5899 env_prefix="THICKET_", 5900 env_file=".env", 5901 yaml_file="thicket.yaml" 5902 ) 5903 5904class UserConfig(BaseModel): 5905 username: str 5906 feeds: list[HttpUrl] 5907 email: Optional[EmailStr] = None 5908 homepage: Optional[HttpUrl] = None 5909 icon: Optional[HttpUrl] = None 5910 display_name: Optional[str] = None 5911``` 5912 5913### Feed Storage Format 5914```python 5915class AtomEntry(BaseModel): 5916 id: str # Original Atom ID 5917 title: str 5918 link: HttpUrl 5919 updated: datetime 5920 published: Optional[datetime] 5921 summary: Optional[str] 5922 content: Optional[str] # Full body content from Atom entry 5923 content_type: Optional[str] = "html" # text, html, xhtml 5924 author: Optional[dict] 5925 categories: list[str] = [] 5926 rights: Optional[str] = None # Copyright info 5927 source: Optional[str] = None # Source feed URL 5928 # Additional Atom fields preserved during RSS->Atom conversion 5929 5930 model_config = ConfigDict( 5931 json_encoders={ 5932 datetime: lambda v: v.isoformat() 5933 } 5934 ) 5935 5936class DuplicateMap(BaseModel): 5937 """Maps duplicate entry IDs to canonical entry IDs""" 5938 duplicates: dict[str, str] = {} # duplicate_id -> canonical_id 5939 comment: str = "Entry IDs that map to the same canonical content" 5940 5941 def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None: 5942 """Add a duplicate mapping""" 5943 self.duplicates[duplicate_id] = canonical_id 5944 5945 def remove_duplicate(self, duplicate_id: str) -> bool: 5946 """Remove a duplicate mapping. 
Returns True if existed.""" 5947 return self.duplicates.pop(duplicate_id, None) is not None 5948 5949 def get_canonical(self, entry_id: str) -> str: 5950 """Get canonical ID for an entry (returns original if not duplicate)""" 5951 return self.duplicates.get(entry_id, entry_id) 5952 5953 def is_duplicate(self, entry_id: str) -> bool: 5954 """Check if entry ID is marked as duplicate""" 5955 return entry_id in self.duplicates 5956``` 5957 5958## Git Repository Structure 5959``` 5960git-store/ 5961├── index.json # User directory index 5962├── duplicates.json # Manual curation of duplicate entries 5963├── links.json # Unified links, references, and mapping data 5964├── user1/ 5965│ ├── entry_id_1.json # Sanitized entry files 5966│ ├── entry_id_2.json 5967│ └── ... 5968└── user2/ 5969 └── ... 5970``` 5971 5972## Key Design Decisions 5973 5974### 1. Feed Normalization & Auto-Discovery 5975- All RSS feeds converted to Atom format before storage 5976- Preserves maximum metadata during conversion 5977- Sanitizes HTML content to prevent XSS 5978- **Auto-discovery**: Extracts user metadata from feed during `add user` command 5979 5980### 2. ID Sanitization 5981- Consistent algorithm to convert Atom IDs to safe filenames 5982- Handles edge cases (very long IDs, special characters) 5983- Maintains reversibility where possible 5984 5985### 3. Git Operations 5986- Uses GitPython for simplicity (no authentication required) 5987- Single main branch for all users and entries 5988- Atomic commits per sync operation 5989- Meaningful commit messages with feed update summaries 5990- Preserves complete history - never delete entries even if they disappear from feeds 5991 5992### 4. Caching Strategy 5993- HTTP caching with Last-Modified/ETag support 5994- Local cache of parsed feeds with TTL 5995- Cache invalidation on configuration changes 5996- Git store serves as permanent historical archive beyond feed depth limits 5997 5998### 5. 
Error Handling 5999- Graceful handling of feed parsing errors 6000- Retry logic for network failures 6001- Clear error messages with recovery suggestions 6002 6003## CLI Command Structure 6004 6005```bash 6006# Initialize a new git store 6007thicket init /path/to/store 6008 6009# Add a user with feeds (auto-discovers metadata from feed) 6010thicket add user "alyssa" \ 6011 --feed "https://example.com/feed.atom" 6012 # Auto-populates: email, homepage, icon, display_name from feed metadata 6013 6014# Add a user with manual overrides 6015thicket add user "alyssa" \ 6016 --feed "https://example.com/feed.atom" \ 6017 --email "alyssa@example.com" \ 6018 --homepage "https://alyssa.example.com" \ 6019 --icon "https://example.com/avatar.png" \ 6020 --display-name "Alyssa P. Hacker" 6021 6022# Add additional feed to existing user 6023thicket add feed "alyssa" "https://example.com/other-feed.rss" 6024 6025# Sync all feeds (designed for cron usage) 6026thicket sync --all 6027 6028# Sync specific user 6029thicket sync --user alyssa 6030 6031# List users and their feeds 6032thicket list users 6033thicket list feeds --user alyssa 6034 6035# Manage duplicate entries 6036thicket duplicates list 6037thicket duplicates add <entry_id_1> <entry_id_2> # Mark as duplicates 6038thicket duplicates remove <entry_id_1> <entry_id_2> # Unmark duplicates 6039 6040# Link processing and threading 6041thicket links --verbose # Extract and categorize all links 6042thicket index --verbose # Build reference index for threading 6043thicket threads # Show conversation threads 6044thicket threads --username user1 # Show threads for specific user 6045thicket threads --min-size 3 # Show threads with minimum size 6046``` 6047 6048## Performance Considerations 6049 60501. **Concurrent Feed Fetching**: Use httpx with asyncio for parallel downloads 60512. **Incremental Updates**: Only fetch/parse feeds that have changed 60523. 
**Efficient Git Operations**: Batch commits, use shallow clones where appropriate 60534. **Progress Feedback**: Rich progress bars for long operations 6054 6055## Security Considerations 6056 60571. **HTML Sanitization**: Use bleach to clean feed content 60582. **URL Validation**: Strict validation of feed URLs 60593. **Git Security**: No credentials stored in repository 60604. **Path Traversal**: Careful sanitization of filenames 6061 6062## Future Enhancements 6063 60641. **Web Interface**: Optional web UI for browsing the git store 60652. **Webhooks**: Notify external services on feed updates 60663. **Feed Discovery**: Auto-discover feeds from HTML pages 60674. **Export Formats**: Generate static sites, OPML exports 60685. **Federation**: P2P sync between thicket instances 6069 6070## Requirements Clarification 6071 6072**✓ Resolved Requirements:** 60731. **Feed Update Frequency**: Designed for cron usage - no built-in scheduling needed 60742. **Duplicate Handling**: Manual curation via `duplicates.json` file with CLI commands 60753. **Git Branching**: Single main branch for all users and entries 60764. **Authentication**: No feeds require authentication currently 60775. **Content Storage**: Store complete Atom entry body content as provided 60786. **Deleted Entries**: Preserve all entries in Git store permanently (historical archive) 60797. **History Depth**: Git store maintains full history beyond feed depth limits 60808. 
**Feed Auto-Discovery**: Extract user metadata from feed during `add user` command 6081 6082## Duplicate Entry Management 6083 6084### Duplicate Detection Strategy 6085- **Manual Curation**: Duplicates identified and managed manually via CLI 6086- **Storage**: `duplicates.json` file in Git root maps entry IDs to canonical entries 6087- **Structure**: `{"duplicate_id": "canonical_id", ...}` 6088- **CLI Commands**: Add/remove duplicate mappings with validation 6089- **Query Resolution**: Search/list commands resolve duplicates to canonical entries 6090 6091### Duplicate File Format 6092```json 6093{ 6094 "https://example.com/feed/entry/123": "https://canonical.com/posts/same-post", 6095 "https://mirror.com/articles/456": "https://canonical.com/posts/same-post", 6096 "comment": "Entry IDs that map to the same canonical content" 6097} 6098``` 6099 6100## Feed Metadata Auto-Discovery 6101 6102### Extraction Strategy 6103When adding a new user with `thicket add user`, the system fetches and parses the feed to extract: 6104 6105- **Display Name**: From `feed.title` or `feed.author.name` 6106- **Email**: From `feed.author.email` or `feed.managingEditor` 6107- **Homepage**: From `feed.link` or `feed.author.uri` 6108- **Icon**: From `feed.logo`, `feed.icon`, or `feed.image.url` 6109 6110### Discovery Priority Order 61111. **Author Information**: Prefer `feed.author.*` fields (more specific to person) 61122. **Feed-Level**: Fall back to feed-level metadata 61133. **Manual Override**: CLI flags always take precedence over discovered values 61144. 
**Update Behavior**: Auto-discovery only runs during initial `add user`, not on sync 6115 6116### Extracted Metadata Format 6117```python 6118class FeedMetadata(BaseModel): 6119 title: Optional[str] = None 6120 author_name: Optional[str] = None 6121 author_email: Optional[EmailStr] = None 6122 author_uri: Optional[HttpUrl] = None 6123 link: Optional[HttpUrl] = None 6124 logo: Optional[HttpUrl] = None 6125 icon: Optional[HttpUrl] = None 6126 image_url: Optional[HttpUrl] = None 6127 6128 def to_user_config(self, username: str, feed_url: HttpUrl) -> UserConfig: 6129 """Convert discovered metadata to UserConfig with fallbacks""" 6130 return UserConfig( 6131 username=username, 6132 feeds=[feed_url], 6133 display_name=self.author_name or self.title, 6134 email=self.author_email, 6135 homepage=self.author_uri or self.link, 6136 icon=self.logo or self.icon or self.image_url 6137 ) 6138``` 6139 6140## Link Processing and Threading Architecture 6141 6142### Overview 6143The thicket system implements a sophisticated link processing and threading system to create email-style threaded views of blog entries by tracking cross-references between different blogs. 6144 6145### Link Processing Pipeline 6146 6147#### 1. Link Extraction (`thicket links`) 6148The `links` command systematically extracts all outbound links from blog entries and categorizes them: 6149 6150```python 6151class LinkData(BaseModel): 6152 url: str # Fully resolved URL 6153 entry_id: str # Source entry ID 6154 username: str # Source username 6155 context: str # Surrounding text context 6156 category: str # "internal", "user", or "unknown" 6157 target_username: Optional[str] # Target user if applicable 6158``` 6159 6160**Link Categories:** 6161- **Internal**: Links to the same user's domain (self-references) 6162- **User**: Links to other tracked users' domains 6163- **Unknown**: Links to external sites not tracked by thicket 6164 6165#### 2. 
URL Resolution 6166All links are properly resolved using the Atom feed's base URL to handle: 6167- Relative URLs (converted to absolute) 6168- Protocol-relative URLs 6169- Fragment identifiers 6170- Redirects and canonical URLs 6171 6172#### 3. Domain Mapping 6173The system builds a comprehensive domain mapping from user configuration: 6174- Feed URLs → domain extraction 6175- Homepage URLs → domain extraction 6176- Reverse mapping: domain → username 6177 6178### Threading System 6179 6180#### 1. Reference Index Generation (`thicket index`) 6181Creates a bidirectional reference index from the categorized links: 6182 6183```python 6184class BlogReference(BaseModel): 6185 source_entry_id: str 6186 source_username: str 6187 target_url: str 6188 target_username: Optional[str] 6189 target_entry_id: Optional[str] 6190 context: str 6191``` 6192 6193#### 2. Thread Detection Algorithm 6194Uses graph traversal to find connected blog entries: 6195- **Outbound references**: Links from an entry to other entries 6196- **Inbound references**: Links to an entry from other entries 6197- **Thread members**: All entries connected through references 6198 6199#### 3. 
Threading Display (`thicket threads`) 6200Creates email-style threaded views: 6201- Chronological ordering within threads 6202- Reference counts (outbound/inbound) 6203- Context preservation 6204- Filtering options (user, entry, minimum size) 6205 6206### Data Structures 6207 6208#### links.json Format (Unified Structure) 6209```json 6210{ 6211 "links": { 6212 "https://example.com/post/123": { 6213 "referencing_entries": ["https://blog.user.com/entry/456"], 6214 "target_username": "user2" 6215 }, 6216 "https://external-site.com/article": { 6217 "referencing_entries": ["https://blog.user.com/entry/789"] 6218 } 6219 }, 6220 "reverse_mapping": { 6221 "https://blog.user.com/entry/456": ["https://example.com/post/123"], 6222 "https://blog.user.com/entry/789": ["https://external-site.com/article"] 6223 }, 6224 "references": [ 6225 { 6226 "source_entry_id": "https://blog.user.com/entry/456", 6227 "source_username": "user1", 6228 "target_url": "https://example.com/post/123", 6229 "target_username": "user2", 6230 "target_entry_id": "https://example.com/post/123", 6231 "context": "As mentioned in this post..." 6232 } 6233 ], 6234 "user_domains": { 6235 "user1": ["blog.user.com"], 6236 "user2": ["example.com"] 6237 } 6238} 6239``` 6240 6241This unified structure eliminates duplication by: 6242- Storing each URL only once with minimal metadata 6243- Including all link data, reference data, and mappings in one file 6244- Using presence of `target_username` to identify tracked vs external links 6245- Providing bidirectional mappings for efficient queries 6246 6247### Unified Structure Benefits 6248 6249- **Eliminates Duplication**: Each URL appears only once with metadata 6250- **Single Source of Truth**: All link-related data in one file 6251- **Efficient Queries**: Fast lookups for both directions (URL→entries, entry→URLs) 6252- **Atomic Updates**: All link data changes together 6253- **Reduced I/O**: Fewer file operations 6254 6255### Implementation Benefits 6256 62571. 
**Systematic Link Processing**: All links are extracted and categorized consistently 62582. **Proper URL Resolution**: Handles relative URLs and base URL resolution correctly 62593. **Domain-based Categorization**: Automatically identifies user-to-user references 62604. **Bidirectional Indexing**: Supports both "who links to whom" and "who is linked by whom" 62615. **Thread Discovery**: Finds conversation threads automatically 62626. **Rich Context**: Preserves surrounding text for each link 62637. **Performance**: Pre-computed indexes for fast threading queries 6264 6265### CLI Commands 6266 6267```bash 6268# Extract and categorize all links 6269thicket links --verbose 6270 6271# Build reference index for threading 6272thicket index --verbose 6273 6274# Show all conversation threads 6275thicket threads 6276 6277# Show threads for specific user 6278thicket threads --username user1 6279 6280# Show threads with minimum size 6281thicket threads --min-size 3 6282``` 6283 6284### Integration with Existing Commands 6285 6286The link processing system integrates seamlessly with existing thicket commands: 6287- `thicket sync` updates entries, requiring `thicket links` to be run afterward 6288- `thicket index` uses the output from `thicket links` for improved accuracy 6289- `thicket threads` provides the user-facing threading interface 6290 6291## Current Implementation Status 6292 6293### ✅ Completed Features 62941. **Core Infrastructure** 6295 - Modern CLI with Typer and Rich 6296 - Pydantic data models for type safety 6297 - Git repository operations with GitPython 6298 - Feed parsing and normalization with feedparser 6299 63002. **User and Feed Management** 6301 - `thicket init` - Initialize git store 6302 - `thicket add` - Add users and feeds with auto-discovery 6303 - `thicket sync` - Sync feeds with progress tracking 6304 - `thicket list` - List users, feeds, and entries 6305 - `thicket duplicates` - Manage duplicate entries 6306 63073. 
**Link Processing and Threading** 6308 - `thicket links` - Extract and categorize all outbound links 6309 - `thicket index` - Build reference index from links 6310 - `thicket threads` - Display threaded conversation views 6311 - Proper URL resolution with base URL handling 6312 - Domain-based link categorization 6313 - Context preservation for links 6314 6315### 📊 System Performance 6316- **Link Extraction**: Successfully processes thousands of blog entries 6317- **Categorization**: Identifies internal, user, and unknown links 6318- **Threading**: Creates email-style threaded views of conversations 6319- **Storage**: Efficient JSON-based data structures for links and references 6320 6321### 🔧 Current Architecture Highlights 6322- **Modular Design**: Clear separation between CLI, core logic, and models 6323- **Type Safety**: Comprehensive Pydantic models for data validation 6324- **Rich CLI**: Beautiful progress bars, tables, and error handling 6325- **Extensible**: Easy to add new commands and features 6326- **Git Integration**: All data stored in version-controlled JSON files 6327 6328### 🎯 Proven Functionality 6329The system has been tested with real blog data and successfully: 6330- Extracted 14,396 total links from blog entries 6331- Categorized 3,994 internal links, 363 user-to-user links, and 10,039 unknown links 6332- Built comprehensive domain mappings for 16 users across 20 domains 6333- Generated threaded views showing blog conversation patterns 6334 6335### 🚀 Ready for Use 6336The thicket system is now fully functional for: 6337- Maintaining Git repositories of blog feeds 6338- Tracking cross-references between blogs 6339- Creating threaded views of blog conversations 6340- Discovering blog interaction patterns 6341- Building distributed comment systems 6342</file> 6343 6344<file path="src/thicket/cli/utils.py"> 6345"""CLI utilities and helpers.""" 6346 6347from pathlib import Path 6348from typing import Optional 6349 6350import typer 6351from 
console = Console()


def get_tsv_mode() -> bool:
    """Return the global ``--tsv`` flag set by the main CLI callback."""
    # Imported lazily to avoid a circular import between cli.main and
    # cli.utils at module load time.
    from .main import tsv_mode
    return tsv_mode


def _read_config_file(path: Path) -> ThicketConfig:
    """Parse a YAML configuration file into a ThicketConfig.

    Shared helper so the explicit-path and default-path branches of
    load_config() do not duplicate the read/parse/construct sequence.
    """
    import yaml

    with open(path) as f:
        config_data = yaml.safe_load(f)

    return ThicketConfig(**config_data)


def load_config(config_path: Optional[Path] = None) -> ThicketConfig:
    """Load thicket configuration from file or environment.

    Resolution order:
      1. An explicit ``config_path`` (parse errors propagate to the caller,
         matching the original behavior).
      2. ``thicket.yaml`` in the current directory.
      3. Environment variables via the ThicketConfig settings model.

    Raises:
        typer.Exit: when the fallback locations fail to produce a config.
    """
    if config_path and config_path.exists():
        return _read_config_file(config_path)

    try:
        # First try to find thicket.yaml in the current directory.
        default_config = Path("thicket.yaml")
        if default_config.exists():
            return _read_config_file(default_config)

        # Fall back to environment variables.
        return ThicketConfig()
    except Exception as e:
        console.print(f"[red]Error loading configuration: {e}[/red]")
        console.print("[yellow]Run 'thicket init' to create a new configuration.[/yellow]")
        raise typer.Exit(1) from e


def save_config(config: ThicketConfig, config_path: Path) -> None:
    """Write the configuration to ``config_path`` as YAML."""
    import yaml

    config_data = config.model_dump(mode="json", exclude_none=True)

    # Path objects are not YAML-serializable; store them as plain strings.
    config_data["git_store"] = str(config_data["git_store"])
    config_data["cache_dir"] = str(config_data["cache_dir"])

    with open(config_path, "w") as f:
        yaml.dump(config_data, f, default_flow_style=False, sort_keys=False)


def create_progress() -> Progress:
    """Create a transient Rich progress display (spinner + description)."""
    return Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
        transient=True,
    )


def print_users_table(config: ThicketConfig) -> None:
    """Print a table of users and their feeds from the config."""
    if get_tsv_mode():
        print_users_tsv(config)
        return

    table = Table(title="Users and Feeds")
    table.add_column("Username", style="cyan", no_wrap=True)
    table.add_column("Display Name", style="magenta")
    table.add_column("Email", style="blue")
    table.add_column("Homepage", style="green")
    table.add_column("Feeds", style="yellow")

    for user in config.users:
        # One feed URL per line within the cell.
        feeds_str = "\n".join(str(feed) for feed in user.feeds)
        table.add_row(
            user.username,
            user.display_name or "",
            user.email or "",
            str(user.homepage) if user.homepage else "",
            feeds_str,
        )

    console.print(table)


def print_feeds_table(config: ThicketConfig, username: Optional[str] = None) -> None:
    """Print a table of feeds, optionally filtered by username."""
    if get_tsv_mode():
        print_feeds_tsv(config, username)
        return

    table = Table(title=f"Feeds{f' for {username}' if username else ''}")
    table.add_column("Username", style="cyan", no_wrap=True)
    table.add_column("Feed URL", style="blue")
    table.add_column("Status", style="green")

    users = [config.find_user(username)] if username else config.users
    users = [u for u in users if u is not None]

    for user in users:
        for feed in user.feeds:
            table.add_row(
                user.username,
                str(feed),
                "Active",  # TODO: Add actual status checking
            )

    console.print(table)


def confirm_action(message: str, default: bool = False) -> bool:
    """Prompt the user for a yes/no confirmation."""
    return typer.confirm(message, default=default)


def print_success(message: str) -> None:
    """Print a success message."""
    console.print(f"[green]✓[/green] {message}")


def print_error(message: str) -> None:
    """Print an error message."""
    console.print(f"[red]✗[/red] {message}")


def print_warning(message: str) -> None:
    """Print a warning message."""
    console.print(f"[yellow]⚠[/yellow] {message}")


def print_info(message: str) -> None:
    """Print an info message."""
    console.print(f"[blue]ℹ[/blue] {message}")


def print_users_table_from_git(users: list[UserMetadata]) -> None:
    """Print a table of users sourced from the git repository index."""
    if get_tsv_mode():
        print_users_tsv_from_git(users)
        return

    table = Table(title="Users and Feeds")
    table.add_column("Username", style="cyan", no_wrap=True)
    table.add_column("Display Name", style="magenta")
    table.add_column("Email", style="blue")
    table.add_column("Homepage", style="green")
    table.add_column("Feeds", style="yellow")

    for user in users:
        feeds_str = "\n".join(user.feeds)
        table.add_row(
            user.username,
            user.display_name or "",
            user.email or "",
            user.homepage or "",
            feeds_str,
        )

    console.print(table)
user.username, 6544 feed, 6545 "Active", # TODO: Add actual status checking 6546 ) 6547 6548 console.print(table) 6549 6550 6551def print_users_tsv(config: ThicketConfig) -> None: 6552 """Print users in TSV format.""" 6553 print("Username\tDisplay Name\tEmail\tHomepage\tFeeds") 6554 for user in config.users: 6555 feeds_str = ",".join(str(feed) for feed in user.feeds) 6556 print(f"{user.username}\t{user.display_name or ''}\t{user.email or ''}\t{user.homepage or ''}\t{feeds_str}") 6557 6558 6559def print_users_tsv_from_git(users: list[UserMetadata]) -> None: 6560 """Print users from git repository in TSV format.""" 6561 print("Username\tDisplay Name\tEmail\tHomepage\tFeeds") 6562 for user in users: 6563 feeds_str = ",".join(user.feeds) 6564 print(f"{user.username}\t{user.display_name or ''}\t{user.email or ''}\t{user.homepage or ''}\t{feeds_str}") 6565 6566 6567def print_feeds_tsv(config: ThicketConfig, username: Optional[str] = None) -> None: 6568 """Print feeds in TSV format.""" 6569 print("Username\tFeed URL\tStatus") 6570 users = [config.find_user(username)] if username else config.users 6571 users = [u for u in users if u is not None] 6572 6573 for user in users: 6574 for feed in user.feeds: 6575 print(f"{user.username}\t{feed}\tActive") 6576 6577 6578def print_feeds_tsv_from_git(git_store: GitStore, username: Optional[str] = None) -> None: 6579 """Print feeds from git repository in TSV format.""" 6580 print("Username\tFeed URL\tStatus") 6581 6582 if username: 6583 user = git_store.get_user(username) 6584 users = [user] if user else [] 6585 else: 6586 index = git_store._load_index() 6587 users = list(index.users.values()) 6588 6589 for user in users: 6590 for feed in user.feeds: 6591 print(f"{user.username}\t{feed}\tActive") 6592 6593 6594def print_entries_tsv(entries_by_user: list[list], usernames: list[str]) -> None: 6595 """Print entries in TSV format.""" 6596 print("User\tAtom ID\tTitle\tUpdated\tURL") 6597 6598 # Combine all entries with usernames 6599 
all_entries = [] 6600 for entries, username in zip(entries_by_user, usernames): 6601 for entry in entries: 6602 all_entries.append((username, entry)) 6603 6604 # Sort by updated time (newest first) 6605 all_entries.sort(key=lambda x: x[1].updated, reverse=True) 6606 6607 for username, entry in all_entries: 6608 # Format updated time 6609 updated_str = entry.updated.strftime("%Y-%m-%d %H:%M") 6610 6611 # Escape tabs and newlines in title to preserve TSV format 6612 title = entry.title.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ') 6613 6614 print(f"{username}\t{entry.id}\t{title}\t{updated_str}\t{entry.link}") 6615</file> 6616 6617</files>