Manage Atom feeds in a persistent git repository
This file is a merged representation of the entire codebase, combined into a single document by Repomix.

<file_summary>
This section contains a summary of this file.

<purpose>
This file contains a packed representation of the entire repository's contents.
It is designed to be easily consumable by AI systems for analysis, code review,
or other automated processes.
</purpose>
11
12<file_format>
13The content is organized as follows:
141. This summary section
152. Repository information
163. Directory structure
174. Repository files (if enabled)
185. Multiple file entries, each consisting of:
19 - File path as an attribute
20 - Full contents of the file
21</file_format>
22
23<usage_guidelines>
24- This file should be treated as read-only. Any changes should be made to the
25 original repository files, not this packed version.
26- When processing this file, use the file path to distinguish
27 between different files in the repository.
28- Be aware that this file may contain sensitive information. Handle it with
29 the same level of security as you would the original repository.
30</usage_guidelines>
31
32<notes>
33- Some files may have been excluded based on .gitignore rules and Repomix's configuration
34- Binary files are not included in this packed representation. Please refer to the Repository Structure section for a complete list of file paths, including binary files
35- Files matching patterns in .gitignore are excluded
36- Files matching default ignore patterns are excluded
37- Files are sorted by Git change count (files with more changes are at the bottom)
38</notes>
39
40</file_summary>
41
42<directory_structure>
43.claude/
44 settings.local.json
45src/
46 thicket/
47 cli/
48 commands/
49 __init__.py
50 add.py
51 duplicates.py
52 generate.py
53 index_cmd.py
54 info_cmd.py
55 init.py
56 links_cmd.py
57 list_cmd.py
58 sync.py
59 __init__.py
60 main.py
61 utils.py
62 core/
63 __init__.py
64 feed_parser.py
65 git_store.py
66 reference_parser.py
67 models/
68 __init__.py
69 config.py
70 feed.py
71 user.py
72 templates/
73 base.html
74 index.html
75 links.html
76 script.js
77 style.css
78 timeline.html
79 users.html
80 utils/
81 __init__.py
82 __init__.py
83 __main__.py
84.gitignore
85ARCH.md
86CLAUDE.md
87pyproject.toml
88README.md
89</directory_structure>
90
91<files>
92This section contains the contents of the repository's files.
93
94<file path=".claude/settings.local.json">
95{
96 "permissions": {
97 "allow": [
98 "Bash(find:*)",
99 "Bash(uv run:*)",
100 "Bash(grep:*)",
101 "Bash(jq:*)",
102 "Bash(git add:*)",
103 "Bash(ls:*)"
104 ]
105 },
106 "enableAllProjectMcpServers": false
107}
108</file>
109
110<file path="src/thicket/cli/commands/generate.py">
111"""Generate static HTML website from thicket data."""
112
113import base64
114import json
115import re
116import shutil
117from datetime import datetime
118from pathlib import Path
119from typing import Any, Optional, TypedDict, Union
120
121import typer
122from jinja2 import Environment, FileSystemLoader, select_autoescape
123from rich.progress import Progress, SpinnerColumn, TextColumn
124
125from ...core.git_store import GitStore
126from ...models.feed import AtomEntry
127from ...models.user import GitStoreIndex, UserMetadata
128from ..main import app
129from ..utils import console, load_config
130
131
class UserData(TypedDict):
    """Type definition for user data structure."""

    # Stored user record taken from the git store index.
    metadata: UserMetadata
    # Most recent posts for this user as (display_name, entry) pairs
    # (see generate_site, which maps usernames through get_display_name).
    recent_entries: list[tuple[str, AtomEntry]]
137
138
def safe_anchor_id(atom_id: str) -> str:
    """Convert an Atom ID to a safe HTML anchor ID.

    The ID is URL-safe base64 encoded (padding removed) and prefixed with
    ``id`` so the result always begins with a letter, as HTML requires.
    """
    raw = atom_id.encode("utf-8")
    token = base64.urlsafe_b64encode(raw).decode("ascii")
    # '=' padding is stripped: it is not needed for an anchor and is not
    # a valid character in some ID contexts.
    return "id" + token.rstrip("=")
145
146
class WebsiteGenerator:
    """Generate static HTML website from thicket data.

    Loads the repository index, per-user entry JSON files and the
    aggregated ``links.json`` from a ``GitStore``, builds threaded
    conversations from cross-references, and renders the packaged Jinja2
    templates into a static site (timeline, links and users pages plus
    CSS/JS assets).
    """

    def __init__(self, git_store: GitStore, output_dir: Path):
        # Source repository wrapper and destination directory for the site.
        self.git_store = git_store
        self.output_dir = output_dir
        # Templates ship inside the package at src/thicket/templates/.
        self.template_dir = Path(__file__).parent.parent.parent / "templates"

        # Initialize Jinja2 environment
        self.env = Environment(
            loader=FileSystemLoader(self.template_dir),
            autoescape=select_autoescape(["html", "xml"]),
        )

        # Data containers
        self.index: Optional[GitStoreIndex] = None
        self.entries: list[tuple[str, AtomEntry]] = []  # (username, entry)
        self.links_data: Optional[dict[str, Any]] = None
        self.threads: list[list[dict[str, Any]]] = []  # List of threads with metadata

    def get_display_name(self, username: str) -> str:
        """Get display name for a user, falling back to username."""
        if self.index and username in self.index.users:
            user = self.index.users[username]
            return user.display_name or username
        return username

    def get_user_homepage(self, username: str) -> Optional[str]:
        """Get homepage URL for a user, or None when unknown."""
        if self.index and username in self.index.users:
            user = self.index.users[username]
            return str(user.homepage) if user.homepage else None
        return None

    def clean_html_summary(self, content: Optional[str], max_length: int = 200) -> str:
        """Clean HTML content and truncate for display in timeline.

        Strips tags with a regex, collapses whitespace, and truncates at
        roughly ``max_length`` characters, preferring a word boundary.
        """
        if not content:
            return ""

        # Remove HTML tags
        clean_text = re.sub(r"<[^>]+>", " ", content)
        # Replace multiple whitespace with single space
        clean_text = re.sub(r"\s+", " ", clean_text)
        # Strip leading/trailing whitespace
        clean_text = clean_text.strip()

        # Truncate with ellipsis if needed
        if len(clean_text) > max_length:
            # Try to break at word boundary
            truncated = clean_text[:max_length]
            last_space = truncated.rfind(" ")
            if (
                last_space > max_length * 0.8
            ):  # If we can break reasonably close to the limit
                clean_text = truncated[:last_space] + "..."
            else:
                clean_text = truncated + "..."

        return clean_text

    def load_data(self) -> None:
        """Load all data from the git repository.

        Populates ``self.index``, ``self.entries`` (sorted newest first)
        and ``self.links_data``.

        Raises:
            ValueError: If the repository has no index.
        """
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as progress:
            # Load index
            task = progress.add_task("Loading repository index...", total=None)
            # NOTE(review): reaches into GitStore's private _load_index();
            # consider a public accessor on GitStore.
            self.index = self.git_store._load_index()
            if not self.index:
                raise ValueError("No index found in repository")
            progress.update(task, completed=True)

            # Load all entries
            task = progress.add_task("Loading entries...", total=None)
            for username, user_metadata in self.index.users.items():
                user_dir = self.git_store.repo_path / user_metadata.directory
                if user_dir.exists():
                    for entry_file in user_dir.glob("*.json"):
                        # index.json / duplicates.json are bookkeeping files,
                        # not Atom entries.
                        if entry_file.name not in ["index.json", "duplicates.json"]:
                            try:
                                with open(entry_file) as f:
                                    entry_data = json.load(f)
                                entry = AtomEntry(**entry_data)
                                self.entries.append((username, entry))
                            except Exception as e:
                                # Best-effort: a corrupt entry file should not
                                # abort the whole site build.
                                console.print(
                                    f"[yellow]Warning: Failed to load {entry_file}: {e}[/yellow]"
                                )
            progress.update(task, completed=True)

            # Sort entries by date (newest first) - prioritize updated over published
            # NOTE(review): datetime.min is naive; if entry timestamps are
            # timezone-aware this comparison raises TypeError - confirm.
            self.entries.sort(
                key=lambda x: x[1].updated or x[1].published or datetime.min, reverse=True
            )

            # Load links data
            task = progress.add_task("Loading links and references...", total=None)
            links_file = self.git_store.repo_path / "links.json"
            if links_file.exists():
                with open(links_file) as f:
                    self.links_data = json.load(f)
            progress.update(task, completed=True)

    def build_threads(self) -> None:
        """Build threaded conversations from references.

        Side effects: populates ``self.outbound_refs``, ``self.inbound_refs``,
        ``self.reference_details`` and ``self.threads`` (threads sorted by
        most-recent entry, newest first).
        """
        if not self.links_data or "references" not in self.links_data:
            return

        # Map entry IDs to (username, entry) tuples
        entry_map: dict[str, tuple[str, AtomEntry]] = {}
        for username, entry in self.entries:
            entry_map[entry.id] = (username, entry)

        # Build adjacency lists for references
        self.outbound_refs: dict[str, set[str]] = {}
        self.inbound_refs: dict[str, set[str]] = {}
        self.reference_details: dict[
            str, list[dict[str, Any]]
        ] = {}  # Store full reference info

        for ref in self.links_data["references"]:
            source_id = ref["source_entry_id"]
            target_id = ref.get("target_entry_id")

            # Only references whose both ends resolve to loaded entries count.
            if target_id and source_id in entry_map and target_id in entry_map:
                self.outbound_refs.setdefault(source_id, set()).add(target_id)
                self.inbound_refs.setdefault(target_id, set()).add(source_id)

                # Store reference details for UI
                self.reference_details.setdefault(source_id, []).append(
                    {
                        "target_id": target_id,
                        "target_username": ref.get("target_username"),
                        "type": "outbound",
                    }
                )
                self.reference_details.setdefault(target_id, []).append(
                    {
                        "source_id": source_id,
                        "source_username": ref.get("source_username"),
                        "type": "inbound",
                    }
                )

        # Find conversation threads (multi-post discussions)
        processed = set()

        for entry_id, (_username, _entry) in entry_map.items():
            if entry_id in processed:
                continue

            # Build thread starting from this entry
            thread = []
            to_visit = [entry_id]
            thread_ids = set()
            level_map: dict[str, int] = {}  # Track levels for this thread

            # First, traverse up to find the root
            current = entry_id
            while current in self.inbound_refs:
                parents = self.inbound_refs[current] - {
                    current
                }  # Exclude self-references
                if not parents:
                    break
                # Take the first parent
                parent = next(iter(parents))
                # NOTE(review): thread_ids is still empty during this upward
                # walk (it is only filled in the downward pass below), so this
                # cycle guard can never fire here; a reference cycle could
                # loop - confirm against real data.
                if parent in thread_ids:  # Avoid cycles
                    break
                current = parent
                to_visit.insert(0, current)

            # Now traverse down from the root
            while to_visit:
                current = to_visit.pop(0)
                if current in thread_ids or current not in entry_map:
                    continue

                thread_ids.add(current)
                username, entry = entry_map[current]

                # Calculate thread level
                thread_level = self._calculate_thread_level(current, level_map)

                # Add threading metadata
                thread_entry = {
                    "username": username,
                    "display_name": self.get_display_name(username),
                    "entry": entry,
                    "entry_id": current,
                    "references_to": list(self.outbound_refs.get(current, [])),
                    "referenced_by": list(self.inbound_refs.get(current, [])),
                    "thread_level": thread_level,
                }
                thread.append(thread_entry)
                processed.add(current)

                # Add children
                if current in self.outbound_refs:
                    children = self.outbound_refs[current] - thread_ids  # Avoid cycles
                    to_visit.extend(sorted(children))

            if len(thread) > 1:  # Only keep actual threads
                # Sort thread by date (newest first) - prioritize updated over published
                thread.sort(key=lambda x: x["entry"].updated or x["entry"].published or datetime.min, reverse=True)  # type: ignore
                self.threads.append(thread)

        # Sort threads by the date of their most recent entry - prioritize updated over published
        self.threads.sort(
            key=lambda t: max(
                item["entry"].updated or item["entry"].published or datetime.min for item in t
            ),
            reverse=True,
        )

    def _calculate_thread_level(
        self, entry_id: str, processed_entries: dict[str, int]
    ) -> int:
        """Calculate indentation level for threaded display.

        Memoized via ``processed_entries``; a node's level is one more than
        its deepest already-processed parent, capped at 4.
        """
        if entry_id in processed_entries:
            return processed_entries[entry_id]

        if entry_id not in self.inbound_refs:
            processed_entries[entry_id] = 0
            return 0

        # Only parents already seen in this thread influence the level.
        parents_in_thread = self.inbound_refs[entry_id] & set(processed_entries.keys())
        if not parents_in_thread:
            processed_entries[entry_id] = 0
            return 0

        # Find the deepest parent level + 1
        max_parent_level = 0
        for parent_id in parents_in_thread:
            parent_level = self._calculate_thread_level(parent_id, processed_entries)
            max_parent_level = max(max_parent_level, parent_level)

        level = min(max_parent_level + 1, 4)  # Cap at level 4
        processed_entries[entry_id] = level
        return level

    def get_standalone_references(self) -> list[dict[str, Any]]:
        """Get posts that have references but aren't part of multi-post threads."""
        # build_threads() may not have run (or had no link data).
        if not hasattr(self, "reference_details"):
            return []

        threaded_entry_ids = set()
        for thread in self.threads:
            for item in thread:
                threaded_entry_ids.add(item["entry_id"])

        standalone_refs = []
        for username, entry in self.entries:
            if (
                entry.id in self.reference_details
                and entry.id not in threaded_entry_ids
            ):
                refs = self.reference_details[entry.id]
                # Only include if it has meaningful references (not just self-references)
                meaningful_refs = [
                    r
                    for r in refs
                    if r.get("target_id") != entry.id and r.get("source_id") != entry.id
                ]
                if meaningful_refs:
                    standalone_refs.append(
                        {
                            "username": username,
                            "display_name": self.get_display_name(username),
                            "entry": entry,
                            "references": meaningful_refs,
                        }
                    )

        return standalone_refs

    def _add_cross_thread_links(self, timeline_items: list[dict[str, Any]]) -> None:
        """Add cross-thread linking for entries that appear in multiple threads.

        Mutates ``timeline_items`` in place, attaching ``cross_thread_links``
        and ``shared_references`` to posts/thread items whose entry appears
        at more than one timeline position.
        """
        # Map entry IDs to their positions in the timeline
        entry_positions: dict[str, list[int]] = {}
        # Map URLs referenced by entries to the entries that reference them
        # NOTE(review): url_references is populated below but never read in
        # this method - apparently dead bookkeeping; confirm before removing.
        url_references: dict[str, list[tuple[str, int]]] = {}  # url -> [(entry_id, position)]

        # First pass: collect all entry IDs, their positions, and referenced URLs
        for i, item in enumerate(timeline_items):
            if item["type"] == "post":
                entry_id = item["content"]["entry"].id
                entry_positions.setdefault(entry_id, []).append(i)
                # Track URLs this entry references
                if entry_id in self.reference_details:
                    for ref in self.reference_details[entry_id]:
                        if ref["type"] == "outbound" and "target_id" in ref:
                            # Find the target entry's URL if available
                            target_entry = self._find_entry_by_id(ref["target_id"])
                            if target_entry and target_entry.link:
                                url = str(target_entry.link)
                                url_references.setdefault(url, []).append((entry_id, i))
            elif item["type"] == "thread":
                for thread_item in item["content"]:
                    entry_id = thread_item["entry"].id
                    entry_positions.setdefault(entry_id, []).append(i)
                    # Track URLs this entry references
                    if entry_id in self.reference_details:
                        for ref in self.reference_details[entry_id]:
                            if ref["type"] == "outbound" and "target_id" in ref:
                                target_entry = self._find_entry_by_id(ref["target_id"])
                                if target_entry and target_entry.link:
                                    url = str(target_entry.link)
                                    url_references.setdefault(url, []).append((entry_id, i))

        # Build cross-thread connections - only for entries that actually appear multiple times
        cross_thread_connections: dict[str, set[int]] = {}  # entry_id -> set of timeline positions

        # Add connections ONLY for entries that appear multiple times in the timeline
        for entry_id, positions in entry_positions.items():
            if len(positions) > 1:
                cross_thread_connections[entry_id] = set(positions)
                # Debug: uncomment to see which entries have multiple appearances
                # print(f"Entry {entry_id[:50]}... appears at positions: {positions}")

        # Apply cross-thread links to timeline items
        for entry_id, positions_set in cross_thread_connections.items():
            positions_list = list(positions_set)
            for pos in positions_list:
                item = timeline_items[pos]
                other_positions = sorted([p for p in positions_list if p != pos])

                if item["type"] == "post":
                    # Add cross-thread info to individual posts
                    item["content"]["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items)
                    # Add info about shared references
                    item["content"]["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items)
                elif item["type"] == "thread":
                    # Add cross-thread info to thread items
                    for thread_item in item["content"]:
                        if thread_item["entry"].id == entry_id:
                            thread_item["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items)
                            thread_item["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items)
                            break

    def _build_cross_thread_link_data(self, entry_id: str, other_positions: list[int], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Build detailed cross-thread link data with anchor information.

        Anchor IDs here must match the anchors emitted by the templates
        (``post-<pos>-<safe_id>`` / ``post-<pos>-<idx>-<safe_id>``) -
        confirm against timeline.html when changing either side.
        """
        cross_thread_links = []

        for pos in other_positions:
            item = timeline_items[pos]
            if item["type"] == "post":
                # For individual posts
                safe_id = safe_anchor_id(entry_id)
                cross_thread_links.append({
                    "position": pos,
                    "anchor_id": f"post-{pos}-{safe_id}",
                    "context": "individual post",
                    "title": item["content"]["entry"].title
                })
            elif item["type"] == "thread":
                # For thread items, find the specific thread item
                for thread_idx, thread_item in enumerate(item["content"]):
                    if thread_item["entry"].id == entry_id:
                        safe_id = safe_anchor_id(entry_id)
                        cross_thread_links.append({
                            "position": pos,
                            "anchor_id": f"post-{pos}-{thread_idx}-{safe_id}",
                            "context": f"thread (level {thread_item.get('thread_level', 0)})",
                            "title": thread_item["entry"].title
                        })
                        break

        return cross_thread_links

    def _find_entry_by_id(self, entry_id: str) -> Optional[AtomEntry]:
        """Find an entry by its ID.

        Linear scan over all entries; callers invoke this in loops, so this
        is O(n*m) overall - acceptable for current data sizes.
        """
        for _username, entry in self.entries:
            if entry.id == entry_id:
                return entry
        return None

    def _get_shared_references(self, entry_id: str, positions: Union[set[int], list[int]], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Get information about shared references between cross-thread entries.

        Returns URL records referenced from more than one distinct entry at
        the given timeline positions, sorted by reference count descending.
        """
        shared_refs = []

        # Collect all referenced URLs from entries at these positions
        url_counts: dict[str, int] = {}
        referencing_entries: dict[str, list[str]] = {}  # url -> [entry_ids]

        for pos in positions:
            item = timeline_items[pos]
            entries_to_check = []

            if item["type"] == "post":
                entries_to_check.append(item["content"]["entry"])
            elif item["type"] == "thread":
                entries_to_check.extend([ti["entry"] for ti in item["content"]])

            for entry in entries_to_check:
                if entry.id in self.reference_details:
                    for ref in self.reference_details[entry.id]:
                        if ref["type"] == "outbound" and "target_id" in ref:
                            target_entry = self._find_entry_by_id(ref["target_id"])
                            if target_entry and target_entry.link:
                                url = str(target_entry.link)
                                url_counts[url] = url_counts.get(url, 0) + 1
                                if url not in referencing_entries:
                                    referencing_entries[url] = []
                                if entry.id not in referencing_entries[url]:
                                    referencing_entries[url].append(entry.id)

        # Find URLs referenced by multiple entries
        for url, count in url_counts.items():
            if count > 1 and len(referencing_entries[url]) > 1:
                # Get the target entry info
                target_entry = None
                target_username = None
                for ref in (self.links_data or {}).get("references", []):
                    if ref.get("target_url") == url:
                        target_username = ref.get("target_username")
                        if ref.get("target_entry_id"):
                            target_entry = self._find_entry_by_id(ref["target_entry_id"])
                        break

                shared_refs.append({
                    "url": url,
                    "count": count,
                    "referencing_entries": referencing_entries[url],
                    "target_username": target_username,
                    "target_title": target_entry.title if target_entry else None
                })

        return sorted(shared_refs, key=lambda x: x["count"], reverse=True)

    def generate_site(self) -> None:
        """Generate the static website.

        Renders CSS/JS assets plus timeline.html, links.html, users.html and
        index.html into ``self.output_dir``. Expects load_data() and
        build_threads() to have run first.
        """
        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Create static directories
        (self.output_dir / "css").mkdir(exist_ok=True)
        (self.output_dir / "js").mkdir(exist_ok=True)

        # Generate CSS
        css_template = self.env.get_template("style.css")
        css_content = css_template.render()
        with open(self.output_dir / "css" / "style.css", "w") as f:
            f.write(css_content)

        # Generate JavaScript
        js_template = self.env.get_template("script.js")
        js_content = js_template.render()
        with open(self.output_dir / "js" / "script.js", "w") as f:
            f.write(js_content)

        # Prepare common template data
        base_data = {
            "title": "Energy & Environment Group",
            "generated_at": datetime.now().isoformat(),
            "get_display_name": self.get_display_name,
            "get_user_homepage": self.get_user_homepage,
            "clean_html_summary": self.clean_html_summary,
            "safe_anchor_id": safe_anchor_id,
        }

        # Build unified timeline
        timeline_items = []

        # Only consider the threads that will actually be displayed
        displayed_threads = self.threads[:20]  # Limit to 20 threads

        # Track which entries are part of displayed threads
        threaded_entry_ids = set()
        for thread in displayed_threads:
            for item in thread:
                threaded_entry_ids.add(item["entry_id"])

        # Add threads to timeline (using the date of the most recent post)
        for thread in displayed_threads:
            most_recent_date = max(
                item["entry"].updated or item["entry"].published or datetime.min
                for item in thread
            )
            timeline_items.append({
                "type": "thread",
                "date": most_recent_date,
                "content": thread
            })

        # Add individual posts (not in threads)
        for username, entry in self.entries[:50]:
            if entry.id not in threaded_entry_ids:
                # Check if this entry has references
                has_refs = (
                    entry.id in self.reference_details
                    if hasattr(self, "reference_details")
                    else False
                )

                refs = []
                if has_refs:
                    refs = self.reference_details.get(entry.id, [])
                    # Drop pure self-references; they carry no information.
                    refs = [
                        r for r in refs
                        if r.get("target_id") != entry.id
                        and r.get("source_id") != entry.id
                    ]

                timeline_items.append({
                    "type": "post",
                    "date": entry.updated or entry.published or datetime.min,
                    "content": {
                        "username": username,
                        "display_name": self.get_display_name(username),
                        "entry": entry,
                        "references": refs if refs else None
                    }
                })

        # Sort unified timeline by date (newest first)
        timeline_items.sort(key=lambda x: x["date"], reverse=True)

        # Limit timeline to what will actually be rendered
        timeline_items = timeline_items[:50]  # Limit to 50 items total

        # Add cross-thread linking for repeat blog references
        self._add_cross_thread_links(timeline_items)

        # Prepare outgoing links data
        outgoing_links = []
        if self.links_data and "links" in self.links_data:
            for url, link_info in self.links_data["links"].items():
                referencing_entries = []
                for entry_id in link_info.get("referencing_entries", []):
                    for username, entry in self.entries:
                        if entry.id == entry_id:
                            referencing_entries.append(
                                (self.get_display_name(username), entry)
                            )
                            break

                if referencing_entries:
                    # Sort by date - prioritize updated over published
                    referencing_entries.sort(
                        key=lambda x: x[1].updated or x[1].published or datetime.min, reverse=True
                    )
                    outgoing_links.append(
                        {
                            "url": url,
                            "target_username": link_info.get("target_username"),
                            "entries": referencing_entries,
                        }
                    )

        # Sort links by most recent reference - prioritize updated over published
        outgoing_links.sort(
            key=lambda x: x["entries"][0][1].updated
            or x["entries"][0][1].published or datetime.min,
            reverse=True,
        )

        # Prepare users data
        users: list[UserData] = []
        if self.index:
            for username, user_metadata in self.index.users.items():
                # Get recent entries for this user with display names
                user_entries = [
                    (self.get_display_name(u), e)
                    for u, e in self.entries
                    if u == username
                ][:5]
                users.append(
                    {"metadata": user_metadata, "recent_entries": user_entries}
                )
            # Sort by entry count
            users.sort(key=lambda x: x["metadata"].entry_count, reverse=True)

        # Generate timeline page
        timeline_template = self.env.get_template("timeline.html")
        timeline_content = timeline_template.render(
            **base_data,
            page="timeline",
            timeline_items=timeline_items,  # Already limited above
        )
        with open(self.output_dir / "timeline.html", "w") as f:
            f.write(timeline_content)

        # Generate links page
        links_template = self.env.get_template("links.html")
        links_content = links_template.render(
            **base_data,
            page="links",
            outgoing_links=outgoing_links[:100],
        )
        with open(self.output_dir / "links.html", "w") as f:
            f.write(links_content)

        # Generate users page
        users_template = self.env.get_template("users.html")
        users_content = users_template.render(
            **base_data,
            page="users",
            users=users,
        )
        with open(self.output_dir / "users.html", "w") as f:
            f.write(users_content)

        # Generate main index page (redirect to timeline)
        index_template = self.env.get_template("index.html")
        index_content = index_template.render(**base_data)
        with open(self.output_dir / "index.html", "w") as f:
            f.write(index_content)

        console.print(f"[green]✓[/green] Generated website at {self.output_dir}")
        console.print(f" - {len(self.entries)} entries")
        console.print(f" - {len(self.threads)} conversation threads")
        console.print(f" - {len(outgoing_links)} outgoing links")
        console.print(f" - {len(users)} users")
        console.print(
            " - Generated pages: index.html, timeline.html, links.html, users.html"
        )
766
767
@app.command()
def generate(
    output: Path = typer.Option(
        Path("./thicket-site"),
        "--output",
        "-o",
        help="Output directory for the generated website",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Overwrite existing output directory"
    ),
    config_file: Path = typer.Option(
        Path("thicket.yaml"), "--config", help="Configuration file path"
    ),
) -> None:
    """Generate a static HTML website from thicket data."""
    config = load_config(config_file)

    # A configured git store is a hard requirement for site generation.
    if not config.git_store:
        console.print("[red]No git store path configured[/red]")
        raise typer.Exit(1)

    git_store = GitStore(config.git_store)

    # Refuse to clobber an existing output directory unless --force was
    # given; with --force, remove it so the site is rebuilt from scratch.
    if output.exists():
        if not force:
            console.print(
                f"[red]Output directory {output} already exists. Use --force to overwrite.[/red]"
            )
            raise typer.Exit(1)
        shutil.rmtree(output)

    try:
        builder = WebsiteGenerator(git_store, output)

        console.print("[bold]Generating static website...[/bold]")
        builder.load_data()
        builder.build_threads()
        builder.generate_site()

    except Exception as e:
        # Surface the failure to the user and exit non-zero, chaining the cause.
        console.print(f"[red]Error generating website: {e}[/red]")
        raise typer.Exit(1) from e
814</file>
815
816<file path="src/thicket/templates/base.html">
{# Base layout shared by all generated pages: common <head>, site header
   with navigation (active tab driven by the `page` variable), content
   block for child templates, and a footer stamped with `generated_at`. #}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block page_title %}{{ title }}{% endblock %}</title>
    <link rel="stylesheet" href="css/style.css">
</head>
<body>
    <header class="site-header">
        <div class="header-content">
            <h1 class="site-title">{{ title }}</h1>
            {# `page` is passed by generate_site() and highlights the current tab. #}
            <nav class="site-nav">
                <a href="timeline.html" class="nav-link {% if page == 'timeline' %}active{% endif %}">Timeline</a>
                <a href="links.html" class="nav-link {% if page == 'links' %}active{% endif %}">Links</a>
                <a href="users.html" class="nav-link {% if page == 'users' %}active{% endif %}">Users</a>
            </nav>
        </div>
    </header>

    <main class="main-content">
        {% block content %}{% endblock %}
    </main>

    <footer class="site-footer">
        <p>Generated on {{ generated_at }} by <a href="https://github.com/avsm/thicket">Thicket</a></p>
    </footer>

    <script src="js/script.js"></script>
</body>
</html>
848</file>
849
850<file path="src/thicket/templates/index.html">
{# Site root page: immediately redirects to timeline.html via meta refresh,
   with a plain link as fallback for clients that ignore the refresh. #}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{{ title }}</title>
    <meta http-equiv="refresh" content="0; url=timeline.html">
    <link rel="canonical" href="timeline.html">
</head>
<body>
    <p>Redirecting to <a href="timeline.html">Timeline</a>...</p>
</body>
</html>
864</file>
865
866<file path="src/thicket/templates/links.html">
{# Outgoing-links page: one article per external URL, newest-referenced
   first, listing up to five referencing posts with author and date.
   `outgoing_links` is prepared (and pre-sorted) by generate_site(). #}
{% extends "base.html" %}

{% block page_title %}Outgoing Links - {{ title }}{% endblock %}

{% block content %}
<div class="page-content">
    <h2>Outgoing Links</h2>
    <p class="page-description">External links referenced in blog posts, ordered by most recent reference.</p>

    {% for link in outgoing_links %}
    <article class="link-group">
        <h3 class="link-url">
            <a href="{{ link.url }}" target="_blank">{{ link.url|truncate(80) }}</a>
            {% if link.target_username %}
            <span class="target-user">({{ link.target_username }})</span>
            {% endif %}
        </h3>
        <div class="referencing-entries">
            <span class="ref-count">Referenced in {{ link.entries|length }} post(s):</span>
            <ul>
                {# Show at most five referencing posts; summarize the rest. #}
                {% for display_name, entry in link.entries[:5] %}
                <li>
                    <span class="author">{{ display_name }}</span> -
                    <a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a>
                    <time datetime="{{ entry.updated or entry.published }}">
                        ({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }})
                    </time>
                </li>
                {% endfor %}
                {% if link.entries|length > 5 %}
                <li class="more">... and {{ link.entries|length - 5 }} more</li>
                {% endif %}
            </ul>
        </div>
    </article>
    {% endfor %}
</div>
{% endblock %}
905</file>
906
907<file path="src/thicket/templates/script.js">
// Enhanced functionality for thicket website.
// Three progressive enhancements, all applied after DOM load:
//   1. click-to-collapse conversation threads,
//   2. relative timestamps on <time> elements,
//   3. smooth scrolling + highlight for in-page anchor links.
document.addEventListener('DOMContentLoaded', function() {

    // Enhance thread collapsing (optional feature)
    const threadHeaders = document.querySelectorAll('.thread-header');
    threadHeaders.forEach(header => {
        header.style.cursor = 'pointer';
        header.addEventListener('click', function() {
            const thread = this.parentElement;
            const entries = thread.querySelectorAll('.thread-entry');

            // Toggle visibility of all but the first entry
            for (let i = 1; i < entries.length; i++) {
                entries[i].style.display = entries[i].style.display === 'none' ? 'block' : 'none';
            }

            // Update thread count text
            const count = this.querySelector('.thread-count');
            if (entries[1] && entries[1].style.display === 'none') {
                count.textContent = count.textContent.replace('posts', 'posts (collapsed)');
            } else {
                count.textContent = count.textContent.replace(' (collapsed)', '');
            }
        });
    });

    // Add relative time display
    const timeElements = document.querySelectorAll('time');
    timeElements.forEach(timeEl => {
        const datetime = new Date(timeEl.getAttribute('datetime'));
        const now = new Date();
        const diffMs = now - datetime;
        const diffDays = Math.floor(diffMs / (1000 * 60 * 60 * 24));

        // Bucket the age into minutes/hours/days/weeks/months/years.
        let relativeTime;
        if (diffDays === 0) {
            const diffHours = Math.floor(diffMs / (1000 * 60 * 60));
            if (diffHours === 0) {
                const diffMinutes = Math.floor(diffMs / (1000 * 60));
                relativeTime = diffMinutes === 0 ? 'just now' : `${diffMinutes}m ago`;
            } else {
                relativeTime = `${diffHours}h ago`;
            }
        } else if (diffDays === 1) {
            relativeTime = 'yesterday';
        } else if (diffDays < 7) {
            relativeTime = `${diffDays}d ago`;
        } else if (diffDays < 30) {
            const weeks = Math.floor(diffDays / 7);
            relativeTime = weeks === 1 ? '1w ago' : `${weeks}w ago`;
        } else if (diffDays < 365) {
            const months = Math.floor(diffDays / 30);
            relativeTime = months === 1 ? '1mo ago' : `${months}mo ago`;
        } else {
            const years = Math.floor(diffDays / 365);
            relativeTime = years === 1 ? '1y ago' : `${years}y ago`;
        }

        // Add relative time as title attribute
        // (the original absolute text is preserved in the tooltip).
        timeEl.setAttribute('title', timeEl.textContent);
        timeEl.textContent = relativeTime;
    });

    // Enhanced anchor link scrolling for shared references
    document.querySelectorAll('a[href^="#"]').forEach(anchor => {
        anchor.addEventListener('click', function (e) {
            e.preventDefault();
            const target = document.querySelector(this.getAttribute('href'));
            if (target) {
                target.scrollIntoView({
                    behavior: 'smooth',
                    block: 'center'
                });

                // Highlight the target briefly
                const timelineEntry = target.closest('.timeline-entry');
                if (timelineEntry) {
                    timelineEntry.style.outline = '2px solid var(--primary-color)';
                    timelineEntry.style.borderRadius = '8px';
                    setTimeout(() => {
                        timelineEntry.style.outline = '';
                        timelineEntry.style.borderRadius = '';
                    }, 2000);
                }
            }
        });
    });
});
996</file>
997
998<file path="src/thicket/templates/style.css">
999/* Modern, clean design with high-density text and readable theme */
1000
1001:root {
1002 --primary-color: #2c3e50;
1003 --secondary-color: #3498db;
1004 --accent-color: #e74c3c;
1005 --background: #ffffff;
1006 --surface: #f8f9fa;
1007 --text-primary: #2c3e50;
1008 --text-secondary: #7f8c8d;
1009 --border-color: #e0e0e0;
1010 --thread-indent: 20px;
1011 --max-width: 1200px;
1012}
1013
1014* {
1015 margin: 0;
1016 padding: 0;
1017 box-sizing: border-box;
1018}
1019
1020body {
1021 font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif;
1022 font-size: 14px;
1023 line-height: 1.6;
1024 color: var(--text-primary);
1025 background-color: var(--background);
1026}
1027
1028/* Header */
1029.site-header {
1030 background-color: var(--surface);
1031 border-bottom: 1px solid var(--border-color);
1032 padding: 0.75rem 0;
1033 position: sticky;
1034 top: 0;
1035 z-index: 100;
1036}
1037
1038.header-content {
1039 max-width: var(--max-width);
1040 margin: 0 auto;
1041 padding: 0 2rem;
1042 display: flex;
1043 justify-content: space-between;
1044 align-items: center;
1045}
1046
1047.site-title {
1048 font-size: 1.5rem;
1049 font-weight: 600;
1050 color: var(--primary-color);
1051 margin: 0;
1052}
1053
1054/* Navigation */
1055.site-nav {
1056 display: flex;
1057 gap: 1.5rem;
1058}
1059
1060.nav-link {
1061 text-decoration: none;
1062 color: var(--text-secondary);
1063 font-weight: 500;
1064 font-size: 0.95rem;
1065 padding: 0.5rem 0.75rem;
1066 border-radius: 4px;
1067 transition: all 0.2s ease;
1068}
1069
1070.nav-link:hover {
1071 color: var(--primary-color);
1072 background-color: var(--background);
1073}
1074
1075.nav-link.active {
1076 color: var(--secondary-color);
1077 background-color: var(--background);
1078 font-weight: 600;
1079}
1080
1081/* Main Content */
1082.main-content {
1083 max-width: var(--max-width);
1084 margin: 2rem auto;
1085 padding: 0 2rem;
1086}
1087
1088.page-content {
1089 margin: 0;
1090}
1091
1092.page-description {
1093 color: var(--text-secondary);
1094 margin-bottom: 1.5rem;
1095 font-style: italic;
1096}
1097
1098/* Sections */
1099section {
1100 margin-bottom: 2rem;
1101}
1102
1103h2 {
1104 font-size: 1.3rem;
1105 font-weight: 600;
1106 margin-bottom: 0.75rem;
1107 color: var(--primary-color);
1108}
1109
1110h3 {
1111 font-size: 1.1rem;
1112 font-weight: 600;
1113 margin-bottom: 0.75rem;
1114 color: var(--primary-color);
1115}
1116
1117/* Entries and Threads */
1118article {
1119 margin-bottom: 1.5rem;
1120 padding: 1rem;
1121 background-color: var(--surface);
1122 border-radius: 4px;
1123 border: 1px solid var(--border-color);
1124}
1125
1126/* Timeline-style entries */
1127.timeline-entry {
1128 margin-bottom: 0.5rem;
1129 padding: 0.5rem 0.75rem;
1130 border: none;
1131 background: transparent;
1132 transition: background-color 0.2s ease;
1133}
1134
1135.timeline-entry:hover {
1136 background-color: var(--surface);
1137}
1138
1139.timeline-meta {
1140 display: inline-flex;
1141 gap: 0.5rem;
1142 align-items: center;
1143 font-size: 0.75rem;
1144 color: var(--text-secondary);
1145 margin-bottom: 0.25rem;
1146}
1147
1148.timeline-time {
1149 font-family: 'SF Mono', Monaco, Consolas, 'Courier New', monospace;
1150 font-size: 0.75rem;
1151 color: var(--text-secondary);
1152}
1153
1154.timeline-author {
1155 font-weight: 600;
1156 color: var(--primary-color);
1157 font-size: 0.8rem;
1158 text-decoration: none;
1159}
1160
1161.timeline-author:hover {
1162 color: var(--secondary-color);
1163 text-decoration: underline;
1164}
1165
1166.timeline-content {
1167 line-height: 1.4;
1168}
1169
1170.timeline-title {
1171 font-size: 0.95rem;
1172 font-weight: 600;
1173}
1174
1175.timeline-title a {
1176 color: var(--primary-color);
1177 text-decoration: none;
1178}
1179
1180.timeline-title a:hover {
1181 color: var(--secondary-color);
1182 text-decoration: underline;
1183}
1184
1185.timeline-summary {
1186 color: var(--text-secondary);
1187 font-size: 0.9rem;
1188 line-height: 1.4;
1189}
1190
1191/* Legacy styles for other sections */
1192.entry-meta, .thread-header {
1193 display: flex;
1194 gap: 1rem;
1195 align-items: center;
1196 margin-bottom: 0.5rem;
1197 font-size: 0.85rem;
1198 color: var(--text-secondary);
1199}
1200
1201.author {
1202 font-weight: 600;
1203 color: var(--primary-color);
1204}
1205
1206time {
1207 font-size: 0.85rem;
1208}
1209
1210h4 {
1211 font-size: 1.1rem;
1212 font-weight: 600;
1213 margin-bottom: 0.5rem;
1214}
1215
1216h4 a {
1217 color: var(--primary-color);
1218 text-decoration: none;
1219}
1220
1221h4 a:hover {
1222 color: var(--secondary-color);
1223 text-decoration: underline;
1224}
1225
1226.entry-summary {
1227 color: var(--text-primary);
1228 line-height: 1.5;
1229 margin-top: 0.5rem;
1230}
1231
1232/* Enhanced Threading Styles */
1233
1234/* Conversation Clusters */
1235.conversation-cluster {
1236 background-color: var(--background);
1237 border: 2px solid var(--border-color);
1238 border-radius: 8px;
1239 margin-bottom: 2rem;
1240 overflow: hidden;
1241 box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
1242}
1243
1244.conversation-header {
1245 background: linear-gradient(135deg, var(--surface) 0%, #f1f3f4 100%);
1246 padding: 0.75rem 1rem;
1247 border-bottom: 1px solid var(--border-color);
1248}
1249
1250.conversation-meta {
1251 display: flex;
1252 justify-content: space-between;
1253 align-items: center;
1254 flex-wrap: wrap;
1255 gap: 0.5rem;
1256}
1257
1258.conversation-count {
1259 font-weight: 600;
1260 color: var(--secondary-color);
1261 font-size: 0.9rem;
1262}
1263
1264.conversation-participants {
1265 font-size: 0.8rem;
1266 color: var(--text-secondary);
1267 flex: 1;
1268 text-align: right;
1269}
1270
1271.conversation-flow {
1272 padding: 0.5rem;
1273}
1274
1275/* Threaded Conversation Entries */
1276.conversation-entry {
1277 position: relative;
1278 margin-bottom: 0.75rem;
1279 display: flex;
1280 align-items: flex-start;
1281}
1282
1283.conversation-entry.level-0 {
1284 margin-left: 0;
1285}
1286
1287.conversation-entry.level-1 {
1288 margin-left: 1.5rem;
1289}
1290
1291.conversation-entry.level-2 {
1292 margin-left: 3rem;
1293}
1294
1295.conversation-entry.level-3 {
1296 margin-left: 4.5rem;
1297}
1298
1299.conversation-entry.level-4 {
1300 margin-left: 6rem;
1301}
1302
1303.entry-connector {
1304 width: 3px;
1305 background-color: var(--secondary-color);
1306 margin-right: 0.75rem;
1307 margin-top: 0.25rem;
1308 min-height: 2rem;
1309 border-radius: 2px;
1310 opacity: 0.6;
1311}
1312
1313.conversation-entry.level-0 .entry-connector {
1314 background-color: var(--accent-color);
1315 opacity: 0.8;
1316}
1317
1318.entry-content {
1319 flex: 1;
1320 background-color: var(--surface);
1321 padding: 0.75rem;
1322 border-radius: 6px;
1323 border: 1px solid var(--border-color);
1324 transition: all 0.2s ease;
1325}
1326
1327.entry-content:hover {
1328 border-color: var(--secondary-color);
1329 box-shadow: 0 2px 8px rgba(52, 152, 219, 0.1);
1330}
1331
1332/* Reference Indicators */
1333.reference-indicators {
1334 display: inline-flex;
1335 gap: 0.25rem;
1336 margin-left: 0.5rem;
1337}
1338
1339.ref-out, .ref-in {
1340 display: inline-block;
1341 width: 1rem;
1342 height: 1rem;
1343 border-radius: 50%;
1344 text-align: center;
1345 line-height: 1rem;
1346 font-size: 0.7rem;
1347 font-weight: bold;
1348}
1349
1350.ref-out {
1351 background-color: #e8f5e8;
1352 color: #2d8f2d;
1353}
1354
1355.ref-in {
1356 background-color: #e8f0ff;
1357 color: #1f5fbf;
1358}
1359
1360/* Reference Badges for Individual Posts */
1361.timeline-entry.with-references {
1362 background-color: var(--surface);
1363}
1364
1365/* Conversation posts in unified timeline */
1366.timeline-entry.conversation-post {
1367 background: transparent;
1368 border: none;
1369 margin-bottom: 0.5rem;
1370 padding: 0.5rem 0.75rem;
1371}
1372
1373.timeline-entry.conversation-post.level-0 {
1374 margin-left: 0;
1375 border-left: 2px solid var(--accent-color);
1376 padding-left: 0.75rem;
1377}
1378
1379.timeline-entry.conversation-post.level-1 {
1380 margin-left: 1.5rem;
1381 border-left: 2px solid var(--secondary-color);
1382 padding-left: 0.75rem;
1383}
1384
1385.timeline-entry.conversation-post.level-2 {
1386 margin-left: 3rem;
1387 border-left: 2px solid var(--text-secondary);
1388 padding-left: 0.75rem;
1389}
1390
1391.timeline-entry.conversation-post.level-3 {
1392 margin-left: 4.5rem;
1393 border-left: 2px solid var(--text-secondary);
1394 padding-left: 0.75rem;
1395}
1396
1397.timeline-entry.conversation-post.level-4 {
1398 margin-left: 6rem;
1399 border-left: 2px solid var(--text-secondary);
1400 padding-left: 0.75rem;
1401}
1402
1403/* Cross-thread linking */
1404.cross-thread-links {
1405 margin-top: 0.5rem;
1406 padding-top: 0.5rem;
1407 border-top: 1px solid var(--border-color);
1408}
1409
1410.cross-thread-indicator {
1411 font-size: 0.75rem;
1412 color: var(--text-secondary);
1413 background-color: var(--surface);
1414 padding: 0.25rem 0.5rem;
1415 border-radius: 12px;
1416 border: 1px solid var(--border-color);
1417 display: inline-block;
1418}
1419
1420/* Inline shared references styling */
1421.inline-shared-refs {
1422 margin-left: 0.5rem;
1423 font-size: 0.85rem;
1424 color: var(--text-secondary);
1425}
1426
1427.shared-ref-link {
1428 color: var(--primary-color);
1429 text-decoration: none;
1430 font-weight: 500;
1431 transition: color 0.2s ease;
1432}
1433
1434.shared-ref-link:hover {
1435 color: var(--secondary-color);
1436 text-decoration: underline;
1437}
1438
1439.shared-ref-more {
1440 font-style: italic;
1441 color: var(--text-secondary);
1442 font-size: 0.8rem;
1443 margin-left: 0.25rem;
1444}
1445
1446.user-anchor, .post-anchor {
1447 position: absolute;
1448 margin-top: -60px; /* Offset for fixed header */
1449 pointer-events: none;
1450}
1451
1452.cross-thread-link {
1453 color: var(--primary-color);
1454 text-decoration: none;
1455 font-weight: 500;
1456 transition: color 0.2s ease;
1457}
1458
1459.cross-thread-link:hover {
1460 color: var(--secondary-color);
1461 text-decoration: underline;
1462}
1463
1464.reference-badges {
1465 display: flex;
1466 gap: 0.25rem;
1467 margin-left: 0.5rem;
1468 flex-wrap: wrap;
1469}
1470
1471.ref-badge {
1472 display: inline-block;
1473 padding: 0.1rem 0.4rem;
1474 border-radius: 12px;
1475 font-size: 0.7rem;
1476 font-weight: 600;
1477 text-transform: uppercase;
1478 letter-spacing: 0.05em;
1479}
1480
1481.ref-badge.ref-outbound {
1482 background-color: #e8f5e8;
1483 color: #2d8f2d;
1484 border: 1px solid #c3e6c3;
1485}
1486
1487.ref-badge.ref-inbound {
1488 background-color: #e8f0ff;
1489 color: #1f5fbf;
1490 border: 1px solid #b3d9ff;
1491}
1492
1493/* Author Color Coding */
1494.timeline-author {
1495 position: relative;
1496}
1497
1498.timeline-author::before {
1499 content: '';
1500 display: inline-block;
1501 width: 8px;
1502 height: 8px;
1503 border-radius: 50%;
1504 margin-right: 0.5rem;
1505 background-color: var(--secondary-color);
1506}
1507
1508/* Generate consistent colors for authors */
1509.author-avsm::before { background-color: #e74c3c; }
1510.author-mort::before { background-color: #3498db; }
1511.author-mte::before { background-color: #2ecc71; }
1512.author-ryan::before { background-color: #f39c12; }
1513.author-mwd::before { background-color: #9b59b6; }
1514.author-dra::before { background-color: #1abc9c; }
1515.author-pf341::before { background-color: #34495e; }
1516.author-sadiqj::before { background-color: #e67e22; }
1517.author-martinkl::before { background-color: #8e44ad; }
1518.author-jonsterling::before { background-color: #27ae60; }
1519.author-jon::before { background-color: #f1c40f; }
1520.author-onkar::before { background-color: #e91e63; }
1521.author-gabriel::before { background-color: #00bcd4; }
1522.author-jess::before { background-color: #ff5722; }
1523.author-ibrahim::before { background-color: #607d8b; }
1524.author-andres::before { background-color: #795548; }
1525.author-eeg::before { background-color: #ff9800; }
1526
1527/* Section Headers */
1528.conversations-section h3,
1529.referenced-posts-section h3,
1530.individual-posts-section h3 {
1531 border-bottom: 2px solid var(--border-color);
1532 padding-bottom: 0.5rem;
1533 margin-bottom: 1.5rem;
1534 position: relative;
1535}
1536
1537.conversations-section h3::before {
1538 content: "💬";
1539 margin-right: 0.5rem;
1540}
1541
1542.referenced-posts-section h3::before {
1543 content: "🔗";
1544 margin-right: 0.5rem;
1545}
1546
1547.individual-posts-section h3::before {
1548 content: "📝";
1549 margin-right: 0.5rem;
1550}
1551
1552/* Legacy thread styles (for backward compatibility) */
1553.thread {
1554 background-color: var(--background);
1555 border: 1px solid var(--border-color);
1556 padding: 0;
1557 overflow: hidden;
1558 margin-bottom: 1rem;
1559}
1560
1561.thread-header {
1562 background-color: var(--surface);
1563 padding: 0.5rem 0.75rem;
1564 border-bottom: 1px solid var(--border-color);
1565}
1566
1567.thread-count {
1568 font-weight: 600;
1569 color: var(--secondary-color);
1570}
1571
1572.thread-entry {
1573 padding: 0.5rem 0.75rem;
1574 border-bottom: 1px solid var(--border-color);
1575}
1576
1577.thread-entry:last-child {
1578 border-bottom: none;
1579}
1580
1581.thread-entry.reply {
1582 margin-left: var(--thread-indent);
1583 border-left: 3px solid var(--secondary-color);
1584 background-color: var(--surface);
1585}
1586
1587/* Links Section */
1588.link-group {
1589 background-color: var(--background);
1590}
1591
1592.link-url {
1593 font-size: 1rem;
1594 word-break: break-word;
1595}
1596
1597.link-url a {
1598 color: var(--secondary-color);
1599 text-decoration: none;
1600}
1601
1602.link-url a:hover {
1603 text-decoration: underline;
1604}
1605
1606.target-user {
1607 font-size: 0.9rem;
1608 color: var(--text-secondary);
1609 font-weight: normal;
1610}
1611
1612.referencing-entries {
1613 margin-top: 0.75rem;
1614}
1615
1616.ref-count {
1617 font-weight: 600;
1618 color: var(--text-secondary);
1619 font-size: 0.9rem;
1620}
1621
1622.referencing-entries ul {
1623 list-style: none;
1624 margin-top: 0.5rem;
1625 padding-left: 1rem;
1626}
1627
1628.referencing-entries li {
1629 margin-bottom: 0.25rem;
1630 font-size: 0.9rem;
1631}
1632
1633.referencing-entries .more {
1634 font-style: italic;
1635 color: var(--text-secondary);
1636}
1637
1638/* Users Section */
1639.user-card {
1640 background-color: var(--background);
1641}
1642
1643.user-header {
1644 display: flex;
1645 gap: 1rem;
1646 align-items: start;
1647 margin-bottom: 1rem;
1648}
1649
1650.user-icon {
1651 width: 48px;
1652 height: 48px;
1653 border-radius: 50%;
1654 object-fit: cover;
1655}
1656
1657.user-info h3 {
1658 margin-bottom: 0.25rem;
1659}
1660
1661.username {
1662 font-size: 0.9rem;
1663 color: var(--text-secondary);
1664 font-weight: normal;
1665}
1666
1667.user-meta {
1668 font-size: 0.9rem;
1669 color: var(--text-secondary);
1670}
1671
1672.user-meta a {
1673 color: var(--secondary-color);
1674 text-decoration: none;
1675}
1676
1677.user-meta a:hover {
1678 text-decoration: underline;
1679}
1680
1681.separator {
1682 margin: 0 0.5rem;
1683}
1684
1685.post-count {
1686 font-weight: 600;
1687}
1688
1689.user-recent h4 {
1690 font-size: 0.95rem;
1691 margin-bottom: 0.5rem;
1692 color: var(--text-secondary);
1693}
1694
1695.user-recent ul {
1696 list-style: none;
1697 padding-left: 0;
1698}
1699
1700.user-recent li {
1701 margin-bottom: 0.25rem;
1702 font-size: 0.9rem;
1703}
1704
1705/* Footer */
1706.site-footer {
1707 max-width: var(--max-width);
1708 margin: 3rem auto 2rem;
1709 padding: 1rem 2rem;
1710 text-align: center;
1711 color: var(--text-secondary);
1712 font-size: 0.85rem;
1713 border-top: 1px solid var(--border-color);
1714}
1715
1716.site-footer a {
1717 color: var(--secondary-color);
1718 text-decoration: none;
1719}
1720
1721.site-footer a:hover {
1722 text-decoration: underline;
1723}
1724
1725/* Responsive */
1726@media (max-width: 768px) {
1727 .site-title {
1728 font-size: 1.3rem;
1729 }
1730
1731 .header-content {
1732 flex-direction: column;
1733 gap: 0.75rem;
1734 align-items: flex-start;
1735 }
1736
1737 .site-nav {
1738 gap: 1rem;
1739 }
1740
1741 .main-content {
1742 padding: 0 1rem;
1743 }
1744
1745 .thread-entry.reply {
1746 margin-left: calc(var(--thread-indent) / 2);
1747 }
1748
1749 .user-header {
1750 flex-direction: column;
1751 }
1752}
1753</file>
1754
1755<file path="src/thicket/templates/timeline.html">
{% extends "base.html" %}

{% block page_title %}Timeline - {{ title }}{% endblock %}

{% block content %}
{# seen_users tracks which authors have already received a #username anchor,
   so only the first appearance of each user on the page emits one. #}
{% set seen_users = [] %}
<div class="page-content">
    <h2>Recent Posts & Conversations</h2>

    <section class="unified-timeline">
        {% for item in timeline_items %}
        {% if item.type == "post" %}
        <!-- Individual Post -->
        <article class="timeline-entry {% if item.content.references %}with-references{% endif %}">
            <div class="timeline-meta">
                <time datetime="{{ item.content.entry.updated or item.content.entry.published }}" class="timeline-time">
                    {{ (item.content.entry.updated or item.content.entry.published).strftime('%Y-%m-%d %H:%M') }}
                </time>
                {% set homepage = get_user_homepage(item.content.username) %}
                {% if item.content.username not in seen_users %}
                <a id="{{ item.content.username }}" class="user-anchor"></a>
                {% set _ = seen_users.append(item.content.username) %}
                {% endif %}
                <a id="post-{{ loop.index0 }}-{{ safe_anchor_id(item.content.entry.id) }}" class="post-anchor"></a>
                {# author-<username> matches the per-author colour classes in
                   style.css; the thread branch below applies it, so individual
                   posts carry it too for a consistent colour dot. #}
                {% if homepage %}
                <a href="{{ homepage }}" target="_blank" class="timeline-author author-{{ item.content.username }}">{{ item.content.display_name }}</a>
                {% else %}
                <span class="timeline-author author-{{ item.content.username }}">{{ item.content.display_name }}</span>
                {% endif %}
                {% if item.content.references %}
                <div class="reference-badges">
                    {% for ref in item.content.references %}
                    {% if ref.type == 'outbound' %}
                    <span class="ref-badge ref-outbound" title="References {{ ref.target_username or 'external post' }}">
                        → {{ ref.target_username or 'ext' }}
                    </span>
                    {% elif ref.type == 'inbound' %}
                    <span class="ref-badge ref-inbound" title="Referenced by {{ ref.source_username or 'external post' }}">
                        ← {{ ref.source_username or 'ext' }}
                    </span>
                    {% endif %}
                    {% endfor %}
                </div>
                {% endif %}
            </div>
            <div class="timeline-content">
                <strong class="timeline-title">
                    <a href="{{ item.content.entry.link }}" target="_blank">{{ item.content.entry.title }}</a>
                </strong>
                {% if item.content.entry.summary %}
                <span class="timeline-summary">— {{ clean_html_summary(item.content.entry.summary, 250) }}</span>
                {% endif %}
                {% if item.content.shared_references %}
                <span class="inline-shared-refs">
                    {% for ref in item.content.shared_references[:3] %}
                    {% if ref.target_username %}
                    <a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %}
                    {% endif %}
                    {% endfor %}
                    {% if item.content.shared_references|length > 3 %}
                    <span class="shared-ref-more">+{{ item.content.shared_references|length - 3 }} more</span>
                    {% endif %}
                </span>
                {% endif %}
                {% if item.content.cross_thread_links %}
                <div class="cross-thread-links">
                    <span class="cross-thread-indicator">🔗 Also appears: </span>
                    {% for link in item.content.cross_thread_links %}
                    <a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %}
                    {% endfor %}
                </div>
                {% endif %}
            </div>
        </article>

        {% elif item.type == "thread" %}
        <!-- Conversation Thread -->
        {# Capture the outer loop index: inside the thread loop, `loop`
           refers to the inner loop, but post anchors need both indices. #}
        {% set outer_loop_index = loop.index0 %}
        {% for thread_item in item.content %}
        <article class="timeline-entry conversation-post level-{{ thread_item.thread_level }}">
            <div class="timeline-meta">
                <time datetime="{{ thread_item.entry.updated or thread_item.entry.published }}" class="timeline-time">
                    {{ (thread_item.entry.updated or thread_item.entry.published).strftime('%Y-%m-%d %H:%M') }}
                </time>
                {% set homepage = get_user_homepage(thread_item.username) %}
                {% if thread_item.username not in seen_users %}
                <a id="{{ thread_item.username }}" class="user-anchor"></a>
                {% set _ = seen_users.append(thread_item.username) %}
                {% endif %}
                <a id="post-{{ outer_loop_index }}-{{ loop.index0 }}-{{ safe_anchor_id(thread_item.entry.id) }}" class="post-anchor"></a>
                {% if homepage %}
                <a href="{{ homepage }}" target="_blank" class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</a>
                {% else %}
                <span class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</span>
                {% endif %}
                {% if thread_item.references_to or thread_item.referenced_by %}
                <span class="reference-indicators">
                    {% if thread_item.references_to %}
                    <span class="ref-out" title="References other posts">→</span>
                    {% endif %}
                    {% if thread_item.referenced_by %}
                    <span class="ref-in" title="Referenced by other posts">←</span>
                    {% endif %}
                </span>
                {% endif %}
            </div>
            <div class="timeline-content">
                <strong class="timeline-title">
                    <a href="{{ thread_item.entry.link }}" target="_blank">{{ thread_item.entry.title }}</a>
                </strong>
                {% if thread_item.entry.summary %}
                <span class="timeline-summary">— {{ clean_html_summary(thread_item.entry.summary, 300) }}</span>
                {% endif %}
                {% if thread_item.shared_references %}
                <span class="inline-shared-refs">
                    {% for ref in thread_item.shared_references[:3] %}
                    {% if ref.target_username %}
                    <a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %}
                    {% endif %}
                    {% endfor %}
                    {% if thread_item.shared_references|length > 3 %}
                    <span class="shared-ref-more">+{{ thread_item.shared_references|length - 3 }} more</span>
                    {% endif %}
                </span>
                {% endif %}
                {% if thread_item.cross_thread_links %}
                <div class="cross-thread-links">
                    <span class="cross-thread-indicator">🔗 Also appears: </span>
                    {% for link in thread_item.cross_thread_links %}
                    <a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %}
                    {% endfor %}
                </div>
                {% endif %}
            </div>
        </article>
        {% endfor %}
        {% endif %}
        {% endfor %}
    </section>
</div>
{% endblock %}
1897</file>
1898
1899<file path="src/thicket/templates/users.html">
{% extends "base.html" %}

{% block page_title %}Users - {{ title }}{% endblock %}

{% block content %}
<div class="page-content">
    <h2>Users</h2>
    <p class="page-description">All users contributing to this thicket, ordered by post count.</p>

    {% for user_info in users %}
    <article class="user-card">
        <div class="user-header">
            {# Icon is serialized as a string; "None" means the feed had no icon. #}
            {% if user_info.metadata.icon and user_info.metadata.icon != "None" %}
            <img src="{{ user_info.metadata.icon }}" alt="{{ user_info.metadata.username }}" class="user-icon">
            {% endif %}
            <div class="user-info">
                <h3>
                    {% if user_info.metadata.display_name %}
                    {{ user_info.metadata.display_name }}
                    <span class="username">({{ user_info.metadata.username }})</span>
                    {% else %}
                    {{ user_info.metadata.username }}
                    {% endif %}
                </h3>
                <div class="user-meta">
                    {% if user_info.metadata.homepage %}
                    <a href="{{ user_info.metadata.homepage }}" target="_blank">{{ user_info.metadata.homepage }}</a>
                    {% endif %}
                    {% if user_info.metadata.email %}
                    {# Separator only when a homepage precedes the email,
                       so users without a homepage get no leading bullet. #}
                    {% if user_info.metadata.homepage %}
                    <span class="separator">•</span>
                    {% endif %}
                    <a href="mailto:{{ user_info.metadata.email }}">{{ user_info.metadata.email }}</a>
                    {% endif %}
                    {# Separator only when something precedes the post count. #}
                    {% if user_info.metadata.homepage or user_info.metadata.email %}
                    <span class="separator">•</span>
                    {% endif %}
                    <span class="post-count">{{ user_info.metadata.entry_count }} posts</span>
                </div>
            </div>
        </div>

        {% if user_info.recent_entries %}
        <div class="user-recent">
            <h4>Recent posts:</h4>
            <ul>
                {% for display_name, entry in user_info.recent_entries %}
                <li>
                    <a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a>
                    <time datetime="{{ entry.updated or entry.published }}">
                        ({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }})
                    </time>
                </li>
                {% endfor %}
            </ul>
        </div>
        {% endif %}
    </article>
    {% endfor %}
</div>
{% endblock %}
1957</file>
1958
1959<file path="README.md">
1960# Thicket
1961
A modern CLI tool for persisting Atom/RSS feeds in Git repositories, designed to enable distributed weblog comment structures.
1963
1964## Features
1965
1966- **Feed Auto-Discovery**: Automatically extracts user metadata from Atom/RSS feeds
1967- **Git Storage**: Stores feed entries in a Git repository with full history
1968- **Duplicate Management**: Manual curation of duplicate entries across feeds
1969- **Modern CLI**: Built with Typer and Rich for beautiful terminal output
1970- **Comprehensive Parsing**: Supports RSS 0.9x, RSS 1.0, RSS 2.0, and Atom feeds
1971- **Cron-Friendly**: Designed for scheduled execution
1972
1973## Installation
1974
1975```bash
1976# Install from source
1977pip install -e .
1978
1979# Or install with dev dependencies
1980pip install -e .[dev]
1981```
1982
1983## Quick Start
1984
19851. **Initialize a new thicket repository:**
1986```bash
1987thicket init ./my-feeds
1988```
1989
19902. **Add a user with their feed:**
1991```bash
1992thicket add user "alice" --feed "https://alice.example.com/feed.xml"
1993```
1994
19953. **Sync feeds to download entries:**
1996```bash
1997thicket sync --all
1998```
1999
20004. **List users and feeds:**
2001```bash
2002thicket list users
2003thicket list feeds
2004thicket list entries
2005```
2006
2007## Commands
2008
2009### Initialize
2010```bash
2011thicket init <git-store-path> [--cache-dir <path>] [--config <config-file>]
2012```
2013
2014### Add Users and Feeds
2015```bash
2016# Add user with auto-discovery
2017thicket add user "username" --feed "https://example.com/feed.xml"
2018
2019# Add user with manual metadata
2020thicket add user "username" \
2021 --feed "https://example.com/feed.xml" \
2022 --email "user@example.com" \
2023 --homepage "https://example.com" \
2024 --display-name "User Name"
2025
2026# Add additional feed to existing user
2027thicket add feed "username" "https://example.com/other-feed.xml"
2028```
2029
2030### Sync Feeds
2031```bash
2032# Sync all users
2033thicket sync --all
2034
2035# Sync specific user
2036thicket sync --user "username"
2037
2038# Dry run (preview changes)
2039thicket sync --all --dry-run
2040```
2041
2042### List Information
2043```bash
2044# List all users
2045thicket list users
2046
2047# List all feeds
2048thicket list feeds
2049
2050# List feeds for specific user
2051thicket list feeds --user "username"
2052
2053# List recent entries
2054thicket list entries --limit 20
2055
2056# List entries for specific user
2057thicket list entries --user "username"
2058```
2059
2060### Manage Duplicates
2061```bash
2062# List duplicate mappings
2063thicket duplicates list
2064
2065# Mark entries as duplicates
2066thicket duplicates add "https://example.com/dup" "https://example.com/canonical"
2067
2068# Remove duplicate mapping
2069thicket duplicates remove "https://example.com/dup"
2070```
2071
2072## Configuration
2073
2074Thicket uses a YAML configuration file (default: `thicket.yaml`):
2075
2076```yaml
2077git_store: ./feeds-repo
2078cache_dir: ~/.cache/thicket
2079users:
2080 - username: alice
2081 feeds:
2082 - https://alice.example.com/feed.xml
2083 email: alice@example.com
2084 homepage: https://alice.example.com
2085 display_name: Alice
2086```
2087
2088## Git Repository Structure
2089
2090```
2091feeds-repo/
2092├── index.json # User directory index
2093├── duplicates.json # Duplicate entry mappings
2094├── alice/
2095│ ├── metadata.json # User metadata
2096│ ├── entry_id_1.json # Feed entries
2097│ └── entry_id_2.json
2098└── bob/
2099 └── ...
2100```
2101
2102## Development
2103
2104### Setup
2105```bash
2106# Install in development mode
2107pip install -e .[dev]
2108
2109# Run tests
2110pytest
2111
2112# Run linting
2113ruff check src/
2114black --check src/
2115
2116# Run type checking
2117mypy src/
2118```
2119
2120### Architecture
2121
2122- **CLI**: Modern interface with Typer and Rich
2123- **Feed Processing**: Universal parsing with feedparser
2124- **Git Storage**: Structured storage with GitPython
2125- **Data Models**: Pydantic for validation and serialization
2126- **Async HTTP**: httpx for efficient feed fetching
2127
2128## Use Cases
2129
2130- **Blog Aggregation**: Collect and archive blog posts from multiple sources
2131- **Comment Networks**: Enable distributed commenting systems
2132- **Feed Archival**: Preserve feed history beyond typical feed depth limits
2133- **Content Curation**: Manage and deduplicate content across feeds
2134
2135## License
2136
2137MIT License - see LICENSE file for details.
2138</file>
2139
2140<file path="src/thicket/cli/commands/index_cmd.py">
2141"""CLI command for building reference index from blog entries."""
2142
2143import json
2144from pathlib import Path
2145from typing import Optional
2146
2147import typer
2148from rich.console import Console
2149from rich.progress import (
2150 BarColumn,
2151 Progress,
2152 SpinnerColumn,
2153 TaskProgressColumn,
2154 TextColumn,
2155)
2156from rich.table import Table
2157
2158from ...core.git_store import GitStore
2159from ...core.reference_parser import ReferenceIndex, ReferenceParser
2160from ..main import app
2161from ..utils import get_tsv_mode, load_config
2162
2163console = Console()
2164
2165
@app.command()
def index(
    config_file: Optional[Path] = typer.Option(
        None,
        "--config",
        "-c",
        help="Path to configuration file",
    ),
    output_file: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Path to output index file (default: updates links.json in git store)",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Show detailed progress information",
    ),
) -> None:
    """Build a reference index showing which blog entries reference others.

    This command analyzes all blog entries to detect cross-references between
    different blogs, creating an index that can be used to build threaded
    views of related content.

    Updates the unified links.json file with reference data.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Initialize Git store
        git_store = GitStore(config.git_store)

        # Initialize reference parser
        parser = ReferenceParser()

        # Build user domain mapping (username -> domains that user publishes on)
        if verbose:
            console.print("Building user domain mapping...")
        user_domains = parser.build_user_domain_mapping(git_store)

        if verbose:
            console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")

        # Initialize reference index
        ref_index = ReferenceIndex()
        ref_index.user_domains = user_domains

        # Get all users (local renamed from `index` so it does not shadow this command)
        store_index = git_store._load_index()
        users = list(store_index.users.keys())

        if not users:
            console.print("[yellow]No users found in Git store[/yellow]")
            raise typer.Exit(0)

        # Process all entries
        total_entries = 0
        total_references = 0
        all_references = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:

            # Count total entries first so the extraction progress bar has a total
            counting_task = progress.add_task("Counting entries...", total=len(users))
            for username in users:
                total_entries += len(git_store.list_entries(username))
                progress.advance(counting_task)

            progress.remove_task(counting_task)

            # Process entries - extract references
            processing_task = progress.add_task(
                f"Extracting references from {total_entries} entries...",
                total=total_entries
            )

            for username in users:
                entries = git_store.list_entries(username)

                for entry in entries:
                    # Extract references from this entry
                    references = parser.extract_references(entry, username, user_domains)
                    all_references.extend(references)

                    progress.advance(processing_task)

                    if verbose and references:
                        console.print(f"  Found {len(references)} references in {username}:{entry.title[:50]}...")

            progress.remove_task(processing_task)

            # Resolve target_entry_ids for references
            if all_references:
                resolve_task = progress.add_task(
                    f"Resolving {len(all_references)} references...",
                    total=len(all_references)
                )

                if verbose:
                    console.print(f"Resolving target entry IDs for {len(all_references)} references...")

                resolved_references = parser.resolve_target_entry_ids(all_references, git_store)

                # Count resolved references
                resolved_count = sum(1 for ref in resolved_references if ref.target_entry_id is not None)
                if verbose:
                    console.print(f"Resolved {resolved_count} out of {len(all_references)} references")

                # Add resolved references to index
                for ref in resolved_references:
                    ref_index.add_reference(ref)
                    total_references += 1
                    progress.advance(resolve_task)

                progress.remove_task(resolve_task)

        # Determine output path
        if output_file:
            output_path = output_file
        else:
            output_path = config.git_store / "links.json"

        # Load existing links data or create new structure. An explicit
        # --output file is always rewritten from scratch.
        if output_path.exists() and not output_file:
            # Load existing unified structure
            with open(output_path) as f:
                existing_data = json.load(f)
        else:
            # Create new structure
            existing_data = {
                "links": {},
                "reverse_mapping": {},
                "user_domains": {}
            }

        # Update with reference data
        existing_data["references"] = ref_index.to_dict()["references"]
        existing_data["user_domains"] = {k: list(v) for k, v in user_domains.items()}

        # Save updated structure
        with open(output_path, "w") as f:
            json.dump(existing_data, f, indent=2, default=str)

        # Show summary
        if not get_tsv_mode():
            console.print("\n[green]✓ Reference index built successfully[/green]")

        # Create summary table or TSV output
        if get_tsv_mode():
            print("Metric\tCount")
            print(f"Total Users\t{len(users)}")
            print(f"Total Entries\t{total_entries}")
            print(f"Total References\t{total_references}")
            print(f"Outbound Refs\t{len(ref_index.outbound_refs)}")
            print(f"Inbound Refs\t{len(ref_index.inbound_refs)}")
            print(f"Output File\t{output_path}")
        else:
            table = Table(title="Reference Index Summary")
            table.add_column("Metric", style="cyan")
            table.add_column("Count", style="green")

            table.add_row("Total Users", str(len(users)))
            table.add_row("Total Entries", str(total_entries))
            table.add_row("Total References", str(total_references))
            table.add_row("Outbound Refs", str(len(ref_index.outbound_refs)))
            table.add_row("Inbound Refs", str(len(ref_index.inbound_refs)))
            table.add_row("Output File", str(output_path))

            console.print(table)

        # Show some interesting statistics
        if total_references > 0:
            if not get_tsv_mode():
                console.print("\n[bold]Reference Statistics:[/bold]")

            # Local import: only needed for this statistics section
            from urllib.parse import urlparse

            # Most referenced users
            target_counts = {}
            unresolved_domains = set()

            for ref in ref_index.references:
                if ref.target_username:
                    target_counts[ref.target_username] = target_counts.get(ref.target_username, 0) + 1
                else:
                    # Track unresolved domains
                    domain = urlparse(ref.target_url).netloc.lower()
                    unresolved_domains.add(domain)

            if target_counts:
                if get_tsv_mode():
                    print("Referenced User\tReference Count")
                    for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
                        print(f"{username}\t{count}")
                else:
                    console.print("\nMost referenced users:")
                    for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
                        console.print(f"  {username}: {count} references")

            if unresolved_domains and verbose:
                # Sort before slicing so the displayed sample is deterministic
                # (previously sliced the unordered set first, then sorted).
                sample_domains = sorted(unresolved_domains)[:10]
                if get_tsv_mode():
                    print("Unresolved Domain\tCount")
                    for domain in sample_domains:
                        print(f"{domain}\t1")
                    if len(unresolved_domains) > 10:
                        print(f"... and {len(unresolved_domains) - 10} more\t...")
                else:
                    console.print(f"\nUnresolved domains: {len(unresolved_domains)}")
                    for domain in sample_domains:
                        console.print(f"  {domain}")
                    if len(unresolved_domains) > 10:
                        console.print(f"  ... and {len(unresolved_domains) - 10} more")

    except typer.Exit:
        # typer.Exit subclasses RuntimeError (via click), so the generic
        # handler below would otherwise report intentional exits (e.g.
        # "no users found") as errors. Re-raise untouched.
        raise
    except Exception as e:
        console.print(f"[red]Error building reference index: {e}[/red]")
        if verbose:
            console.print_exception()
        raise typer.Exit(1) from e
2396
2397
@app.command()
def threads(
    config_file: Optional[Path] = typer.Option(
        None,
        "--config",
        "-c",
        help="Path to configuration file",
    ),
    index_file: Optional[Path] = typer.Option(
        None,
        "--index",
        "-i",
        help="Path to reference index file (default: links.json in git store)",
    ),
    username: Optional[str] = typer.Option(
        None,
        "--username",
        "-u",
        help="Show threads for specific username only",
    ),
    entry_id: Optional[str] = typer.Option(
        None,
        "--entry",
        "-e",
        help="Show thread for specific entry ID",
    ),
    min_size: int = typer.Option(
        2,
        "--min-size",
        "-m",
        help="Minimum thread size to display",
    ),
) -> None:
    """Show threaded view of related blog entries.

    This command uses the reference index to show which blog entries
    are connected through cross-references, creating an email-style
    threaded view of the conversation.

    Reads reference data from the unified links.json file.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Determine index file path
        if index_file:
            index_path = index_file
        else:
            index_path = config.git_store / "links.json"

        if not index_path.exists():
            console.print(f"[red]Links file not found: {index_path}[/red]")
            console.print("Run 'thicket links' and 'thicket index' first to build the reference index")
            raise typer.Exit(1)

        # Load unified data
        with open(index_path) as f:
            unified_data = json.load(f)

        # Check if references exist in the unified structure
        if "references" not in unified_data:
            console.print(f"[red]No references found in {index_path}[/red]")
            console.print("Run 'thicket index' first to build the reference index")
            raise typer.Exit(1)

        # Extract reference data and reconstruct ReferenceIndex
        ref_index = ReferenceIndex.from_dict({
            "references": unified_data["references"],
            "user_domains": unified_data.get("user_domains", {})
        })

        # Initialize Git store to get entry details
        git_store = GitStore(config.git_store)

        if entry_id and username:
            # Show the single thread containing this specific entry
            thread_members = ref_index.get_thread_members(username, entry_id)
            _display_thread(thread_members, ref_index, git_store, f"Thread for {username}:{entry_id}")

        elif username:
            # Show all threads involving this user
            user_index = git_store._load_index()
            user = user_index.get_user(username)
            if not user:
                console.print(f"[red]User not found: {username}[/red]")
                raise typer.Exit(1)

            entries = git_store.list_entries(username)
            threads_found = set()

            console.print(f"[bold]Threads involving {username}:[/bold]\n")

            for entry in entries:
                thread_members = ref_index.get_thread_members(username, entry.id)
                if len(thread_members) >= min_size:
                    # Deduplicate: several entries can belong to the same thread
                    thread_key = tuple(sorted(thread_members))
                    if thread_key not in threads_found:
                        threads_found.add(thread_key)
                        _display_thread(thread_members, ref_index, git_store, f"Thread #{len(threads_found)}")

        else:
            # Show all threads across every user
            console.print("[bold]All conversation threads:[/bold]\n")

            all_threads = set()
            processed_entries = set()

            # Get all entries
            user_index = git_store._load_index()
            for username in user_index.users.keys():
                entries = git_store.list_entries(username)
                for entry in entries:
                    entry_key = (username, entry.id)
                    if entry_key in processed_entries:
                        continue

                    thread_members = ref_index.get_thread_members(username, entry.id)
                    if len(thread_members) >= min_size:
                        thread_key = tuple(sorted(thread_members))
                        if thread_key not in all_threads:
                            all_threads.add(thread_key)
                            _display_thread(thread_members, ref_index, git_store, f"Thread #{len(all_threads)}")

                            # Mark all members as processed
                            for member in thread_members:
                                processed_entries.add(member)

            if not all_threads:
                console.print("[yellow]No conversation threads found[/yellow]")
                console.print(f"(minimum thread size: {min_size})")

    except typer.Exit:
        # typer.Exit subclasses RuntimeError (via click); without this clause
        # the generic handler below would report the intentional exits above
        # ("links file not found", "user not found", ...) as errors.
        raise
    except Exception as e:
        console.print(f"[red]Error showing threads: {e}[/red]")
        raise typer.Exit(1) from e
2533
2534
def _display_thread(thread_members, ref_index, git_store, title):
    """Render one conversation thread to the console.

    Resolves each (username, entry_id) member against the git store, orders
    the surviving entries chronologically, and prints a tree-style listing
    with per-entry reference counts.
    """
    console.print(f"[bold cyan]{title}[/bold cyan]")
    console.print(f"Thread size: {len(thread_members)} entries")

    # Resolve members to entries, silently dropping any that no longer exist.
    resolved = []
    for member_user, member_id in thread_members:
        member_entry = git_store.get_entry(member_user, member_id)
        if member_entry:
            resolved.append((member_user, member_entry))

    # Chronological order; fall back to the update time when unpublished.
    ordered = sorted(resolved, key=lambda pair: pair[1].published or pair[1].updated)

    last = len(ordered) - 1
    for position, (user, entry) in enumerate(ordered):
        prefix = "└─" if position == last else "├─"

        # Per-entry reference counts for the summary suffix
        outbound = ref_index.get_outbound_refs(user, entry.id)
        inbound = ref_index.get_inbound_refs(user, entry.id)
        ref_info = f" ({len(outbound)} out, {len(inbound)} in)" if outbound or inbound else ""

        console.print(f"  {prefix} [{user}] {entry.title[:60]}...{ref_info}")

        if entry.published:
            console.print(f"     Published: {entry.published.strftime('%Y-%m-%d')}")

    console.print()  # Empty line after each thread
2568</file>
2569
2570<file path="src/thicket/cli/commands/info_cmd.py">
2571"""CLI command for displaying detailed information about a specific atom entry."""
2572
2573import json
2574from pathlib import Path
2575from typing import Optional
2576
2577import typer
2578from rich.console import Console
2579from rich.panel import Panel
2580from rich.table import Table
2581from rich.text import Text
2582
2583from ...core.git_store import GitStore
2584from ...core.reference_parser import ReferenceIndex
2585from ..main import app
2586from ..utils import load_config, get_tsv_mode
2587
2588console = Console()
2589
2590
@app.command()
def info(
    identifier: str = typer.Argument(
        ...,
        help="The atom ID or URL of the entry to display information about"
    ),
    username: Optional[str] = typer.Option(
        None,
        "--username",
        "-u",
        help="Username to search for the entry (if not provided, searches all users)"
    ),
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"),
        "--config",
        "-c",
        help="Path to configuration file",
    ),
    show_content: bool = typer.Option(
        False,
        "--content",
        help="Include the full content of the entry in the output"
    ),
) -> None:
    """Display detailed information about a specific atom entry.

    You can specify the entry using either its atom ID or URL.
    Shows all metadata for the given entry, including title, dates, categories,
    and summarizes all inbound and outbound links to/from other posts.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Initialize Git store
        git_store = GitStore(config.git_store)

        # Find the entry
        entry = None
        found_username = None

        # Check if identifier looks like a URL
        is_url = identifier.startswith(('http://', 'https://'))

        if username:
            # Search specific username
            if is_url:
                # Search by URL (linear scan over the user's entries)
                for candidate in git_store.list_entries(username):
                    if str(candidate.link) == identifier:
                        entry = candidate
                        found_username = username
                        break
            else:
                # Search by atom ID
                entry = git_store.get_entry(username, identifier)
                if entry:
                    found_username = username
        else:
            # Search all users
            index = git_store._load_index()
            for user in index.users.keys():
                if is_url:
                    # Search by URL
                    for candidate in git_store.list_entries(user):
                        if str(candidate.link) == identifier:
                            entry = candidate
                            found_username = user
                            break
                    if entry:
                        break
                else:
                    # Search by atom ID
                    entry = git_store.get_entry(user, identifier)
                    if entry:
                        found_username = user
                        break

        if not entry or not found_username:
            if username:
                console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found for user '{username}'[/red]")
            else:
                console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found in any user's entries[/red]")
            raise typer.Exit(1)

        # Load reference index if available
        links_path = config.git_store / "links.json"
        ref_index = None
        if links_path.exists():
            with open(links_path) as f:
                unified_data = json.load(f)

            # Check if references exist in the unified structure
            if "references" in unified_data:
                ref_index = ReferenceIndex.from_dict({
                    "references": unified_data["references"],
                    "user_domains": unified_data.get("user_domains", {})
                })

        # Display information
        if get_tsv_mode():
            # TSV mode emits everything (including content) as TSV rows.
            _display_entry_info_tsv(entry, found_username, ref_index, show_content)
        else:
            _display_entry_info(entry, found_username)

            if ref_index:
                _display_link_info(entry, found_username, ref_index)
            else:
                console.print("\n[yellow]No reference index found. Run 'thicket links' and 'thicket index' to build cross-reference data.[/yellow]")

            # Optionally display content. This now lives inside the non-TSV
            # branch: previously it also ran in TSV mode, printing the content
            # a second time as a rich panel and corrupting the TSV output.
            if show_content and entry.content:
                _display_content(entry.content)

    except typer.Exit:
        # typer.Exit subclasses RuntimeError (via click); re-raise so the
        # "not found" exit above is not reported as an error by the handler
        # below.
        raise
    except Exception as e:
        console.print(f"[red]Error displaying entry info: {e}[/red]")
        raise typer.Exit(1) from e
2710
2711
def _display_entry_info(entry, username: str) -> None:
    """Render the entry's metadata as a Rich panel.

    Mandatory fields (user, ID, title, link, updated) are always shown;
    optional fields appear only when present on the entry.
    """
    grid = Table.grid(padding=(0, 2))
    grid.add_column("Field", style="cyan bold", width=15)
    grid.add_column("Value", style="white")

    grid.add_row("User", f"[green]{username}[/green]")
    grid.add_row("Atom ID", f"[blue]{entry.id}[/blue]")
    grid.add_row("Title", entry.title)
    grid.add_row("Link", str(entry.link))

    if entry.published:
        grid.add_row("Published", entry.published.strftime("%Y-%m-%d %H:%M:%S UTC"))

    grid.add_row("Updated", entry.updated.strftime("%Y-%m-%d %H:%M:%S UTC"))

    if entry.summary:
        # Keep the panel compact: cap the summary at 200 characters.
        shown_summary = (
            entry.summary if len(entry.summary) <= 200 else entry.summary[:200] + "..."
        )
        grid.add_row("Summary", shown_summary)

    if entry.categories:
        grid.add_row("Categories", ", ".join(entry.categories))

    if entry.author:
        pieces = []
        if "name" in entry.author:
            pieces.append(entry.author["name"])
        if "email" in entry.author:
            pieces.append(f"<{entry.author['email']}>")
        if pieces:
            grid.add_row("Author", " ".join(pieces))

    if entry.content_type:
        grid.add_row("Content Type", entry.content_type)

    if entry.rights:
        grid.add_row("Rights", entry.rights)

    if entry.source:
        grid.add_row("Source Feed", entry.source)

    console.print(
        Panel(
            grid,
            title="[bold]Entry Information[/bold]",
            border_style="blue",
        )
    )
2764
2765
def _display_link_info(entry, username: str, ref_index: ReferenceIndex) -> None:
    """Print a table of the entry's outbound and inbound cross-references."""
    outbound = ref_index.get_outbound_refs(username, entry.id)
    inbound = ref_index.get_inbound_refs(username, entry.id)

    if not (outbound or inbound):
        console.print("\n[dim]No cross-references found for this entry.[/dim]")
        return

    table = Table(title="Cross-References")
    table.add_column("Direction", style="cyan", width=10)
    table.add_column("Target/Source", style="green", width=20)
    table.add_column("URL", style="blue", width=50)

    # Outbound: where this entry points to ("External" when unresolved)
    for ref in outbound:
        if ref.target_username and ref.target_entry_id:
            target = f"{ref.target_username}:{ref.target_entry_id}"
        else:
            target = "External"
        table.add_row("→ Out", target, ref.target_url)

    # Inbound: entries that point at this one
    for ref in inbound:
        table.add_row("← In", f"{ref.source_username}:{ref.source_entry_id}", ref.target_url)

    console.print()
    console.print(table)

    console.print(f"\n[bold]Summary:[/bold] {len(outbound)} outbound, {len(inbound)} inbound references")
2798
2799
def _display_content(content: str) -> None:
    """Show the entry body in a panel, truncating very long content."""
    limit = 5000
    if len(content) > limit:
        body = content[:limit] + "\n\n[... content truncated ...]"
    else:
        body = content

    console.print()
    console.print(
        Panel(
            body,
            title="[bold]Entry Content[/bold]",
            border_style="green",
            expand=False,
        )
    )
2817
2818
def _tsv_escape(text: str) -> str:
    """Collapse tab/newline/carriage-return to spaces so a value stays in one TSV field."""
    return text.replace("\t", " ").replace("\n", " ").replace("\r", " ")


def _display_entry_info_tsv(entry, username: str, ref_index: Optional[ReferenceIndex], show_content: bool) -> None:
    """Display entry information in TSV format.

    Emits one "Field<TAB>Value" row per populated attribute, followed by
    reference counts/rows when a reference index is available, and the
    (escaped) content when requested.
    """
    # Basic info
    print("Field\tValue")
    print(f"User\t{username}")
    print(f"Atom ID\t{entry.id}")
    # Title is escaped the same way as summary/content (previously used
    # chr(9)/chr(10)/chr(13) while the others used literal escapes).
    print(f"Title\t{_tsv_escape(entry.title)}")
    print(f"Link\t{entry.link}")

    if entry.published:
        print(f"Published\t{entry.published.strftime('%Y-%m-%d %H:%M:%S UTC')}")

    print(f"Updated\t{entry.updated.strftime('%Y-%m-%d %H:%M:%S UTC')}")

    if entry.summary:
        print(f"Summary\t{_tsv_escape(entry.summary)}")

    if entry.categories:
        print(f"Categories\t{', '.join(entry.categories)}")

    if entry.author:
        author_info = []
        if "name" in entry.author:
            author_info.append(entry.author["name"])
        if "email" in entry.author:
            author_info.append(f"<{entry.author['email']}>")
        if author_info:
            print(f"Author\t{' '.join(author_info)}")

    if entry.content_type:
        print(f"Content Type\t{entry.content_type}")

    if entry.rights:
        print(f"Rights\t{entry.rights}")

    if entry.source:
        print(f"Source Feed\t{entry.source}")

    # Add reference info if available
    if ref_index:
        outbound_refs = ref_index.get_outbound_refs(username, entry.id)
        inbound_refs = ref_index.get_inbound_refs(username, entry.id)

        print(f"Outbound References\t{len(outbound_refs)}")
        print(f"Inbound References\t{len(inbound_refs)}")

        # Show each reference ("External" when the target is unresolved)
        for ref in outbound_refs:
            target_info = (
                f"{ref.target_username}:{ref.target_entry_id}"
                if ref.target_username and ref.target_entry_id
                else "External"
            )
            print(f"Outbound Reference\t{target_info}\t{ref.target_url}")

        for ref in inbound_refs:
            source_info = f"{ref.source_username}:{ref.source_entry_id}"
            print(f"Inbound Reference\t{source_info}\t{ref.target_url}")

    # Show content if requested
    if show_content and entry.content:
        print(f"Content\t{_tsv_escape(entry.content)}")
2882</file>
2883
2884<file path="src/thicket/cli/commands/init.py">
2885"""Initialize command for thicket."""
2886
2887from pathlib import Path
2888from typing import Optional
2889
2890import typer
2891from pydantic import ValidationError
2892
2893from ...core.git_store import GitStore
2894from ...models import ThicketConfig
2895from ..main import app
2896from ..utils import print_error, print_success, save_config
2897
2898
@app.command()
def init(
    git_store: Path = typer.Argument(..., help="Path to Git repository for storing feeds"),
    cache_dir: Optional[Path] = typer.Option(
        None, "--cache-dir", "-c", help="Cache directory (default: ~/.cache/thicket)"
    ),
    config_file: Optional[Path] = typer.Option(
        None, "--config", help="Configuration file path (default: thicket.yaml)"
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Overwrite existing configuration"
    ),
) -> None:
    """Initialize a new thicket configuration and Git store."""

    # Resolve defaults that depend on the runtime environment.
    if cache_dir is None:
        from platformdirs import user_cache_dir
        cache_dir = Path(user_cache_dir("thicket"))

    if config_file is None:
        config_file = Path("thicket.yaml")

    # Refuse to clobber an existing configuration unless --force was given.
    if config_file.exists() and not force:
        print_error(f"Configuration file already exists: {config_file}")
        print_error("Use --force to overwrite")
        raise typer.Exit(1)

    # Make sure the cache directory exists.
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Create (or open) the Git-backed store.
    try:
        GitStore(git_store)
    except Exception as e:
        print_error(f"Failed to initialize Git store: {e}")
        raise typer.Exit(1) from e
    print_success(f"Initialized Git store at: {git_store}")

    # Write the initial configuration file.
    try:
        save_config(
            ThicketConfig(git_store=git_store, cache_dir=cache_dir, users=[]),
            config_file,
        )
    except ValidationError as e:
        print_error(f"Invalid configuration: {e}")
        raise typer.Exit(1) from e
    except Exception as e:
        print_error(f"Failed to create configuration: {e}")
        raise typer.Exit(1) from e
    print_success(f"Created configuration file: {config_file}")

    print_success("Thicket initialized successfully!")
    print_success(f"Git store: {git_store}")
    print_success(f"Cache directory: {cache_dir}")
    print_success(f"Configuration: {config_file}")
    print_success("Run 'thicket add user' to add your first user and feed.")
2962</file>
2963
2964<file path="src/thicket/cli/__init__.py">
2965"""CLI interface for thicket."""
2966
2967from .main import app
2968
2969__all__ = ["app"]
2970</file>
2971
2972<file path="src/thicket/core/__init__.py">
2973"""Core business logic for thicket."""
2974
2975from .feed_parser import FeedParser
2976from .git_store import GitStore
2977
2978__all__ = ["FeedParser", "GitStore"]
2979</file>
2980
2981<file path="src/thicket/core/feed_parser.py">
2982"""Feed parsing and normalization with auto-discovery."""
2983
2984from datetime import datetime
2985from typing import Optional
2986from urllib.parse import urlparse
2987
2988import bleach
2989import feedparser
2990import httpx
2991from pydantic import HttpUrl, ValidationError
2992
2993from ..models import AtomEntry, FeedMetadata
2994
2995
2996class FeedParser:
2997 """Parser for RSS/Atom feeds with normalization and auto-discovery."""
2998
2999 def __init__(self, user_agent: str = "thicket/0.1.0"):
3000 """Initialize the feed parser."""
3001 self.user_agent = user_agent
3002 self.allowed_tags = [
3003 "a", "abbr", "acronym", "b", "blockquote", "br", "code", "em",
3004 "i", "li", "ol", "p", "pre", "strong", "ul", "h1", "h2", "h3",
3005 "h4", "h5", "h6", "img", "div", "span",
3006 ]
3007 self.allowed_attributes = {
3008 "a": ["href", "title"],
3009 "abbr": ["title"],
3010 "acronym": ["title"],
3011 "img": ["src", "alt", "title", "width", "height"],
3012 "blockquote": ["cite"],
3013 }
3014
    async def fetch_feed(self, url: HttpUrl) -> str:
        """Fetch feed content from URL.

        Follows redirects and uses the parser's configured User-Agent with a
        30-second timeout.

        Args:
            url: Feed URL to download.

        Returns:
            The response body as text.

        Raises:
            httpx.HTTPStatusError: If the server responds with a 4xx/5xx status.
        """
        async with httpx.AsyncClient() as client:
            response = await client.get(
                str(url),
                headers={"User-Agent": self.user_agent},
                timeout=30.0,
                follow_redirects=True,
            )
            response.raise_for_status()
            return response.text
3026
    def parse_feed(self, content: str, source_url: Optional[HttpUrl] = None) -> tuple[FeedMetadata, list[AtomEntry]]:
        """Parse feed content and return metadata and entries.

        Args:
            content: Raw RSS/Atom document text.
            source_url: Original feed URL; recorded on each normalized entry.

        Returns:
            A (feed metadata, normalized entries) tuple. Entries that fail to
            normalize are skipped so one bad entry does not abort the feed.
        """
        parsed = feedparser.parse(content)

        if parsed.bozo and parsed.bozo_exception:
            # Try to continue with potentially malformed feed
            # (feedparser sets `bozo` but still exposes whatever it parsed).
            pass

        # Extract feed metadata
        feed_meta = self._extract_feed_metadata(parsed.feed)

        # Extract and normalize entries
        entries = []
        for entry in parsed.entries:
            try:
                atom_entry = self._normalize_entry(entry, source_url)
                entries.append(atom_entry)
            except Exception as e:
                # Log error but continue processing other entries
                print(f"Error processing entry {getattr(entry, 'id', 'unknown')}: {e}")
                continue

        return feed_meta, entries
3050
3051 def _extract_feed_metadata(self, feed: feedparser.FeedParserDict) -> FeedMetadata:
3052 """Extract metadata from feed for auto-discovery."""
3053 # Parse author information
3054 author_name = None
3055 author_email = None
3056 author_uri = None
3057
3058 if hasattr(feed, 'author_detail'):
3059 author_name = feed.author_detail.get('name')
3060 author_email = feed.author_detail.get('email')
3061 author_uri = feed.author_detail.get('href')
3062 elif hasattr(feed, 'author'):
3063 author_name = feed.author
3064
3065 # Parse managing editor for RSS feeds
3066 if not author_email and hasattr(feed, 'managingEditor'):
3067 author_email = feed.managingEditor
3068
3069 # Parse feed link
3070 feed_link = None
3071 if hasattr(feed, 'link'):
3072 try:
3073 feed_link = HttpUrl(feed.link)
3074 except ValidationError:
3075 pass
3076
3077 # Parse image/icon/logo
3078 logo = None
3079 icon = None
3080 image_url = None
3081
3082 if hasattr(feed, 'image'):
3083 try:
3084 image_url = HttpUrl(feed.image.get('href', feed.image.get('url', '')))
3085 except (ValidationError, AttributeError):
3086 pass
3087
3088 if hasattr(feed, 'icon'):
3089 try:
3090 icon = HttpUrl(feed.icon)
3091 except ValidationError:
3092 pass
3093
3094 if hasattr(feed, 'logo'):
3095 try:
3096 logo = HttpUrl(feed.logo)
3097 except ValidationError:
3098 pass
3099
3100 return FeedMetadata(
3101 title=getattr(feed, 'title', None),
3102 author_name=author_name,
3103 author_email=author_email,
3104 author_uri=HttpUrl(author_uri) if author_uri else None,
3105 link=feed_link,
3106 logo=logo,
3107 icon=icon,
3108 image_url=image_url,
3109 description=getattr(feed, 'description', None),
3110 )
3111
    def _normalize_entry(self, entry: feedparser.FeedParserDict, source_url: Optional[HttpUrl] = None) -> AtomEntry:
        """Normalize an entry to Atom format.

        Args:
            entry: One item from a feedparser result's `entries` list.
            source_url: Feed URL recorded on the entry's `source` field.

        Returns:
            A validated AtomEntry with sanitized HTML content/summary.

        Raises:
            pydantic.ValidationError: If required fields (e.g. the link)
                cannot be validated — callers (parse_feed) skip such entries.
        """
        # Parse timestamps; `updated` falls back to the published time.
        updated = self._parse_timestamp(entry.get('updated_parsed') or entry.get('published_parsed'))
        published = self._parse_timestamp(entry.get('published_parsed'))

        # Parse content
        content = self._extract_content(entry)
        content_type = self._extract_content_type(entry)

        # Parse author
        author = self._extract_author(entry)

        # Parse categories/tags (drop tags without a `term`)
        categories = []
        if hasattr(entry, 'tags'):
            categories = [tag.get('term', '') for tag in entry.tags if tag.get('term')]

        # Sanitize HTML content
        if content:
            content = self._sanitize_html(content)

        summary = entry.get('summary', '')
        if summary:
            summary = self._sanitize_html(summary)

        return AtomEntry(
            # Fall back to the link when the feed omits a stable id.
            id=entry.get('id', entry.get('link', '')),
            title=entry.get('title', ''),
            # NOTE(review): raises ValidationError when `link` is missing or
            # malformed — appears intentional, as parse_feed skips bad entries.
            link=HttpUrl(entry.get('link', '')),
            updated=updated,
            published=published,
            summary=summary or None,
            content=content or None,
            content_type=content_type,
            author=author,
            categories=categories,
            rights=entry.get('rights', None),
            source=str(source_url) if source_url else None,
        )
3152
3153 def _parse_timestamp(self, time_struct) -> datetime:
3154 """Parse feedparser time struct to datetime."""
3155 if time_struct:
3156 return datetime(*time_struct[:6])
3157 return datetime.now()
3158
3159 def _extract_content(self, entry: feedparser.FeedParserDict) -> Optional[str]:
3160 """Extract the best content from an entry."""
3161 # Prefer content over summary
3162 if hasattr(entry, 'content') and entry.content:
3163 # Find the best content (prefer text/html, then text/plain)
3164 for content_item in entry.content:
3165 if content_item.get('type') in ['text/html', 'html']:
3166 return content_item.get('value', '')
3167 elif content_item.get('type') in ['text/plain', 'text']:
3168 return content_item.get('value', '')
3169 # Fallback to first content item
3170 return entry.content[0].get('value', '')
3171
3172 # Fallback to summary
3173 return entry.get('summary', '')
3174
3175 def _extract_content_type(self, entry: feedparser.FeedParserDict) -> str:
3176 """Extract content type from entry."""
3177 if hasattr(entry, 'content') and entry.content:
3178 content_type = entry.content[0].get('type', 'html')
3179 # Normalize content type
3180 if content_type in ['text/html', 'html']:
3181 return 'html'
3182 elif content_type in ['text/plain', 'text']:
3183 return 'text'
3184 elif content_type == 'xhtml':
3185 return 'xhtml'
3186 return 'html'
3187
3188 def _extract_author(self, entry: feedparser.FeedParserDict) -> Optional[dict]:
3189 """Extract author information from entry."""
3190 author = {}
3191
3192 if hasattr(entry, 'author_detail'):
3193 author.update({
3194 'name': entry.author_detail.get('name'),
3195 'email': entry.author_detail.get('email'),
3196 'uri': entry.author_detail.get('href'),
3197 })
3198 elif hasattr(entry, 'author'):
3199 author['name'] = entry.author
3200
3201 return author if author else None
3202
3203 def _sanitize_html(self, html: str) -> str:
3204 """Sanitize HTML content to prevent XSS."""
3205 return bleach.clean(
3206 html,
3207 tags=self.allowed_tags,
3208 attributes=self.allowed_attributes,
3209 strip=True,
3210 )
3211
3212 def sanitize_entry_id(self, entry_id: str) -> str:
3213 """Sanitize entry ID to be a safe filename."""
3214 # Parse URL to get meaningful parts
3215 parsed = urlparse(entry_id)
3216
3217 # Start with the path component
3218 if parsed.path:
3219 # Remove leading slash and replace problematic characters
3220 safe_id = parsed.path.lstrip('/').replace('/', '_').replace('\\', '_')
3221 else:
3222 # Use the entire ID as fallback
3223 safe_id = entry_id
3224
3225 # Replace problematic characters
3226 safe_chars = []
3227 for char in safe_id:
3228 if char.isalnum() or char in '-_.':
3229 safe_chars.append(char)
3230 else:
3231 safe_chars.append('_')
3232
3233 safe_id = ''.join(safe_chars)
3234
3235 # Ensure it's not too long (max 200 chars)
3236 if len(safe_id) > 200:
3237 safe_id = safe_id[:200]
3238
3239 # Ensure it's not empty
3240 if not safe_id:
3241 safe_id = "entry"
3242
3243 return safe_id
3244</file>
3245
3246<file path="src/thicket/core/reference_parser.py">
3247"""Reference detection and parsing for blog entries."""
3248
3249import re
3250from typing import Optional
3251from urllib.parse import urlparse
3252
3253from ..models import AtomEntry
3254
3255
class BlogReference:
    """A directed reference from one blog entry to another blog URL.

    The target side may be only partially known: ``target_username`` and
    ``target_entry_id`` stay ``None`` until resolution succeeds.
    """

    def __init__(
        self,
        source_entry_id: str,
        source_username: str,
        target_url: str,
        target_username: Optional[str] = None,
        target_entry_id: Optional[str] = None,
    ):
        self.source_entry_id = source_entry_id
        self.source_username = source_username
        self.target_url = target_url
        self.target_username = target_username
        self.target_entry_id = target_entry_id

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict, omitting unresolved optional fields."""
        data = {
            "source_entry_id": self.source_entry_id,
            "source_username": self.source_username,
            "target_url": self.target_url,
        }
        for optional_field in ("target_username", "target_entry_id"):
            value = getattr(self, optional_field)
            if value is not None:
                data[optional_field] = value
        return data

    @classmethod
    def from_dict(cls, data: dict) -> "BlogReference":
        """Rebuild a reference from a dict produced by ``to_dict``."""
        return cls(
            data["source_entry_id"],
            data["source_username"],
            data["target_url"],
            target_username=data.get("target_username"),
            target_entry_id=data.get("target_entry_id"),
        )
3299
3300
class ReferenceIndex:
    """Index of blog-to-blog references for creating threaded views."""

    def __init__(self):
        # Flat list of every reference, plus lookup tables keyed by
        # "username:entry_id" strings for fast per-entry queries.
        self.references: list[BlogReference] = []
        self.outbound_refs: dict[str, list[BlogReference]] = {}
        self.inbound_refs: dict[str, list[BlogReference]] = {}
        self.user_domains: dict[str, set[str]] = {}  # username -> set of domains

    def add_reference(self, ref: BlogReference) -> None:
        """Record a reference and update both lookup tables."""
        self.references.append(ref)

        source_key = f"{ref.source_username}:{ref.source_entry_id}"
        self.outbound_refs.setdefault(source_key, []).append(ref)

        # The inbound side is only indexable once the target is resolved.
        if ref.target_username and ref.target_entry_id:
            target_key = f"{ref.target_username}:{ref.target_entry_id}"
            self.inbound_refs.setdefault(target_key, []).append(ref)

    def get_outbound_refs(self, username: str, entry_id: str) -> list[BlogReference]:
        """Return the references made *by* the given entry."""
        return self.outbound_refs.get(f"{username}:{entry_id}", [])

    def get_inbound_refs(self, username: str, entry_id: str) -> list[BlogReference]:
        """Return the references pointing *at* the given entry."""
        return self.inbound_refs.get(f"{username}:{entry_id}", [])

    def get_thread_members(self, username: str, entry_id: str) -> set[tuple[str, str]]:
        """Collect every (username, entry_id) reachable from this entry.

        Walks the reference graph in both directions (outbound targets and
        inbound sources) and returns the full connected thread.
        """
        thread: set[tuple[str, str]] = set()
        pending = [(username, entry_id)]

        while pending:
            node = pending.pop()
            if node in thread:
                continue
            thread.add(node)

            user, entry = node
            # Outbound edges are walkable only when fully resolved.
            for ref in self.get_outbound_refs(user, entry):
                if ref.target_username and ref.target_entry_id:
                    pending.append((ref.target_username, ref.target_entry_id))
            # Inbound edges always know their source.
            for ref in self.get_inbound_refs(user, entry):
                pending.append((ref.source_username, ref.source_entry_id))

        return thread

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (sets become lists)."""
        return {
            "references": [ref.to_dict() for ref in self.references],
            "user_domains": {
                user: list(domains) for user, domains in self.user_domains.items()
            },
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ReferenceIndex":
        """Rebuild an index (including lookup tables) from ``to_dict`` output."""
        index = cls()
        for ref_data in data.get("references", []):
            index.add_reference(BlogReference.from_dict(ref_data))

        for username, domains in data.get("user_domains", {}).items():
            index.user_domains[username] = set(domains)

        return index
3385
3386
class ReferenceParser:
    """Parses blog entries to detect references to other blogs.

    Detection is heuristic: links are harvested from entry HTML, filtered
    through URL patterns that look blog-like, then resolved (when possible)
    to known users and concrete entry IDs.
    """

    def __init__(self):
        # Heuristic URL patterns that suggest a link points at a blog.
        # Kept as plain strings for backward compatibility with any code
        # reading this attribute; compiled once below for speed.
        self.blog_patterns = [
            r"https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*",  # Common blog domains
            r"https?://[^/]+\.github\.io/.*",  # GitHub Pages
            r"https?://[^/]+\.substack\.com/.*",  # Substack
            r"https?://medium\.com/.*",  # Medium
            r"https?://[^/]+\.wordpress\.com/.*",  # WordPress.com
            r"https?://[^/]+\.blogspot\.com/.*",  # Blogger
        ]
        # Pre-compiled once so is_blog_url() does not recompile per call.
        self._compiled_blog_patterns = [re.compile(p) for p in self.blog_patterns]

        # Anchor tags; now accepts both single- and double-quoted href values
        # (the old pattern silently missed href='...' links).
        self.link_pattern = re.compile(
            r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL
        )
        self.url_pattern = re.compile(r'https?://[^\s<>"]+')

    def extract_links_from_html(self, html_content: str) -> list[tuple[str, str]]:
        """Extract (url, link_text) pairs from all <a> tags in HTML content."""
        links = []
        for match in self.link_pattern.finditer(html_content):
            url = match.group(1)
            # Strip nested HTML tags from the anchor text.
            text = re.sub(r"<[^>]+>", "", match.group(2)).strip()
            links.append((url, text))
        return links

    def is_blog_url(self, url: str) -> bool:
        """Check if a URL likely points to a blog post."""
        return any(pattern.match(url) for pattern in self._compiled_blog_patterns)

    def _is_likely_blog_post_url(self, url: str) -> bool:
        """Check if a same-domain URL likely points to a blog post (not CSS, images, etc.)."""
        path = urlparse(url).path.lower()

        # Skip obvious non-blog content by file extension.
        if any(path.endswith(ext) for ext in ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.pdf', '.xml', '.json']):
            return False

        # Skip common static-asset directories.
        if any(segment in path for segment in ['/static/', '/assets/', '/css/', '/js/', '/images/', '/img/', '/media/', '/uploads/']):
            return False

        # Skip homepage and fragment-only (same-page anchor) links.
        if not path or path == '/':
            return False

        # Positive indicators: dated archives and common post directories.
        blog_indicators = [
            r'/\d{4}/',        # Year in path
            r'/\d{4}/\d{2}/',  # Year/month in path
            r'/blog/',
            r'/post/',
            r'/posts/',
            r'/articles?/',
            r'/notes?/',
            r'/entries/',
            r'/writing/',
        ]
        if any(re.search(pattern, path) for pattern in blog_indicators):
            return True

        # Otherwise accept anything with at least one meaningful path segment.
        path_segments = [seg for seg in path.split('/') if seg]
        return len(path_segments) >= 1

    def resolve_target_user(
        self, url: str, user_domains: dict[str, set[str]]
    ) -> Optional[str]:
        """Try to resolve a URL to a known user based on domain mapping."""
        domain = urlparse(url).netloc.lower()

        for username, domains in user_domains.items():
            if domain in domains:
                return username

        return None

    def extract_references(
        self, entry: AtomEntry, username: str, user_domains: dict[str, set[str]]
    ) -> list[BlogReference]:
        """Extract all blog references from an entry's content and summary.

        target_entry_id is left None; resolve_target_entry_ids() fills it in.
        """
        references = []

        # Analyse both the full content and the summary when present.
        content_to_search = [text for text in (entry.content, entry.summary) if text]

        # Loop-invariant: the entry's own domain never changes per link.
        entry_domain = urlparse(str(entry.link)).netloc.lower() if entry.link else ""

        for content in content_to_search:
            for url, _link_text in self.extract_links_from_html(content):
                link_domain = urlparse(url).netloc.lower()

                # Only consider URLs that look blog-like at all.
                if not self.is_blog_url(url):
                    continue

                if link_domain == entry_domain:
                    # Same-domain links get extra filtering to skip assets;
                    # the target user is then the source user.
                    if not self._is_likely_blog_post_url(url):
                        continue
                    target_username: Optional[str] = username
                else:
                    # Cross-domain: try to map the domain to a known user.
                    target_username = self.resolve_target_user(url, user_domains)

                references.append(
                    BlogReference(
                        source_entry_id=entry.id,
                        source_username=username,
                        target_url=url,
                        target_username=target_username,
                        target_entry_id=None,  # Resolved later if possible
                    )
                )

        return references

    def build_user_domain_mapping(self, git_store: "GitStore") -> dict[str, set[str]]:
        """Build a mapping of usernames to their known domains (feed URLs + homepage)."""
        user_domains: dict[str, set[str]] = {}
        index = git_store._load_index()

        for username, user_metadata in index.users.items():
            domains = set()

            # Domains from each configured feed URL.
            for feed_url in user_metadata.feeds:
                domain = urlparse(feed_url).netloc.lower()
                if domain:
                    domains.add(domain)

            # Domain from the user's homepage, when set.
            if user_metadata.homepage:
                domain = urlparse(str(user_metadata.homepage)).netloc.lower()
                if domain:
                    domains.add(domain)

            user_domains[username] = domains

        return user_domains

    def _build_url_to_entry_mapping(self, git_store: "GitStore") -> dict[str, str]:
        """Build a comprehensive mapping from URLs to entry IDs using git store data.

        This creates a mapping that handles:
        - Entry link URLs -> Entry IDs
        - URL variations (with/without www, http/https)
        - Multiple URLs pointing to the same entry
        """
        url_to_entry: dict[str, str] = {}
        index = git_store._load_index()

        for username in index.users.keys():
            for entry in git_store.list_entries(username):
                if not entry.link:
                    continue
                link_url = str(entry.link)
                entry_id = entry.id

                # Map the canonical link URL.
                url_to_entry[link_url] = entry_id

                parsed = urlparse(link_url)
                if parsed.netloc and parsed.path:
                    # www <-> no-www variant (always exactly one of the two).
                    if parsed.netloc.startswith('www.'):
                        variant_netloc = parsed.netloc[4:]
                    else:
                        variant_netloc = f"www.{parsed.netloc}"
                    variant = f"{parsed.scheme}://{variant_netloc}{parsed.path}"
                    if parsed.query:
                        variant += f"?{parsed.query}"
                    if parsed.fragment:
                        variant += f"#{parsed.fragment}"
                    url_to_entry[variant] = entry_id

                    # http/https variant of the canonical URL.
                    if parsed.scheme == 'https':
                        url_to_entry[link_url.replace('https://', 'http://', 1)] = entry_id
                    elif parsed.scheme == 'http':
                        url_to_entry[link_url.replace('http://', 'https://', 1)] = entry_id

        return url_to_entry

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL for consistent matching.

        Drops the fragment and any trailing slash (except for the bare root
        path) so minor spelling differences still compare equal.
        """
        parsed = urlparse(url)

        path = parsed.path.rstrip('/') if parsed.path != '/' else parsed.path

        normalized = f"{parsed.scheme}://{parsed.netloc}{path}"
        if parsed.query:
            normalized += f"?{parsed.query}"

        return normalized

    def resolve_target_entry_ids(
        self, references: list[BlogReference], git_store: "GitStore"
    ) -> list[BlogReference]:
        """Resolve target_entry_id for references using comprehensive URL mapping.

        References that are already resolved, or that have no target user,
        are passed through unchanged.
        """
        resolved_refs = []

        url_to_entry = self._build_url_to_entry_mapping(git_store)

        # Precompute normalized-URL lookups once instead of rescanning the
        # whole mapping for every reference (was O(refs x urls)); setdefault
        # keeps the first match, mirroring the old linear-scan order.
        normalized_to_entry: dict[str, str] = {}
        for mapped_url, mapped_entry_id in url_to_entry.items():
            normalized_to_entry.setdefault(self._normalize_url(mapped_url), mapped_entry_id)

        # Cache each target user's entry IDs; the old code re-listed entries
        # from the git store for every single reference.
        user_entry_ids: dict[str, set[str]] = {}

        for ref in references:
            # Already resolved, or unresolvable without a target user.
            if ref.target_entry_id is not None or ref.target_username is None:
                resolved_refs.append(ref)
                continue

            # Exact match first, then the normalized forms.
            resolved_entry_id = url_to_entry.get(ref.target_url)
            if resolved_entry_id is None:
                normalized_target = self._normalize_url(ref.target_url)
                resolved_entry_id = url_to_entry.get(normalized_target)
                if resolved_entry_id is None:
                    resolved_entry_id = normalized_to_entry.get(normalized_target)

            # Verify the resolved entry actually belongs to the target user.
            if resolved_entry_id:
                target = ref.target_username
                if target not in user_entry_ids:
                    user_entry_ids[target] = {
                        entry.id for entry in git_store.list_entries(target)
                    }
                if resolved_entry_id not in user_entry_ids[target]:
                    resolved_entry_id = None

            resolved_refs.append(
                BlogReference(
                    source_entry_id=ref.source_entry_id,
                    source_username=ref.source_username,
                    target_url=ref.target_url,
                    target_username=ref.target_username,
                    target_entry_id=resolved_entry_id,
                )
            )

        return resolved_refs
3685</file>
3686
3687<file path="src/thicket/models/__init__.py">
3688"""Data models for thicket."""
3689
3690from .config import ThicketConfig, UserConfig
3691from .feed import AtomEntry, DuplicateMap, FeedMetadata
3692from .user import GitStoreIndex, UserMetadata
3693
3694__all__ = [
3695 "ThicketConfig",
3696 "UserConfig",
3697 "AtomEntry",
3698 "DuplicateMap",
3699 "FeedMetadata",
3700 "GitStoreIndex",
3701 "UserMetadata",
3702]
3703</file>
3704
3705<file path="src/thicket/models/feed.py">
3706"""Feed and entry models for thicket."""
3707
3708from datetime import datetime
3709from typing import TYPE_CHECKING, Optional
3710
3711from pydantic import BaseModel, ConfigDict, EmailStr, HttpUrl
3712
3713if TYPE_CHECKING:
3714 from .config import UserConfig
3715
3716
class AtomEntry(BaseModel):
    """Represents an Atom feed entry stored in the Git repository.

    Field names mirror Atom element names so that as much feed metadata as
    possible survives the round-trip through JSON storage.
    """

    model_config = ConfigDict(
        # NOTE(review): json_encoders is deprecated in pydantic v2 in favour
        # of field serializers — works today, but worth migrating.
        json_encoders={datetime: lambda v: v.isoformat()},
        str_strip_whitespace=True,
    )

    id: str  # Original Atom ID
    title: str
    link: HttpUrl
    updated: datetime  # Last-updated timestamp
    published: Optional[datetime] = None  # Original publication time, if given
    summary: Optional[str] = None  # Short abstract (sanitized HTML by the feed parser)
    content: Optional[str] = None  # Full body content from Atom entry
    content_type: Optional[str] = "html"  # text, html, xhtml
    author: Optional[dict] = None  # name/email/uri keys, when available
    categories: list[str] = []  # Category/tag terms
    rights: Optional[str] = None  # Copyright info
    source: Optional[str] = None  # Source feed URL
3737
3738
class FeedMetadata(BaseModel):
    """Metadata extracted from a feed for auto-discovery.

    All fields are optional because feeds vary widely in what they expose;
    to_user_config() applies sensible fallbacks.
    """

    title: Optional[str] = None
    author_name: Optional[str] = None
    author_email: Optional[EmailStr] = None
    author_uri: Optional[HttpUrl] = None
    link: Optional[HttpUrl] = None  # presumably the feed's website link — used as homepage fallback
    logo: Optional[HttpUrl] = None
    icon: Optional[HttpUrl] = None
    image_url: Optional[HttpUrl] = None
    description: Optional[str] = None

    def to_user_config(self, username: str, feed_url: HttpUrl) -> "UserConfig":
        """Convert discovered metadata to UserConfig with fallbacks.

        Fallback chains: display name prefers the author's name over the
        feed title; homepage prefers the author URI over the feed link;
        icon tries logo, then icon, then image_url.
        """
        # Runtime import: UserConfig is only imported under TYPE_CHECKING at
        # module level (see top of file), so import it locally here.
        from .config import UserConfig

        return UserConfig(
            username=username,
            feeds=[feed_url],
            display_name=self.author_name or self.title,
            email=self.author_email,
            homepage=self.author_uri or self.link,
            icon=self.logo or self.icon or self.image_url,
        )
3764
3765
class DuplicateMap(BaseModel):
    """Maps duplicate entry IDs to canonical entry IDs."""

    # duplicate_id -> canonical_id
    duplicates: dict[str, str] = {}
    comment: str = "Entry IDs that map to the same canonical content"

    def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
        """Register *duplicate_id* as an alias of *canonical_id*."""
        self.duplicates[duplicate_id] = canonical_id

    def remove_duplicate(self, duplicate_id: str) -> bool:
        """Drop a duplicate mapping; return True if it was present."""
        try:
            del self.duplicates[duplicate_id]
        except KeyError:
            return False
        return True

    def get_canonical(self, entry_id: str) -> str:
        """Resolve an ID to its canonical form (identity when not a duplicate)."""
        return self.duplicates.get(entry_id, entry_id)

    def is_duplicate(self, entry_id: str) -> bool:
        """Return True when *entry_id* has been mapped onto another entry."""
        return entry_id in self.duplicates

    def get_duplicates_for_canonical(self, canonical_id: str) -> list[str]:
        """List every duplicate ID that resolves to *canonical_id*."""
        return [
            dup for dup, canonical in self.duplicates.items()
            if canonical == canonical_id
        ]
3795</file>
3796
3797<file path="src/thicket/models/user.py">
3798"""User metadata models for thicket."""
3799
3800from datetime import datetime
3801from typing import Optional
3802
3803from pydantic import BaseModel, ConfigDict
3804
3805
class UserMetadata(BaseModel):
    """Metadata about a user stored in the Git repository.

    Persisted as part of the store's index; tracks where the user's entries
    live (``directory``) and how many have been stored (``entry_count``).
    """

    model_config = ConfigDict(
        # NOTE(review): json_encoders is deprecated in pydantic v2 in favour
        # of field serializers — works today, but worth migrating.
        json_encoders={datetime: lambda v: v.isoformat()},
        str_strip_whitespace=True,
    )

    username: str
    display_name: Optional[str] = None
    email: Optional[str] = None
    homepage: Optional[str] = None
    icon: Optional[str] = None
    feeds: list[str] = []  # Feed URLs associated with this user
    directory: str  # Directory name in Git store
    created: datetime
    last_updated: datetime
    entry_count: int = 0  # Number of stored entries for this user

    def update_timestamp(self) -> None:
        """Update the last_updated timestamp to now."""
        self.last_updated = datetime.now()

    def increment_entry_count(self, count: int = 1) -> None:
        """Increment the entry count by the given amount and touch last_updated."""
        self.entry_count += count
        self.update_timestamp()
3833
3834
class GitStoreIndex(BaseModel):
    """Index of all users and their directories in the Git store."""

    model_config = ConfigDict(
        json_encoders={datetime: lambda v: v.isoformat()}
    )

    users: dict[str, UserMetadata] = {}  # username -> UserMetadata
    created: datetime
    last_updated: datetime
    total_entries: int = 0

    def _touch(self) -> None:
        """Stamp the index as modified now."""
        self.last_updated = datetime.now()

    def add_user(self, user_metadata: UserMetadata) -> None:
        """Insert or replace a user record in the index."""
        self.users[user_metadata.username] = user_metadata
        self._touch()

    def remove_user(self, username: str) -> bool:
        """Delete a user record; return True if one existed."""
        if self.users.pop(username, None) is None:
            return False
        self._touch()
        return True

    def get_user(self, username: str) -> Optional[UserMetadata]:
        """Look up a user's metadata, or None when unknown."""
        return self.users.get(username)

    def update_entry_count(self, username: str, count: int) -> None:
        """Add *count* entries to a user's tally and to the global total."""
        user = self.get_user(username)
        if user is not None:
            user.increment_entry_count(count)
            self.total_entries += count
            self._touch()

    def recalculate_totals(self) -> None:
        """Recompute the global entry total from the per-user counts."""
        self.total_entries = sum(u.entry_count for u in self.users.values())
        self._touch()
3876</file>
3877
3878<file path="src/thicket/utils/__init__.py">
3879"""Utility modules for thicket."""
3880
3881# This module will contain shared utilities
3882# For now, it's empty but can be expanded with common functions
3883</file>
3884
3885<file path="src/thicket/__init__.py">
"""Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories."""

# Read by hatch at build time ([tool.hatch.version] in pyproject.toml points
# at this file), so keep the assignment a plain string literal.
__version__ = "0.1.0"
__author__ = "thicket"
__email__ = "thicket@example.com"
3891</file>
3892
3893<file path="src/thicket/__main__.py">
"""Entry point for running thicket as a module (``python -m thicket``)."""

from .cli.main import app

# Delegate straight to the Typer CLI application.
if __name__ == "__main__":
    app()
3900</file>
3901
3902<file path=".gitignore">
3903# Byte-compiled / optimized / DLL files
3904__pycache__/
3905*.py[codz]
3906*$py.class
3907
3908# C extensions
3909*.so
3910
3911# Distribution / packaging
3912.Python
3913build/
3914develop-eggs/
3915dist/
3916downloads/
3917eggs/
3918.eggs/
3919lib/
3920lib64/
3921parts/
3922sdist/
3923var/
3924wheels/
3925share/python-wheels/
3926*.egg-info/
3927.installed.cfg
3928*.egg
3929MANIFEST
3930
3931# PyInstaller
3932# Usually these files are written by a python script from a template
3933# before PyInstaller builds the exe, so as to inject date/other infos into it.
3934*.manifest
3935*.spec
3936
3937# Installer logs
3938pip-log.txt
3939pip-delete-this-directory.txt
3940
3941# Unit test / coverage reports
3942htmlcov/
3943.tox/
3944.nox/
3945.coverage
3946.coverage.*
3947.cache
3948nosetests.xml
3949coverage.xml
3950*.cover
3951*.py.cover
3952.hypothesis/
3953.pytest_cache/
3954cover/
3955
3956# Translations
3957*.mo
3958*.pot
3959
3960# Django stuff:
3961*.log
3962local_settings.py
3963db.sqlite3
3964db.sqlite3-journal
3965
3966# Flask stuff:
3967instance/
3968.webassets-cache
3969
3970# Scrapy stuff:
3971.scrapy
3972
3973# Sphinx documentation
3974docs/_build/
3975
3976# PyBuilder
3977.pybuilder/
3978target/
3979
3980# Jupyter Notebook
3981.ipynb_checkpoints
3982
3983# IPython
3984profile_default/
3985ipython_config.py
3986
3987# pyenv
3988# For a library or package, you might want to ignore these files since the code is
3989# intended to run in multiple environments; otherwise, check them in:
3990# .python-version
3991
3992# pipenv
3993# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
3994# However, in case of collaboration, if having platform-specific dependencies or dependencies
3995# having no cross-platform support, pipenv may install dependencies that don't work, or not
3996# install all needed dependencies.
3997#Pipfile.lock
3998
3999# UV
4000# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
4001# This is especially recommended for binary packages to ensure reproducibility, and is more
4002# commonly ignored for libraries.
4003#uv.lock
4004
4005# poetry
4006# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
4007# This is especially recommended for binary packages to ensure reproducibility, and is more
4008# commonly ignored for libraries.
4009# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
4010#poetry.lock
4011#poetry.toml
4012
4013# pdm
4014# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
4015# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
4016# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
4017#pdm.lock
4018#pdm.toml
4019.pdm-python
4020.pdm-build/
4021
4022# pixi
4023# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
4024#pixi.lock
4025# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
4026# in the .venv directory. It is recommended not to include this directory in version control.
4027.pixi
4028
4029# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
4030__pypackages__/
4031
4032# Celery stuff
4033celerybeat-schedule
4034celerybeat.pid
4035
4036# SageMath parsed files
4037*.sage.py
4038
4039# Environments
4040.env
4041.envrc
4042.venv
4043env/
4044venv/
4045ENV/
4046env.bak/
4047venv.bak/
4048
4049# Spyder project settings
4050.spyderproject
4051.spyproject
4052
4053# Rope project settings
4054.ropeproject
4055
4056# mkdocs documentation
4057/site
4058
4059# mypy
4060.mypy_cache/
4061.dmypy.json
4062dmypy.json
4063
4064# Pyre type checker
4065.pyre/
4066
4067# pytype static type analyzer
4068.pytype/
4069
4070# Cython debug symbols
4071cython_debug/
4072
4073# PyCharm
4074# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
4075# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
4076# and can be added to the global gitignore or merged into this file. For a more nuclear
4077# option (not recommended) you can uncomment the following to ignore the entire idea folder.
4078#.idea/
4079
4080# Abstra
4081# Abstra is an AI-powered process automation framework.
4082# Ignore directories containing user credentials, local state, and settings.
4083# Learn more at https://abstra.io/docs
4084.abstra/
4085
4086# Visual Studio Code
4087# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
4088# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
4089# and can be added to the global gitignore or merged into this file. However, if you prefer,
4090# you could uncomment the following to ignore the entire vscode folder
4091# .vscode/
4092
4093# Ruff stuff:
4094.ruff_cache/
4095
4096# PyPI configuration file
4097.pypirc
4098
4099# Marimo
4100marimo/_static/
4101marimo/_lsp/
4102__marimo__/
4103
4104# Streamlit
4105.streamlit/secrets.toml
4106
4107thicket.yaml
4108</file>
4109
4110<file path="CLAUDE.md">
4111My goal is to build a CLI tool called thicket in Python that maintains a Git repository within which Atom feeds can be persisted, including their contents.
4112
4113# Python Environment and Package Management
4114
4115This project uses `uv` for Python package management and virtual environment handling.
4116
4117## Running Commands
4118
4119ALWAYS use `uv run` to execute Python commands:
4120
4121- Run the CLI: `uv run -m thicket`
4122- Run tests: `uv run pytest`
4123- Type checking: `uv run mypy src/`
4124- Linting: `uv run ruff check src/`
4125- Format code: `uv run ruff format src/`
4126- Compile check: `uv run python -m py_compile <file>`
4127
4128## Package Management
4129
4130- Add dependencies: `uv add <package>`
4131- Add dev dependencies: `uv add --dev <package>`
4132- Install dependencies: `uv sync`
4133- Update dependencies: `uv lock --upgrade`
4134
4135# Project Structure
4136
4137The configuration file specifies:
4138- the location of a git store
4139- a list of usernames and target Atom/RSS feed(s) and optional metadata about the username such as their email, homepage, icon and display name
4140- a cache directory to store temporary results such as feed downloads and their last modification date that speed up operations across runs of the tool
4141
4142The Git data store should:
4143- have a subdirectory per user
4144- within that directory, an entry per Atom entry indexed by the Atom id for that entry. The id should be sanitised consistently to be a safe filename. RSS feed should be normalized to Atom before storing it.
4145- within each entry file, the metadata of the Atom feed converted into a JSON format that preserves as much metadata as possible.
4146- have a JSON file in the Git repository that indexes the users, their associated directories within the Git repository, and any other metadata about that user from the config file
The CLI should be modern and use cool progress bars and other niceties from ecosystem libraries.
4148
The intention behind the Git repository is that it can be queried by other websites in order to build a weblog structure of comments that link to other blogs.
4150</file>
4151
4152<file path="pyproject.toml">
4153[build-system]
4154requires = ["hatchling"]
4155build-backend = "hatchling.build"
4156
4157[project]
4158name = "thicket"
4159dynamic = ["version"]
4160description = "A CLI tool for persisting Atom/RSS feeds in Git repositories"
4161readme = "README.md"
4162license = "MIT"
4163requires-python = ">=3.9"
4164authors = [
4165 {name = "thicket", email = "thicket@example.com"},
4166]
4167classifiers = [
4168 "Development Status :: 3 - Alpha",
4169 "Intended Audience :: Developers",
4170 "License :: OSI Approved :: MIT License",
4171 "Operating System :: OS Independent",
4172 "Programming Language :: Python :: 3",
4173 "Programming Language :: Python :: 3.9",
4174 "Programming Language :: Python :: 3.10",
4175 "Programming Language :: Python :: 3.11",
4176 "Programming Language :: Python :: 3.12",
4177 "Programming Language :: Python :: 3.13",
4178 "Topic :: Internet :: WWW/HTTP :: Dynamic Content :: News/Diary",
4179 "Topic :: Software Development :: Version Control :: Git",
4180 "Topic :: Text Processing :: Markup :: XML",
4181]
4182dependencies = [
4183 "typer>=0.15.0",
4184 "rich>=13.0.0",
4185 "GitPython>=3.1.40",
4186 "feedparser>=6.0.11",
4187 "pydantic>=2.11.0",
4188 "pydantic-settings>=2.10.0",
4189 "httpx>=0.28.0",
4190 "pendulum>=3.0.0",
4191 "bleach>=6.0.0",
4192 "platformdirs>=4.0.0",
4193 "pyyaml>=6.0.0",
4194 "email_validator",
4195 "jinja2>=3.1.6",
4196]
4197
4198[project.optional-dependencies]
4199dev = [
4200 "pytest>=8.0.0",
4201 "pytest-asyncio>=0.24.0",
4202 "pytest-cov>=6.0.0",
4203 "black>=24.0.0",
4204 "ruff>=0.8.0",
4205 "mypy>=1.13.0",
4206 "types-PyYAML>=6.0.0",
4207]
4208
4209[project.urls]
4210Homepage = "https://github.com/example/thicket"
4211Documentation = "https://github.com/example/thicket"
4212Repository = "https://github.com/example/thicket"
4213"Bug Tracker" = "https://github.com/example/thicket/issues"
4214
4215[project.scripts]
4216thicket = "thicket.cli.main:app"
4217
4218[tool.hatch.version]
4219path = "src/thicket/__init__.py"
4220
4221[tool.hatch.build.targets.wheel]
4222packages = ["src/thicket"]
4223
4224[tool.black]
4225line-length = 88
4226target-version = ['py39']
4227include = '\.pyi?$'
4228extend-exclude = '''
4229/(
4230 # directories
4231 \.eggs
4232 | \.git
4233 | \.hg
4234 | \.mypy_cache
4235 | \.tox
4236 | \.venv
4237 | build
4238 | dist
4239)/
4240'''
4241
4242[tool.ruff]
4243target-version = "py39"
4244line-length = 88
4245
4246[tool.ruff.lint]
4247select = [
4248 "E", # pycodestyle errors
4249 "W", # pycodestyle warnings
4250 "F", # pyflakes
4251 "I", # isort
4252 "B", # flake8-bugbear
4253 "C4", # flake8-comprehensions
4254 "UP", # pyupgrade
4255]
4256ignore = [
4257 "E501", # line too long, handled by black
4258 "B008", # do not perform function calls in argument defaults
4259 "C901", # too complex
4260]
4261
4262[tool.ruff.lint.per-file-ignores]
4263"__init__.py" = ["F401"]
4264
4265[tool.mypy]
4266python_version = "3.9"
4267check_untyped_defs = true
4268disallow_any_generics = true
4269disallow_incomplete_defs = true
4270disallow_untyped_defs = true
4271no_implicit_optional = true
4272warn_redundant_casts = true
4273warn_unused_ignores = true
4274warn_return_any = true
4275strict_optional = true
4276
4277[[tool.mypy.overrides]]
4278module = [
4279 "feedparser",
4280 "git",
4281 "bleach",
4282]
4283ignore_missing_imports = true
4284
4285[tool.pytest.ini_options]
4286testpaths = ["tests"]
4287python_files = ["test_*.py"]
4288python_classes = ["Test*"]
4289python_functions = ["test_*"]
4290addopts = [
4291 "-ra",
4292 "--strict-markers",
4293 "--strict-config",
4294 "--cov=src/thicket",
4295 "--cov-report=term-missing",
4296 "--cov-report=html",
4297 "--cov-report=xml",
4298]
4299filterwarnings = [
4300 "error",
4301 "ignore::UserWarning",
4302 "ignore::DeprecationWarning",
4303]
4304markers = [
4305 "slow: marks tests as slow (deselect with '-m \"not slow\"')",
4306 "integration: marks tests as integration tests",
4307]
4308
4309[tool.coverage.run]
4310source = ["src"]
4311branch = true
4312
4313[tool.coverage.report]
4314exclude_lines = [
4315 "pragma: no cover",
4316 "def __repr__",
4317 "if self.debug:",
4318 "if settings.DEBUG",
4319 "raise AssertionError",
4320 "raise NotImplementedError",
4321 "if 0:",
4322 "if __name__ == .__main__.:",
4323 "class .*\\bProtocol\\):",
4324 "@(abc\\.)?abstractmethod",
4325]
4326</file>
4327
4328<file path="src/thicket/cli/commands/__init__.py">
4329"""CLI commands for thicket."""
4330
4331# Import all commands to register them with the main app
4332from . import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync
4333
4334__all__ = ["add", "duplicates", "generate", "index_cmd", "info_cmd", "init", "links_cmd", "list_cmd", "sync"]
4335</file>
4336
4337<file path="src/thicket/cli/commands/add.py">
4338"""Add command for thicket."""
4339
4340import asyncio
4341from pathlib import Path
4342from typing import Optional
4343
4344import typer
4345from pydantic import HttpUrl, ValidationError
4346
4347from ...core.feed_parser import FeedParser
4348from ...core.git_store import GitStore
4349from ..main import app
4350from ..utils import (
4351 create_progress,
4352 load_config,
4353 print_error,
4354 print_info,
4355 print_success,
4356)
4357
4358
@app.command("add")
def add_command(
    subcommand: str = typer.Argument(..., help="Subcommand: 'user' or 'feed'"),
    username: str = typer.Argument(..., help="Username"),
    feed_url: Optional[str] = typer.Argument(None, help="Feed URL (required for 'user' command)"),
    email: Optional[str] = typer.Option(None, "--email", "-e", help="User email"),
    homepage: Optional[str] = typer.Option(None, "--homepage", "-h", help="User homepage"),
    icon: Optional[str] = typer.Option(None, "--icon", "-i", help="User icon URL"),
    display_name: Optional[str] = typer.Option(None, "--display-name", "-d", help="User display name"),
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"), "--config", help="Configuration file path"
    ),
    auto_discover: bool = typer.Option(
        True, "--auto-discover/--no-auto-discover", help="Auto-discover user metadata from feed"
    ),
) -> None:
    """Add a user or feed to thicket.

    Dispatches to ``add_user`` or ``add_feed`` based on *subcommand*;
    any other value reports an error and exits with status 1.
    """
    if subcommand == "user":
        add_user(username, feed_url, email, homepage, icon, display_name, config_file, auto_discover)
        return
    if subcommand == "feed":
        add_feed(username, feed_url, config_file)
        return

    print_error(f"Unknown subcommand: {subcommand}")
    print_error("Use 'user' or 'feed'")
    raise typer.Exit(1)
4385
4386
def _first_nonempty_str(*values: object) -> Optional[str]:
    """Return ``str()`` of the first truthy value, or None if all are falsy.

    Used instead of ``str(a or b)`` so that a chain of all-None values
    yields None rather than the literal string "None".
    """
    for value in values:
        if value:
            return str(value)
    return None


def add_user(
    username: str,
    feed_url: Optional[str],
    email: Optional[str],
    homepage: Optional[str],
    icon: Optional[str],
    display_name: Optional[str],
    config_file: Path,
    auto_discover: bool,
) -> None:
    """Add a new user with feed.

    Validates the feed URL, refuses to overwrite an existing user, optionally
    auto-discovers user metadata from the feed (manual options always win),
    stores the user in the Git store and commits. Exits with status 1 on any
    validation failure.
    """

    if not feed_url:
        print_error("Feed URL is required when adding a user")
        raise typer.Exit(1)

    # Validate feed URL
    try:
        validated_feed_url = HttpUrl(feed_url)
    except ValidationError:
        print_error(f"Invalid feed URL: {feed_url}")
        raise typer.Exit(1) from None

    # Load configuration
    config = load_config(config_file)

    # Initialize Git store
    git_store = GitStore(config.git_store)

    # Check if user already exists
    existing_user = git_store.get_user(username)
    if existing_user:
        print_error(f"User '{username}' already exists")
        print_error("Use 'thicket add feed' to add additional feeds")
        raise typer.Exit(1)

    # Auto-discover metadata if enabled
    discovered_metadata = None
    if auto_discover:
        discovered_metadata = asyncio.run(discover_feed_metadata(validated_feed_url))

    # Prepare user data with manual overrides taking precedence.
    # BUG FIX: the previous code used str(x or y) on discovered fields, which
    # produced the literal string "None" when every candidate was None;
    # _first_nonempty_str returns None in that case instead.
    user_display_name = display_name
    user_email = email
    user_homepage = homepage
    user_icon = icon
    if discovered_metadata:
        user_display_name = user_display_name or (
            discovered_metadata.author_name or discovered_metadata.title
        )
        user_email = user_email or discovered_metadata.author_email
        user_homepage = user_homepage or _first_nonempty_str(
            discovered_metadata.author_uri, discovered_metadata.link
        )
        user_icon = user_icon or _first_nonempty_str(
            discovered_metadata.logo, discovered_metadata.icon, discovered_metadata.image_url
        )

    # Add user to Git store
    git_store.add_user(
        username=username,
        display_name=user_display_name,
        email=user_email,
        homepage=user_homepage,
        icon=user_icon,
        feeds=[str(validated_feed_url)],
    )

    # Commit changes
    git_store.commit_changes(f"Add user: {username}")

    print_success(f"Added user '{username}' with feed: {feed_url}")

    if discovered_metadata and auto_discover:
        print_info("Auto-discovered metadata:")
        if user_display_name:
            print_info(f"  Display name: {user_display_name}")
        if user_email:
            print_info(f"  Email: {user_email}")
        if user_homepage:
            print_info(f"  Homepage: {user_homepage}")
        if user_icon:
            print_info(f"  Icon: {user_icon}")
4459
4460
def add_feed(username: str, feed_url: Optional[str], config_file: Path) -> None:
    """Attach an additional feed URL to an already-registered user.

    Validates the URL, rejects duplicates, then updates the user record in
    the Git store and commits. Exits with status 1 on any failure.
    """
    if not feed_url:
        print_error("Feed URL is required")
        raise typer.Exit(1)

    # Reject malformed URLs up front.
    try:
        validated_feed_url = HttpUrl(feed_url)
    except ValidationError:
        print_error(f"Invalid feed URL: {feed_url}")
        raise typer.Exit(1) from None

    config = load_config(config_file)
    git_store = GitStore(config.git_store)

    user = git_store.get_user(username)
    if user is None:
        print_error(f"User '{username}' not found")
        print_error("Use 'thicket add user' to add a new user")
        raise typer.Exit(1)

    feed_as_str = str(validated_feed_url)
    if feed_as_str in user.feeds:
        print_error(f"Feed already exists for user '{username}': {feed_url}")
        raise typer.Exit(1)

    # Append the new feed and persist; commit only on success.
    if not git_store.update_user(username, feeds=[*user.feeds, feed_as_str]):
        print_error(f"Failed to add feed to user '{username}'")
        raise typer.Exit(1)

    git_store.commit_changes(f"Add feed to user {username}: {feed_url}")
    print_success(f"Added feed to user '{username}': {feed_url}")
4501
4502
async def discover_feed_metadata(feed_url: HttpUrl):
    """Fetch *feed_url* and return its parsed feed metadata.

    Best-effort: on any fetch/parse failure the error is printed and None
    is returned so callers can continue without discovered metadata.
    """
    try:
        with create_progress() as progress:
            task_id = progress.add_task("Discovering feed metadata...", total=None)

            feed_parser = FeedParser()
            raw_content = await feed_parser.fetch_feed(feed_url)
            feed_meta, _entries = feed_parser.parse_feed(raw_content, feed_url)

            progress.update(task_id, completed=True)
            return feed_meta

    except Exception as e:
        print_error(f"Failed to discover feed metadata: {e}")
        return None
4519</file>
4520
4521<file path="src/thicket/cli/commands/duplicates.py">
4522"""Duplicates command for thicket."""
4523
4524from pathlib import Path
4525from typing import Optional
4526
4527import typer
4528from rich.table import Table
4529
4530from ...core.git_store import GitStore
4531from ..main import app
4532from ..utils import (
4533 console,
4534 load_config,
4535 print_error,
4536 print_info,
4537 print_success,
4538 get_tsv_mode,
4539)
4540
4541
@app.command("duplicates")
def duplicates_command(
    action: str = typer.Argument(..., help="Action: 'list', 'add', 'remove'"),
    duplicate_id: Optional[str] = typer.Argument(None, help="Duplicate entry ID"),
    canonical_id: Optional[str] = typer.Argument(None, help="Canonical entry ID"),
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"), "--config", help="Configuration file path"
    ),
) -> None:
    """Manage duplicate entry mappings.

    Supported actions: 'list' (show all mappings), 'add' (map a duplicate
    entry ID to its canonical ID), 'remove' (delete a mapping).
    """
    config = load_config(config_file)
    git_store = GitStore(config.git_store)

    if action == "list":
        list_duplicates(git_store)
        return
    if action == "add":
        add_duplicate(git_store, duplicate_id, canonical_id)
        return
    if action == "remove":
        remove_duplicate(git_store, duplicate_id)
        return

    print_error(f"Unknown action: {action}")
    print_error("Use 'list', 'add', or 'remove'")
    raise typer.Exit(1)
4569
4570
def list_duplicates(git_store: GitStore) -> None:
    """Print every duplicate -> canonical mapping, as TSV or a rich table."""
    mappings = git_store.get_duplicates().duplicates
    tsv = get_tsv_mode()

    if not mappings:
        if tsv:
            print("No duplicate mappings found")
        else:
            print_info("No duplicate mappings found")
        return

    if tsv:
        # Machine-readable tab-separated output.
        print("Duplicate ID\tCanonical ID")
        for dup_id, canon_id in mappings.items():
            print(f"{dup_id}\t{canon_id}")
        print(f"Total duplicates: {len(mappings)}")
        return

    # Human-readable rich table.
    table = Table(title="Duplicate Entry Mappings")
    table.add_column("Duplicate ID", style="red")
    table.add_column("Canonical ID", style="green")
    for dup_id, canon_id in mappings.items():
        table.add_row(dup_id, canon_id)
    console.print(table)
    print_info(f"Total duplicates: {len(mappings)}")
4597
4598
def add_duplicate(git_store: GitStore, duplicate_id: Optional[str], canonical_id: Optional[str]) -> None:
    """Record that *duplicate_id* is a duplicate of *canonical_id* and commit."""
    if not duplicate_id:
        print_error("Duplicate ID is required")
        raise typer.Exit(1)
    if not canonical_id:
        print_error("Canonical ID is required")
        raise typer.Exit(1)

    # Refuse to silently overwrite an existing mapping.
    mappings = git_store.get_duplicates()
    if mappings.is_duplicate(duplicate_id):
        print_error(f"Duplicate ID already mapped to: {mappings.get_canonical(duplicate_id)}")
        print_error("Use 'remove' first to change the mapping")
        raise typer.Exit(1)

    # A self-mapping would be meaningless (and could loop on resolution).
    if duplicate_id == canonical_id:
        print_error("Duplicate ID cannot be the same as canonical ID")
        raise typer.Exit(1)

    git_store.add_duplicate(duplicate_id, canonical_id)
    git_store.commit_changes(f"Add duplicate mapping: {duplicate_id} -> {canonical_id}")
    print_success(f"Added duplicate mapping: {duplicate_id} -> {canonical_id}")
4629
4630
def remove_duplicate(git_store: GitStore, duplicate_id: Optional[str]) -> None:
    """Delete the mapping for *duplicate_id* and commit the change."""
    if not duplicate_id:
        print_error("Duplicate ID is required")
        raise typer.Exit(1)

    mappings = git_store.get_duplicates()
    if not mappings.is_duplicate(duplicate_id):
        print_error(f"No duplicate mapping found for: {duplicate_id}")
        raise typer.Exit(1)

    # Capture the canonical target before removal, for the commit message.
    canonical_id = mappings.get_canonical(duplicate_id)

    if not git_store.remove_duplicate(duplicate_id):
        print_error(f"Failed to remove duplicate mapping: {duplicate_id}")
        raise typer.Exit(1)

    git_store.commit_changes(f"Remove duplicate mapping: {duplicate_id} -> {canonical_id}")
    print_success(f"Removed duplicate mapping: {duplicate_id} -> {canonical_id}")
4653</file>
4654
4655<file path="src/thicket/cli/commands/sync.py">
4656"""Sync command for thicket."""
4657
4658import asyncio
4659from pathlib import Path
4660from typing import Optional
4661
4662import typer
4663from rich.progress import track
4664
4665from ...core.feed_parser import FeedParser
4666from ...core.git_store import GitStore
4667from ..main import app
4668from ..utils import (
4669 load_config,
4670 print_error,
4671 print_info,
4672 print_success,
4673)
4674
4675
@app.command()
def sync(
    all_users: bool = typer.Option(
        False, "--all", "-a", help="Sync all users and feeds"
    ),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Sync specific user only"
    ),
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"), "--config", help="Configuration file path"
    ),
    dry_run: bool = typer.Option(
        False, "--dry-run", help="Show what would be synced without making changes"
    ),
) -> None:
    """Sync feeds and store entries in Git repository.

    Requires either --all or --user. Per-feed failures are reported and
    skipped; a single commit is made at the end unless --dry-run is set.
    """
    config = load_config(config_file)
    git_store = GitStore(config.git_store)

    # Resolve which users to process from the git index.
    # NOTE(review): uses the private GitStore._load_index(), mirroring
    # existing callers — confirm whether a public accessor exists.
    if all_users:
        users_to_sync = list(git_store._load_index().users.values())
    elif user:
        selected = git_store.get_user(user)
        if selected is None:
            print_error(f"User '{user}' not found in git repository")
            raise typer.Exit(1)
        users_to_sync = [selected]
    else:
        print_error("Specify --all to sync all users or --user to sync a specific user")
        raise typer.Exit(1)

    if not users_to_sync:
        print_info("No users configured to sync")
        return

    total_new = 0
    total_updated = 0

    for meta in users_to_sync:
        print_info(f"Syncing user: {meta.username}")

        new_count = 0
        updated_count = 0

        # Each feed is synced independently; a failure skips that feed only.
        for feed_url in track(meta.feeds, description=f"Syncing {meta.username}'s feeds"):
            try:
                added, changed = asyncio.run(
                    sync_feed(git_store, meta.username, feed_url, dry_run)
                )
            except Exception as e:
                print_error(f"Failed to sync feed {feed_url}: {e}")
                continue
            new_count += added
            updated_count += changed

        print_info(f"User {meta.username}: {new_count} new, {updated_count} updated")
        total_new += new_count
        total_updated += updated_count

    # One commit covering the whole run, skipped for dry runs or no-ops.
    if not dry_run and (total_new or total_updated):
        commit_message = f"Sync feeds: {total_new} new entries, {total_updated} updated"
        git_store.commit_changes(commit_message)
        print_success(f"Committed changes: {commit_message}")

    if dry_run:
        print_info(f"Dry run complete: would sync {total_new} new entries, {total_updated} updated")
    else:
        print_success(f"Sync complete: {total_new} new entries, {total_updated} updated")
4756
4757
async def sync_feed(git_store: GitStore, username: str, feed_url, dry_run: bool) -> tuple[int, int]:
    """Fetch one feed and store its new/changed entries for *username*.

    Returns (new_entries, updated_entries). On fetch/parse failure the
    error is printed and (0, 0) is returned so the caller's loop continues.
    In dry-run mode entries are counted but never written.
    """
    parser = FeedParser()

    try:
        content = await parser.fetch_feed(feed_url)
        _metadata, entries = parser.parse_feed(content, feed_url)
    except Exception as e:
        print_error(f"Failed to sync feed {feed_url}: {e}")
        return 0, 0

    new_count = 0
    updated_count = 0

    for entry in entries:
        try:
            existing = git_store.get_entry(username, entry.id)
            if existing is None:
                # Entry not seen before: store it.
                if not dry_run:
                    git_store.store_entry(username, entry)
                new_count += 1
            elif existing.updated != entry.updated:
                # Entry exists but its 'updated' timestamp changed: re-store.
                if not dry_run:
                    git_store.store_entry(username, entry)
                updated_count += 1
        except Exception as e:
            # A single bad entry must not abort the rest of the feed.
            print_error(f"Failed to process entry {entry.id}: {e}")
            continue

    return new_count, updated_count
4798</file>
4799
4800<file path="src/thicket/models/config.py">
4801"""Configuration models for thicket."""
4802
4803from pathlib import Path
4804from typing import Optional
4805
4806from pydantic import BaseModel, EmailStr, HttpUrl
4807from pydantic_settings import BaseSettings, SettingsConfigDict
4808
4809
class UserConfig(BaseModel):
    """Configuration for a single user and their feeds.

    One entry per user in the thicket configuration; pydantic validates
    the URL and email fields on load.
    """

    # Identifier for the user (used as the key throughout the CLI).
    username: str
    # One or more Atom/RSS feed URLs to sync for this user.
    feeds: list[HttpUrl]
    # Optional contact/presentation metadata; all default to None.
    email: Optional[EmailStr] = None
    homepage: Optional[HttpUrl] = None
    icon: Optional[HttpUrl] = None
    display_name: Optional[str] = None
4819
4820
class ThicketConfig(BaseSettings):
    """Main configuration for thicket.

    Settings may come from environment variables prefixed ``THICKET_`` or a
    ``.env`` file, per the pydantic-settings configuration below.
    """

    model_config = SettingsConfigDict(
        env_prefix="THICKET_",
        env_file=".env",
        # NOTE(review): pydantic-settings only honours yaml_file when a YAML
        # settings source is wired up via settings_customise_sources — confirm
        # that happens elsewhere (e.g. in the CLI's load_config helper).
        yaml_file="thicket.yaml",
        case_sensitive=False,
    )

    # Path to the Git repository used as the persistent data store.
    git_store: Path
    # Directory for cached feed downloads and related temporary state.
    cache_dir: Path
    # Configured users and their feeds; empty by default.
    users: list[UserConfig] = []
4834</file>
4835
4836<file path="src/thicket/cli/commands/links_cmd.py">
4837"""CLI command for extracting and categorizing all outbound links from blog entries."""
4838
4839import json
4840import re
4841from pathlib import Path
4842from typing import Dict, List, Optional, Set
4843from urllib.parse import urljoin, urlparse
4844
4845import typer
4846from rich.console import Console
4847from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
4848from rich.table import Table
4849
4850from ...core.git_store import GitStore
4851from ..main import app
4852from ..utils import load_config, get_tsv_mode
4853
4854console = Console()
4855
4856
class LinkData:
    """A single outbound link discovered in one user's blog entry."""

    def __init__(self, url: str, entry_id: str, username: str):
        self.url = url
        self.entry_id = entry_id
        self.username = username

    def to_dict(self) -> dict:
        """Serialize to a plain dict suitable for JSON output."""
        return {"url": self.url, "entry_id": self.entry_id, "username": self.username}

    @classmethod
    def from_dict(cls, data: dict) -> "LinkData":
        """Inverse of ``to_dict``: rebuild a LinkData from a plain dict."""
        return cls(data["url"], data["entry_id"], data["username"])
4881
4882
class LinkCategorizer:
    """Classifies outbound URLs as 'internal', 'user', or 'unknown' links."""

    def __init__(self, user_domains: Dict[str, Set[str]]):
        self.user_domains = user_domains
        # Invert the mapping so a domain can be resolved to its owning user.
        self.domain_to_user = {
            domain: username
            for username, domains in user_domains.items()
            for domain in domains
        }

    def categorize_url(self, url: str, source_username: str) -> tuple[str, Optional[str]]:
        """Return (category, target_username) for *url*.

        'internal' = same user's own domain, 'user' = another tracked
        user's domain, 'unknown' = anything else or an unparseable URL.
        """
        try:
            host = urlparse(url).netloc.lower()

            # Same user's own domain counts as internal.
            if host in self.user_domains.get(source_username, set()):
                return "internal", source_username

            # Another tracked user's domain.
            owner = self.domain_to_user.get(host)
            if owner is not None:
                return "user", owner

            return "unknown", None
        except Exception:
            # Malformed URLs are treated as unknown rather than raising.
            return "unknown", None
4916
4917
class LinkExtractor:
    """Extracts and resolves links from blog entries.

    Fixes over the previous version:
    - href attributes quoted with single quotes are now matched as well as
      double-quoted ones;
    - same-page anchors (href="#...") are skipped *before* URL resolution —
      previously the '#' check ran after urljoin() had already produced an
      absolute URL, so it never matched.
    """

    def __init__(self):
        # (["']) captures the quote style; the backreference \1 closes it.
        # Group 2 is the href value, group 3 the anchor's inner HTML.
        self.link_pattern = re.compile(
            r'<a[^>]+href=(["\'])(.*?)\1[^>]*>(.*?)</a>',
            re.IGNORECASE | re.DOTALL,
        )
        # Bare-URL matcher, kept for interface compatibility.
        self.url_pattern = re.compile(r'https?://[^\s<>"]+')

    def extract_links_from_html(self, html_content: str, base_url: str) -> List[tuple[str, str]]:
        """Extract (resolved_url, link_text) pairs from HTML content.

        Relative hrefs are resolved against *base_url*; empty hrefs and
        fragment-only same-page anchors are skipped.
        """
        links = []

        for match in self.link_pattern.finditer(html_content):
            href = match.group(2)
            # Skip empty hrefs and same-page anchors before resolution.
            if not href or href.startswith('#'):
                continue

            # Strip any markup inside the anchor to get plain link text.
            text = re.sub(r'<[^>]+>', '', match.group(3)).strip()

            links.append((urljoin(base_url, href), text))

        return links

    # String annotation: LinkData is defined later in this module, so an
    # eager annotation would fail at class-definition time.
    def extract_links_from_entry(self, entry, username: str, base_url: str) -> "List[LinkData]":
        """Extract all outbound links from a blog entry's content and summary.

        Duplicate URLs within the same piece of content are NOT collapsed
        here (callers dedupe per entry); link text is discarded.
        """
        links = []

        # Search both full content and summary when present.
        content_to_search = []
        if entry.content:
            content_to_search.append(entry.content)
        if entry.summary:
            content_to_search.append(entry.summary)

        for content in content_to_search:
            for url, _link_text in self.extract_links_from_html(content, base_url):
                if not url:
                    continue
                links.append(LinkData(url=url, entry_id=entry.id, username=username))

        return links
4970
4971
4972@app.command()
4973def links(
4974 config_file: Optional[Path] = typer.Option(
4975 Path("thicket.yaml"),
4976 "--config",
4977 "-c",
4978 help="Path to configuration file",
4979 ),
4980 output_file: Optional[Path] = typer.Option(
4981 None,
4982 "--output",
4983 "-o",
4984 help="Path to output unified links file (default: links.json in git store)",
4985 ),
4986 verbose: bool = typer.Option(
4987 False,
4988 "--verbose",
4989 "-v",
4990 help="Show detailed progress information",
4991 ),
4992) -> None:
4993 """Extract and categorize all outbound links from blog entries.
4994
4995 This command analyzes all blog entries to extract outbound links,
4996 resolve them properly with respect to the feed's base URL, and
4997 categorize them as internal, user, or unknown links.
4998
4999 Creates a unified links.json file containing all link data.
5000 """
5001 try:
5002 # Load configuration
5003 config = load_config(config_file)
5004
5005 # Initialize Git store
5006 git_store = GitStore(config.git_store)
5007
5008 # Build user domain mapping
5009 if verbose:
5010 console.print("Building user domain mapping...")
5011
5012 index = git_store._load_index()
5013 user_domains = {}
5014
5015 for username, user_metadata in index.users.items():
5016 domains = set()
5017
5018 # Add domains from feeds
5019 for feed_url in user_metadata.feeds:
5020 domain = urlparse(feed_url).netloc.lower()
5021 if domain:
5022 domains.add(domain)
5023
5024 # Add domain from homepage
5025 if user_metadata.homepage:
5026 domain = urlparse(str(user_metadata.homepage)).netloc.lower()
5027 if domain:
5028 domains.add(domain)
5029
5030 user_domains[username] = domains
5031
5032 if verbose:
5033 console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")
5034
5035 # Initialize components
5036 link_extractor = LinkExtractor()
5037 categorizer = LinkCategorizer(user_domains)
5038
5039 # Get all users
5040 users = list(index.users.keys())
5041
5042 if not users:
5043 console.print("[yellow]No users found in Git store[/yellow]")
5044 raise typer.Exit(0)
5045
5046 # Process all entries
5047 all_links = []
5048 link_categories = {"internal": [], "user": [], "unknown": []}
5049 link_dict = {} # Dictionary with link URL as key, maps to list of atom IDs
5050 reverse_dict = {} # Dictionary with atom ID as key, maps to list of URLs
5051
5052 with Progress(
5053 SpinnerColumn(),
5054 TextColumn("[progress.description]{task.description}"),
5055 BarColumn(),
5056 TaskProgressColumn(),
5057 console=console,
5058 ) as progress:
5059
5060 # Count total entries first
5061 counting_task = progress.add_task("Counting entries...", total=len(users))
5062 total_entries = 0
5063
5064 for username in users:
5065 entries = git_store.list_entries(username)
5066 total_entries += len(entries)
5067 progress.advance(counting_task)
5068
5069 progress.remove_task(counting_task)
5070
5071 # Process entries
5072 processing_task = progress.add_task(
5073 f"Processing {total_entries} entries...",
5074 total=total_entries
5075 )
5076
5077 for username in users:
5078 entries = git_store.list_entries(username)
5079 user_metadata = index.users[username]
5080
5081 # Get base URL for this user (use first feed URL)
5082 base_url = str(user_metadata.feeds[0]) if user_metadata.feeds else "https://example.com"
5083
5084 for entry in entries:
5085 # Extract links from this entry
5086 entry_links = link_extractor.extract_links_from_entry(entry, username, base_url)
5087
5088 # Track unique links per entry
5089 entry_urls_seen = set()
5090
5091 # Categorize each link
5092 for link_data in entry_links:
5093 # Skip if we've already seen this URL in this entry
5094 if link_data.url in entry_urls_seen:
5095 continue
5096 entry_urls_seen.add(link_data.url)
5097
5098 category, target_username = categorizer.categorize_url(link_data.url, username)
5099
5100 # Add to link dictionary (URL as key, maps to list of atom IDs)
5101 if link_data.url not in link_dict:
5102 link_dict[link_data.url] = []
5103 if link_data.entry_id not in link_dict[link_data.url]:
5104 link_dict[link_data.url].append(link_data.entry_id)
5105
5106 # Also add to reverse mapping (atom ID -> list of URLs)
5107 if link_data.entry_id not in reverse_dict:
5108 reverse_dict[link_data.entry_id] = []
5109 if link_data.url not in reverse_dict[link_data.entry_id]:
5110 reverse_dict[link_data.entry_id].append(link_data.url)
5111
5112 # Add category info to link data for categories tracking
5113 link_info = link_data.to_dict()
5114 link_info["category"] = category
5115 link_info["target_username"] = target_username
5116
5117 all_links.append(link_info)
5118 link_categories[category].append(link_info)
5119
5120 progress.advance(processing_task)
5121
5122 if verbose and entry_links:
5123 console.print(f" Found {len(entry_links)} links in {username}:{entry.title[:50]}...")
5124
5125 # Determine output path
5126 if output_file:
5127 output_path = output_file
5128 else:
5129 output_path = config.git_store / "links.json"
5130
5131 # Save all extracted links (not just filtered ones)
5132 if verbose:
5133 console.print("Preparing output data...")
5134
5135 # Build a set of all URLs that correspond to posts in the git database
5136 registered_urls = set()
5137
5138 # Get all entries from all users and build URL mappings
5139 for username in users:
5140 entries = git_store.list_entries(username)
5141 user_metadata = index.users[username]
5142
5143 for entry in entries:
5144 # Try to match entry URLs with extracted links
5145 if hasattr(entry, 'link') and entry.link:
5146 registered_urls.add(str(entry.link))
5147
5148 # Also check entry alternate links if they exist
5149 if hasattr(entry, 'links') and entry.links:
5150 for link in entry.links:
5151 if hasattr(link, 'href') and link.href:
5152 registered_urls.add(str(link.href))
5153
5154 # Build unified structure with metadata
5155 unified_links = {}
5156 reverse_mapping = {}
5157
5158 for url, entry_ids in link_dict.items():
5159 unified_links[url] = {
5160 "referencing_entries": entry_ids
5161 }
5162
5163 # Find target username if this is a tracked post
5164 if url in registered_urls:
5165 for username in users:
5166 user_domains_set = {domain for domain in user_domains.get(username, [])}
5167 if any(domain in url for domain in user_domains_set):
5168 unified_links[url]["target_username"] = username
5169 break
5170
5171 # Build reverse mapping
5172 for entry_id in entry_ids:
5173 if entry_id not in reverse_mapping:
5174 reverse_mapping[entry_id] = []
5175 if url not in reverse_mapping[entry_id]:
5176 reverse_mapping[entry_id].append(url)
5177
5178 # Create unified output data
5179 output_data = {
5180 "links": unified_links,
5181 "reverse_mapping": reverse_mapping,
5182 "user_domains": {k: list(v) for k, v in user_domains.items()}
5183 }
5184
5185 if verbose:
5186 console.print(f"Found {len(registered_urls)} registered post URLs")
5187 console.print(f"Found {len(link_dict)} total links, {sum(1 for link in unified_links.values() if 'target_username' in link)} tracked posts")
5188
5189 # Save unified data
5190 with open(output_path, "w") as f:
5191 json.dump(output_data, f, indent=2, default=str)
5192
5193 # Show summary
5194 if not get_tsv_mode():
5195 console.print("\n[green]✓ Links extraction completed successfully[/green]")
5196
5197 # Create summary table or TSV output
5198 if get_tsv_mode():
5199 print("Category\tCount\tDescription")
5200 print(f"Internal\t{len(link_categories['internal'])}\tLinks to same user's domain")
5201 print(f"User\t{len(link_categories['user'])}\tLinks to other tracked users")
5202 print(f"Unknown\t{len(link_categories['unknown'])}\tLinks to external sites")
5203 print(f"Total Extracted\t{len(all_links)}\tAll extracted links")
5204 print(f"Saved to Output\t{len(output_data['links'])}\tLinks saved to output file")
5205 print(f"Cross-references\t{sum(1 for link in unified_links.values() if 'target_username' in link)}\tLinks to registered posts only")
5206 else:
5207 table = Table(title="Links Summary")
5208 table.add_column("Category", style="cyan")
5209 table.add_column("Count", style="green")
5210 table.add_column("Description", style="white")
5211
5212 table.add_row("Internal", str(len(link_categories["internal"])), "Links to same user's domain")
5213 table.add_row("User", str(len(link_categories["user"])), "Links to other tracked users")
5214 table.add_row("Unknown", str(len(link_categories["unknown"])), "Links to external sites")
5215 table.add_row("Total Extracted", str(len(all_links)), "All extracted links")
5216 table.add_row("Saved to Output", str(len(output_data['links'])), "Links saved to output file")
5217 table.add_row("Cross-references", str(sum(1 for link in unified_links.values() if 'target_username' in link)), "Links to registered posts only")
5218
5219 console.print(table)
5220
5221 # Show user links if verbose
5222 if verbose and link_categories["user"]:
5223 if get_tsv_mode():
5224 print("User Link Source\tUser Link Target\tLink Count")
5225 user_link_counts = {}
5226
5227 for link in link_categories["user"]:
5228 key = f"{link['username']} -> {link['target_username']}"
5229 user_link_counts[key] = user_link_counts.get(key, 0) + 1
5230
5231 for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
5232 source, target = link_pair.split(" -> ")
5233 print(f"{source}\t{target}\t{count}")
5234 else:
5235 console.print("\n[bold]User-to-user links:[/bold]")
5236 user_link_counts = {}
5237
5238 for link in link_categories["user"]:
5239 key = f"{link['username']} -> {link['target_username']}"
5240 user_link_counts[key] = user_link_counts.get(key, 0) + 1
5241
5242 for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
5243 console.print(f" {link_pair}: {count} links")
5244
5245 if not get_tsv_mode():
5246 console.print(f"\nUnified links data saved to: {output_path}")
5247
5248 except Exception as e:
5249 console.print(f"[red]Error extracting links: {e}[/red]")
5250 if verbose:
5251 console.print_exception()
5252 raise typer.Exit(1)
5253</file>
5254
5255<file path="src/thicket/cli/commands/list_cmd.py">
5256"""List command for thicket."""
5257
5258import re
5259from pathlib import Path
5260from typing import Optional
5261
5262import typer
5263from rich.table import Table
5264
5265from ...core.git_store import GitStore
5266from ..main import app
5267from ..utils import (
5268 console,
5269 load_config,
5270 print_error,
5271 print_feeds_table,
5272 print_feeds_table_from_git,
5273 print_info,
5274 print_users_table,
5275 print_users_table_from_git,
5276 print_entries_tsv,
5277 get_tsv_mode,
5278)
5279
5280
@app.command("list")
def list_command(
    what: str = typer.Argument(..., help="What to list: 'users', 'feeds', 'entries'"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Filter by specific user"
    ),
    limit: Optional[int] = typer.Option(
        None, "--limit", "-l", help="Limit number of results"
    ),
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"), "--config", help="Configuration file path"
    ),
) -> None:
    """List users, feeds, or entries."""
    # Resolve configuration and open the backing Git store.
    config = load_config(config_file)
    git_store = GitStore(config.git_store)

    # Dispatch table: listing type -> handler closure.
    handlers = {
        "users": lambda: list_users(git_store),
        "feeds": lambda: list_feeds(git_store, user),
        "entries": lambda: list_entries(git_store, user, limit),
    }

    handler = handlers.get(what)
    if handler is None:
        print_error(f"Unknown list type: {what}")
        print_error("Use 'users', 'feeds', or 'entries'")
        raise typer.Exit(1)
    handler()
5312
5313
def list_users(git_store: GitStore) -> None:
    """Print a table of every user registered in the Git store."""
    all_users = list(git_store._load_index().users.values())

    if all_users:
        print_users_table_from_git(all_users)
    else:
        print_info("No users configured")
5324
5325
def list_feeds(git_store: GitStore, username: Optional[str] = None) -> None:
    """Print the feeds table, optionally narrowed to one user.

    Exits with an error when the named user does not exist.
    """
    if username:
        # Validate the filter before rendering anything.
        target = git_store.get_user(username)
        if not target:
            print_error(f"User '{username}' not found")
            raise typer.Exit(1)
        if not target.feeds:
            print_info(f"No feeds configured for user '{username}'")
            return

    print_feeds_table_from_git(git_store, username)
5339
5340
def list_entries(git_store: GitStore, username: Optional[str] = None, limit: Optional[int] = None) -> None:
    """List entries, optionally filtered by user.

    When no username is given, entries for every known user are gathered
    (the limit applies per user) and rendered together.
    """
    if username:
        # Single-user path: validate the user, then render their entries.
        if not git_store.get_user(username):
            print_error(f"User '{username}' not found")
            raise typer.Exit(1)

        found = git_store.list_entries(username, limit)
        if not found:
            print_info(f"No entries found for user '{username}'")
            return

        print_entries_table([found], [username])
        return

    # All-users path: collect parallel lists of entry batches and usernames.
    per_user: list[list] = []
    names: list[str] = []
    for record in git_store._load_index().users.values():
        found = git_store.list_entries(record.username, limit)
        if found:
            per_user.append(found)
            names.append(record.username)

    if per_user:
        print_entries_table(per_user, names)
    else:
        print_info("No entries found")
5375
5376
5377def _clean_html_content(content: Optional[str]) -> str:
5378 """Clean HTML content for display in table."""
5379 if not content:
5380 return ""
5381
5382 # Remove HTML tags
5383 clean_text = re.sub(r'<[^>]+>', ' ', content)
5384 # Replace multiple whitespace with single space
5385 clean_text = re.sub(r'\s+', ' ', clean_text)
5386 # Strip and limit length
5387 clean_text = clean_text.strip()
5388 if len(clean_text) > 100:
5389 clean_text = clean_text[:97] + "..."
5390
5391 return clean_text
5392
5393
def print_entries_table(entries_by_user: list[list], usernames: list[str]) -> None:
    """Render entries as a Rich table, or as TSV when --tsv is active."""
    if get_tsv_mode():
        print_entries_tsv(entries_by_user, usernames)
        return

    table = Table(title="Feed Entries")
    table.add_column("User", style="cyan", no_wrap=True)
    table.add_column("Title", style="bold")
    table.add_column("Updated", style="blue")
    table.add_column("URL", style="green")

    # Flatten the per-user batches into (username, entry) pairs.
    flattened = [
        (name, entry)
        for name, batch in zip(usernames, entries_by_user)
        for entry in batch
    ]
    # Newest entries first, across all users.
    flattened.sort(key=lambda pair: pair[1].updated, reverse=True)

    for name, entry in flattened:
        # Cap titles at 50 characters (47 + ellipsis).
        headline = entry.title
        if len(headline) > 50:
            headline = headline[:47] + "..."

        table.add_row(
            name,
            headline,
            entry.updated.strftime("%Y-%m-%d %H:%M"),
            str(entry.link),
        )

    console.print(table)
5432</file>
5433
5434<file path="src/thicket/cli/main.py">
5435"""Main CLI application using Typer."""
5436
5437import typer
5438from rich.console import Console
5439
5440from .. import __version__
5441
# Root Typer application. Command modules register themselves against this
# object via @app.command when they are imported at the bottom of this file.
app = typer.Typer(
    name="thicket",
    help="A CLI tool for persisting Atom/RSS feeds in Git repositories",
    no_args_is_help=True,
    rich_markup_mode="rich",
)

# Shared Rich console for terminal output from this module.
console = Console()

# Global state for TSV output mode
# Set by the --tsv option in main(); presumably read back through
# cli.utils.get_tsv_mode() — confirm in utils.py.
tsv_mode = False
5453
5454
def version_callback(value: bool) -> None:
    """Show version and exit."""
    if not value:
        return
    # --version was passed: print and stop option processing entirely.
    console.print(f"thicket version {__version__}")
    raise typer.Exit()
5460
5461
@app.callback()
def main(
    version: bool = typer.Option(
        None,
        "--version",
        "-v",
        help="Show the version and exit",
        # is_eager makes --version run before any subcommand logic.
        callback=version_callback,
        is_eager=True,
    ),
    tsv: bool = typer.Option(
        False,
        "--tsv",
        help="Output in tab-separated values format without truncation",
    ),
) -> None:
    """Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories."""
    # Stash the --tsv flag in module-level state so command modules can
    # query it without threading the flag through every function call.
    global tsv_mode
    tsv_mode = tsv
5481
5482
# Import commands to register them: each module calls @app.command at import
# time, so these names are intentionally "unused" here.
from .commands import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync

if __name__ == "__main__":
    app()
5488</file>
5489
5490<file path="src/thicket/core/git_store.py">
5491"""Git repository operations for thicket."""
5492
5493import json
5494from datetime import datetime
5495from pathlib import Path
5496from typing import Optional
5497
5498import git
5499from git import Repo
5500
5501from ..models import AtomEntry, DuplicateMap, GitStoreIndex, UserMetadata
5502
5503
class GitStore:
    """Manages the Git repository for storing feed entries.

    Repository layout: ``index.json`` (user index) and ``duplicates.json``
    (duplicate-entry map) at the root, plus one directory per user holding
    one JSON file per feed entry.
    """

    def __init__(self, repo_path: Path):
        """Initialize the Git store rooted at *repo_path*.

        Creates and initializes the repository on first use.
        """
        self.repo_path = repo_path
        self.repo: Optional[Repo] = None  # populated by _ensure_repo()
        self._ensure_repo()

    def _ensure_repo(self) -> None:
        """Ensure the Git repository exists and is initialized."""
        if not self.repo_path.exists():
            self.repo_path.mkdir(parents=True, exist_ok=True)

        try:
            self.repo = Repo(self.repo_path)
        except git.InvalidGitRepositoryError:
            # Initialize new repository
            self.repo = Repo.init(self.repo_path)
            self._create_initial_structure()

    def _create_initial_structure(self) -> None:
        """Create initial Git store structure (index, duplicates, first commit)."""
        # Create index.json
        index = GitStoreIndex(
            created=datetime.now(),
            last_updated=datetime.now(),
        )
        self._save_index(index)

        # Create duplicates.json
        duplicates = DuplicateMap()
        self._save_duplicates(duplicates)

        # Create initial commit
        self.repo.index.add(["index.json", "duplicates.json"])
        self.repo.index.commit("Initial thicket repository structure")

    def _save_index(self, index: GitStoreIndex) -> None:
        """Save the index to index.json."""
        index_path = self.repo_path / "index.json"
        with open(index_path, "w") as f:
            # default=str is a fallback for values model_dump leaves non-serializable.
            json.dump(index.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)

    def _load_index(self) -> GitStoreIndex:
        """Load the index from index.json, or return a fresh one if missing."""
        index_path = self.repo_path / "index.json"
        if not index_path.exists():
            # NOTE: the fresh index is not persisted here; callers save it on change.
            return GitStoreIndex(
                created=datetime.now(),
                last_updated=datetime.now(),
            )

        with open(index_path) as f:
            data = json.load(f)

        return GitStoreIndex(**data)

    def _save_duplicates(self, duplicates: DuplicateMap) -> None:
        """Save duplicates map to duplicates.json."""
        duplicates_path = self.repo_path / "duplicates.json"
        with open(duplicates_path, "w") as f:
            json.dump(duplicates.model_dump(exclude_none=True), f, indent=2)

    def _load_duplicates(self) -> DuplicateMap:
        """Load duplicates map from duplicates.json, or an empty map if missing."""
        duplicates_path = self.repo_path / "duplicates.json"
        if not duplicates_path.exists():
            return DuplicateMap()

        with open(duplicates_path) as f:
            data = json.load(f)

        return DuplicateMap(**data)

    def add_user(self, username: str, display_name: Optional[str] = None,
                 email: Optional[str] = None, homepage: Optional[str] = None,
                 icon: Optional[str] = None, feeds: Optional[list[str]] = None) -> UserMetadata:
        """Add a new user to the Git store.

        Creates the user's directory and records the metadata in the index.
        Returns the new UserMetadata. Changes are not committed here.
        """
        index = self._load_index()

        # Create user directory
        user_dir = self.repo_path / username
        user_dir.mkdir(exist_ok=True)

        # Create user metadata
        user_metadata = UserMetadata(
            username=username,
            display_name=display_name,
            email=email,
            homepage=homepage,
            icon=icon,
            feeds=feeds or [],
            directory=username,
            created=datetime.now(),
            last_updated=datetime.now(),
        )


        # Update index
        index.add_user(user_metadata)
        self._save_index(index)

        return user_metadata

    def get_user(self, username: str) -> Optional[UserMetadata]:
        """Get user metadata by username, or None if unknown."""
        index = self._load_index()
        return index.get_user(username)

    def update_user(self, username: str, **kwargs) -> bool:
        """Update user metadata.

        Only attributes present on UserMetadata with non-None values are
        applied. Returns False when the user does not exist.
        """
        index = self._load_index()
        user = index.get_user(username)

        if not user:
            return False

        # Update user metadata
        for key, value in kwargs.items():
            if hasattr(user, key) and value is not None:
                setattr(user, key, value)

        user.update_timestamp()


        # Update index
        index.add_user(user)
        self._save_index(index)

        return True

    def store_entry(self, username: str, entry: AtomEntry) -> bool:
        """Store an entry in the user's directory.

        Overwrites an existing file for the same entry ID; the index entry
        count is only bumped for brand-new entries. Returns False when the
        user does not exist.
        """
        user = self.get_user(username)
        if not user:
            return False

        # Sanitize entry ID for filename
        from .feed_parser import FeedParser
        parser = FeedParser()
        safe_id = parser.sanitize_entry_id(entry.id)

        # Create entry file
        user_dir = self.repo_path / user.directory
        entry_path = user_dir / f"{safe_id}.json"

        # Check if entry already exists
        entry_exists = entry_path.exists()

        # Save entry
        with open(entry_path, "w") as f:
            json.dump(entry.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)

        # Update user metadata if new entry
        if not entry_exists:
            index = self._load_index()
            index.update_entry_count(username, 1)
            self._save_index(index)

        return True

    def get_entry(self, username: str, entry_id: str) -> Optional[AtomEntry]:
        """Get an entry by username and (unsanitized) entry ID."""
        user = self.get_user(username)
        if not user:
            return None

        # Sanitize entry ID
        from .feed_parser import FeedParser
        parser = FeedParser()
        safe_id = parser.sanitize_entry_id(entry_id)

        entry_path = self.repo_path / user.directory / f"{safe_id}.json"
        if not entry_path.exists():
            return None

        with open(entry_path) as f:
            data = json.load(f)

        return AtomEntry(**data)

    def list_entries(self, username: str, limit: Optional[int] = None) -> list[AtomEntry]:
        """List entries for a user, most recently written first.

        NOTE: ordering is by file modification time on disk, not by the
        entry's own updated timestamp; unparseable files are skipped.
        """
        user = self.get_user(username)
        if not user:
            return []

        user_dir = self.repo_path / user.directory
        if not user_dir.exists():
            return []

        entries = []
        entry_files = sorted(user_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True)


        if limit:
            entry_files = entry_files[:limit]

        for entry_file in entry_files:
            try:
                with open(entry_file) as f:
                    data = json.load(f)
                entries.append(AtomEntry(**data))
            except Exception:
                # Skip invalid entries
                continue

        return entries

    def get_duplicates(self) -> DuplicateMap:
        """Get the duplicates map."""
        return self._load_duplicates()

    def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
        """Add a duplicate mapping and persist it."""
        duplicates = self._load_duplicates()
        duplicates.add_duplicate(duplicate_id, canonical_id)
        self._save_duplicates(duplicates)

    def remove_duplicate(self, duplicate_id: str) -> bool:
        """Remove a duplicate mapping. Returns True if the mapping existed."""
        duplicates = self._load_duplicates()
        result = duplicates.remove_duplicate(duplicate_id)
        self._save_duplicates(duplicates)
        return result

    def commit_changes(self, message: str) -> None:
        """Commit all changes to the Git repository."""
        if not self.repo:
            return

        # Add all changes
        self.repo.git.add(A=True)

        # Check if there are changes to commit
        if self.repo.index.diff("HEAD"):
            self.repo.index.commit(message)

    def get_stats(self) -> dict:
        """Get statistics about the Git store.

        Returns a dict with user/entry/duplicate counts, the last-updated
        timestamp, and the total on-disk size of all files in the store.
        """
        index = self._load_index()
        duplicates = self._load_duplicates()

        return {
            "total_users": len(index.users),
            "total_entries": index.total_entries,
            "total_duplicates": len(duplicates.duplicates),
            "last_updated": index.last_updated,
            "repository_size": sum(f.stat().st_size for f in self.repo_path.rglob("*") if f.is_file()),
        }

    def search_entries(self, query: str, username: Optional[str] = None,
                      limit: Optional[int] = None) -> list[tuple[str, AtomEntry]]:
        """Search entries by content.

        Case-insensitive substring match over title, summary, and content.
        Returns (username, entry) pairs, newest first when fully collected.
        """
        results = []

        # Get users to search
        index = self._load_index()
        users = [index.get_user(username)] if username else list(index.users.values())
        users = [u for u in users if u is not None]

        for user in users:
            user_dir = self.repo_path / user.directory
            if not user_dir.exists():
                continue

            entry_files = user_dir.glob("*.json")

            for entry_file in entry_files:
                try:
                    with open(entry_file) as f:
                        data = json.load(f)

                    entry = AtomEntry(**data)

                    # Simple text search in title, summary, and content
                    searchable_text = " ".join(filter(None, [
                        entry.title,
                        entry.summary or "",
                        entry.content or "",
                    ])).lower()

                    if query.lower() in searchable_text:
                        results.append((user.username, entry))

                    if limit and len(results) >= limit:
                        # NOTE(review): this early return skips the final sort
                        # below, so a limited search is not date-ordered.
                        return results

                except Exception:
                    # Skip invalid entries
                    continue

        # Sort by updated time (newest first)
        results.sort(key=lambda x: x[1].updated, reverse=True)

        return results[:limit] if limit else results
5801</file>
5802
5803<file path="ARCH.md">
5804# Thicket Architecture Design
5805
5806## Overview
Thicket is a modern CLI tool for persisting Atom/RSS feeds in a Git repository, designed to enable distributed weblog comment structures.
5808
5809## Technology Stack
5810
5811### Core Libraries
5812
5813#### CLI Framework
5814- **Typer** (0.15.x) - Modern CLI framework with type hints
5815- **Rich** (13.x) - Beautiful terminal output, progress bars, and tables
5816- **prompt-toolkit** - Interactive prompts when needed
5817
5818#### Feed Processing
5819- **feedparser** (6.0.11) - Universal feed parser supporting RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0
5820 - Alternative: **atoma** for stricter Atom/RSS parsing with JSON feed support
5821 - Alternative: **fastfeedparser** for high-performance parsing (10x faster)
5822
5823#### Git Integration
5824- **GitPython** (3.1.44) - High-level git operations, requires git CLI
5825 - Alternative: **pygit2** (1.18.0) - Direct libgit2 bindings, better for authentication
5826
5827#### HTTP Client
5828- **httpx** (0.28.x) - Modern async/sync HTTP client with connection pooling
5829- **aiohttp** (3.11.x) - For async-only operations if needed
5830
5831#### Configuration & Data Models
5832- **pydantic** (2.11.x) - Data validation and settings management
5833- **pydantic-settings** (2.10.x) - Configuration file handling with env var support
5834
5835#### Utilities
5836- **pendulum** (3.x) - Better datetime handling
5837- **bleach** (6.x) - HTML sanitization for feed content
5838- **platformdirs** (4.x) - Cross-platform directory paths
5839
5840## Project Structure
5841
5842```
5843thicket/
5844├── pyproject.toml # Modern Python packaging
5845├── README.md # Project documentation
5846├── ARCH.md # This file
5847├── CLAUDE.md # Project instructions
5848├── .gitignore
5849├── src/
5850│ └── thicket/
5851│ ├── __init__.py
5852│ ├── __main__.py # Entry point for `python -m thicket`
5853│ ├── cli/ # CLI commands and interface
5854│ │ ├── __init__.py
5855│ │ ├── main.py # Main CLI app with Typer
5856│ │ ├── commands/ # Subcommands
5857│ │ │ ├── __init__.py
5858│ │ │ ├── init.py # Initialize git store
5859│ │ │ ├── add.py # Add users and feeds
5860│ │ │ ├── sync.py # Sync feeds
5861│ │ │ ├── list_cmd.py # List users/feeds
5862│ │ │ ├── duplicates.py # Manage duplicate entries
5863│ │ │ ├── links_cmd.py # Extract and categorize links
5864│ │ │ └── index_cmd.py # Build reference index and show threads
5865│ │ └── utils.py # CLI utilities (progress, formatting)
5866│ ├── core/ # Core business logic
5867│ │ ├── __init__.py
5868│ │ ├── feed_parser.py # Feed parsing and normalization
5869│ │ ├── git_store.py # Git repository operations
5870│ │ └── reference_parser.py # Link extraction and threading
5871│ ├── models/ # Pydantic data models
5872│ │ ├── __init__.py
5873│ │ ├── config.py # Configuration models
5874│ │ ├── feed.py # Feed/Entry models
5875│ │ └── user.py # User metadata models
5876│ └── utils/ # Shared utilities
5877│ └── __init__.py
5878├── tests/
5879│ ├── __init__.py
5880│ ├── conftest.py # pytest configuration
5881│ ├── test_feed_parser.py
5882│ ├── test_git_store.py
5883│ └── fixtures/ # Test data
5884│ └── feeds/
5885└── docs/
5886 └── examples/ # Example configurations
5887```
5888
5889## Data Models
5890
5891### Configuration File (YAML/TOML)
5892```python
5893class ThicketConfig(BaseSettings):
5894 git_store: Path # Git repository location
5895 cache_dir: Path # Cache directory
5896 users: list[UserConfig]
5897
5898 model_config = SettingsConfigDict(
5899 env_prefix="THICKET_",
5900 env_file=".env",
5901 yaml_file="thicket.yaml"
5902 )
5903
5904class UserConfig(BaseModel):
5905 username: str
5906 feeds: list[HttpUrl]
5907 email: Optional[EmailStr] = None
5908 homepage: Optional[HttpUrl] = None
5909 icon: Optional[HttpUrl] = None
5910 display_name: Optional[str] = None
5911```
5912
5913### Feed Storage Format
5914```python
5915class AtomEntry(BaseModel):
5916 id: str # Original Atom ID
5917 title: str
5918 link: HttpUrl
5919 updated: datetime
5920 published: Optional[datetime]
5921 summary: Optional[str]
5922 content: Optional[str] # Full body content from Atom entry
5923 content_type: Optional[str] = "html" # text, html, xhtml
5924 author: Optional[dict]
5925 categories: list[str] = []
5926 rights: Optional[str] = None # Copyright info
5927 source: Optional[str] = None # Source feed URL
5928 # Additional Atom fields preserved during RSS->Atom conversion
5929
5930 model_config = ConfigDict(
5931 json_encoders={
5932 datetime: lambda v: v.isoformat()
5933 }
5934 )
5935
5936class DuplicateMap(BaseModel):
5937 """Maps duplicate entry IDs to canonical entry IDs"""
5938 duplicates: dict[str, str] = {} # duplicate_id -> canonical_id
5939 comment: str = "Entry IDs that map to the same canonical content"
5940
5941 def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
5942 """Add a duplicate mapping"""
5943 self.duplicates[duplicate_id] = canonical_id
5944
5945 def remove_duplicate(self, duplicate_id: str) -> bool:
5946 """Remove a duplicate mapping. Returns True if existed."""
5947 return self.duplicates.pop(duplicate_id, None) is not None
5948
5949 def get_canonical(self, entry_id: str) -> str:
5950 """Get canonical ID for an entry (returns original if not duplicate)"""
5951 return self.duplicates.get(entry_id, entry_id)
5952
5953 def is_duplicate(self, entry_id: str) -> bool:
5954 """Check if entry ID is marked as duplicate"""
5955 return entry_id in self.duplicates
5956```
5957
5958## Git Repository Structure
5959```
5960git-store/
5961├── index.json # User directory index
5962├── duplicates.json # Manual curation of duplicate entries
5963├── links.json # Unified links, references, and mapping data
5964├── user1/
5965│ ├── entry_id_1.json # Sanitized entry files
5966│ ├── entry_id_2.json
5967│ └── ...
5968└── user2/
5969 └── ...
5970```
5971
5972## Key Design Decisions
5973
5974### 1. Feed Normalization & Auto-Discovery
5975- All RSS feeds converted to Atom format before storage
5976- Preserves maximum metadata during conversion
5977- Sanitizes HTML content to prevent XSS
5978- **Auto-discovery**: Extracts user metadata from feed during `add user` command
5979
5980### 2. ID Sanitization
5981- Consistent algorithm to convert Atom IDs to safe filenames
5982- Handles edge cases (very long IDs, special characters)
5983- Maintains reversibility where possible
5984
5985### 3. Git Operations
5986- Uses GitPython for simplicity (no authentication required)
5987- Single main branch for all users and entries
5988- Atomic commits per sync operation
5989- Meaningful commit messages with feed update summaries
5990- Preserves complete history - never delete entries even if they disappear from feeds
5991
5992### 4. Caching Strategy
5993- HTTP caching with Last-Modified/ETag support
5994- Local cache of parsed feeds with TTL
5995- Cache invalidation on configuration changes
5996- Git store serves as permanent historical archive beyond feed depth limits
5997
5998### 5. Error Handling
5999- Graceful handling of feed parsing errors
6000- Retry logic for network failures
6001- Clear error messages with recovery suggestions
6002
6003## CLI Command Structure
6004
6005```bash
6006# Initialize a new git store
6007thicket init /path/to/store
6008
6009# Add a user with feeds (auto-discovers metadata from feed)
6010thicket add user "alyssa" \
6011 --feed "https://example.com/feed.atom"
6012 # Auto-populates: email, homepage, icon, display_name from feed metadata
6013
6014# Add a user with manual overrides
6015thicket add user "alyssa" \
6016 --feed "https://example.com/feed.atom" \
6017 --email "alyssa@example.com" \
6018 --homepage "https://alyssa.example.com" \
6019 --icon "https://example.com/avatar.png" \
6020 --display-name "Alyssa P. Hacker"
6021
6022# Add additional feed to existing user
6023thicket add feed "alyssa" "https://example.com/other-feed.rss"
6024
6025# Sync all feeds (designed for cron usage)
6026thicket sync --all
6027
6028# Sync specific user
6029thicket sync --user alyssa
6030
6031# List users and their feeds
6032thicket list users
6033thicket list feeds --user alyssa
6034
6035# Manage duplicate entries
6036thicket duplicates list
6037thicket duplicates add <entry_id_1> <entry_id_2> # Mark as duplicates
6038thicket duplicates remove <entry_id_1> <entry_id_2> # Unmark duplicates
6039
6040# Link processing and threading
6041thicket links --verbose # Extract and categorize all links
6042thicket index --verbose # Build reference index for threading
6043thicket threads # Show conversation threads
6044thicket threads --username user1 # Show threads for specific user
6045thicket threads --min-size 3 # Show threads with minimum size
6046```
6047
6048## Performance Considerations
6049
60501. **Concurrent Feed Fetching**: Use httpx with asyncio for parallel downloads
60512. **Incremental Updates**: Only fetch/parse feeds that have changed
60523. **Efficient Git Operations**: Batch commits, use shallow clones where appropriate
60534. **Progress Feedback**: Rich progress bars for long operations
6054
6055## Security Considerations
6056
60571. **HTML Sanitization**: Use bleach to clean feed content
60582. **URL Validation**: Strict validation of feed URLs
60593. **Git Security**: No credentials stored in repository
60604. **Path Traversal**: Careful sanitization of filenames
6061
6062## Future Enhancements
6063
60641. **Web Interface**: Optional web UI for browsing the git store
60652. **Webhooks**: Notify external services on feed updates
60663. **Feed Discovery**: Auto-discover feeds from HTML pages
60674. **Export Formats**: Generate static sites, OPML exports
60685. **Federation**: P2P sync between thicket instances
6069
6070## Requirements Clarification
6071
6072**✓ Resolved Requirements:**
60731. **Feed Update Frequency**: Designed for cron usage - no built-in scheduling needed
60742. **Duplicate Handling**: Manual curation via `duplicates.json` file with CLI commands
60753. **Git Branching**: Single main branch for all users and entries
60764. **Authentication**: No feeds require authentication currently
60775. **Content Storage**: Store complete Atom entry body content as provided
60786. **Deleted Entries**: Preserve all entries in Git store permanently (historical archive)
60797. **History Depth**: Git store maintains full history beyond feed depth limits
60808. **Feed Auto-Discovery**: Extract user metadata from feed during `add user` command
6081
6082## Duplicate Entry Management
6083
6084### Duplicate Detection Strategy
6085- **Manual Curation**: Duplicates identified and managed manually via CLI
6086- **Storage**: `duplicates.json` file in Git root maps entry IDs to canonical entries
6087- **Structure**: `{"duplicate_id": "canonical_id", ...}`
6088- **CLI Commands**: Add/remove duplicate mappings with validation
6089- **Query Resolution**: Search/list commands resolve duplicates to canonical entries
6090
6091### Duplicate File Format
6092```json
6093{
6094 "https://example.com/feed/entry/123": "https://canonical.com/posts/same-post",
6095 "https://mirror.com/articles/456": "https://canonical.com/posts/same-post",
6096 "comment": "Entry IDs that map to the same canonical content"
6097}
6098```
6099
6100## Feed Metadata Auto-Discovery
6101
6102### Extraction Strategy
6103When adding a new user with `thicket add user`, the system fetches and parses the feed to extract:
6104
6105- **Display Name**: From `feed.title` or `feed.author.name`
6106- **Email**: From `feed.author.email` or `feed.managingEditor`
6107- **Homepage**: From `feed.link` or `feed.author.uri`
6108- **Icon**: From `feed.logo`, `feed.icon`, or `feed.image.url`
6109
6110### Discovery Priority Order
61111. **Author Information**: Prefer `feed.author.*` fields (more specific to person)
61122. **Feed-Level**: Fall back to feed-level metadata
61133. **Manual Override**: CLI flags always take precedence over discovered values
61144. **Update Behavior**: Auto-discovery only runs during initial `add user`, not on sync
6115
6116### Extracted Metadata Format
6117```python
6118class FeedMetadata(BaseModel):
6119 title: Optional[str] = None
6120 author_name: Optional[str] = None
6121 author_email: Optional[EmailStr] = None
6122 author_uri: Optional[HttpUrl] = None
6123 link: Optional[HttpUrl] = None
6124 logo: Optional[HttpUrl] = None
6125 icon: Optional[HttpUrl] = None
6126 image_url: Optional[HttpUrl] = None
6127
6128 def to_user_config(self, username: str, feed_url: HttpUrl) -> UserConfig:
6129 """Convert discovered metadata to UserConfig with fallbacks"""
6130 return UserConfig(
6131 username=username,
6132 feeds=[feed_url],
6133 display_name=self.author_name or self.title,
6134 email=self.author_email,
6135 homepage=self.author_uri or self.link,
6136 icon=self.logo or self.icon or self.image_url
6137 )
6138```
6139
6140## Link Processing and Threading Architecture
6141
6142### Overview
6143The thicket system implements a sophisticated link processing and threading system to create email-style threaded views of blog entries by tracking cross-references between different blogs.
6144
6145### Link Processing Pipeline
6146
6147#### 1. Link Extraction (`thicket links`)
6148The `links` command systematically extracts all outbound links from blog entries and categorizes them:
6149
6150```python
6151class LinkData(BaseModel):
6152 url: str # Fully resolved URL
6153 entry_id: str # Source entry ID
6154 username: str # Source username
6155 context: str # Surrounding text context
6156 category: str # "internal", "user", or "unknown"
6157 target_username: Optional[str] # Target user if applicable
6158```
6159
6160**Link Categories:**
6161- **Internal**: Links to the same user's domain (self-references)
6162- **User**: Links to other tracked users' domains
6163- **Unknown**: Links to external sites not tracked by thicket
6164
6165#### 2. URL Resolution
6166All links are properly resolved using the Atom feed's base URL to handle:
6167- Relative URLs (converted to absolute)
6168- Protocol-relative URLs
6169- Fragment identifiers
6170- Redirects and canonical URLs
6171
6172#### 3. Domain Mapping
6173The system builds a comprehensive domain mapping from user configuration:
6174- Feed URLs → domain extraction
6175- Homepage URLs → domain extraction
6176- Reverse mapping: domain → username
6177
6178### Threading System
6179
6180#### 1. Reference Index Generation (`thicket index`)
6181Creates a bidirectional reference index from the categorized links:
6182
6183```python
6184class BlogReference(BaseModel):
6185 source_entry_id: str
6186 source_username: str
6187 target_url: str
6188 target_username: Optional[str]
6189 target_entry_id: Optional[str]
6190 context: str
6191```
6192
6193#### 2. Thread Detection Algorithm
6194Uses graph traversal to find connected blog entries:
6195- **Outbound references**: Links from an entry to other entries
6196- **Inbound references**: Links to an entry from other entries
6197- **Thread members**: All entries connected through references
6198
6199#### 3. Threading Display (`thicket threads`)
6200Creates email-style threaded views:
6201- Chronological ordering within threads
6202- Reference counts (outbound/inbound)
6203- Context preservation
6204- Filtering options (user, entry, minimum size)
6205
6206### Data Structures
6207
6208#### links.json Format (Unified Structure)
6209```json
6210{
6211 "links": {
6212 "https://example.com/post/123": {
6213 "referencing_entries": ["https://blog.user.com/entry/456"],
6214 "target_username": "user2"
6215 },
6216 "https://external-site.com/article": {
6217 "referencing_entries": ["https://blog.user.com/entry/789"]
6218 }
6219 },
6220 "reverse_mapping": {
6221 "https://blog.user.com/entry/456": ["https://example.com/post/123"],
6222 "https://blog.user.com/entry/789": ["https://external-site.com/article"]
6223 },
6224 "references": [
6225 {
6226 "source_entry_id": "https://blog.user.com/entry/456",
6227 "source_username": "user1",
6228 "target_url": "https://example.com/post/123",
6229 "target_username": "user2",
6230 "target_entry_id": "https://example.com/post/123",
6231 "context": "As mentioned in this post..."
6232 }
6233 ],
6234 "user_domains": {
6235 "user1": ["blog.user.com"],
6236 "user2": ["example.com"]
6237 }
6238}
6239```
6240
6241This unified structure eliminates duplication by:
6242- Storing each URL only once with minimal metadata
6243- Including all link data, reference data, and mappings in one file
6244- Using presence of `target_username` to identify tracked vs external links
6245- Providing bidirectional mappings for efficient queries
6246
6247### Unified Structure Benefits
6248
6249- **Eliminates Duplication**: Each URL appears only once with metadata
6250- **Single Source of Truth**: All link-related data in one file
6251- **Efficient Queries**: Fast lookups for both directions (URL→entries, entry→URLs)
6252- **Atomic Updates**: All link data changes together
6253- **Reduced I/O**: Fewer file operations
6254
6255### Implementation Benefits
6256
62571. **Systematic Link Processing**: All links are extracted and categorized consistently
62582. **Proper URL Resolution**: Handles relative URLs and base URL resolution correctly
62593. **Domain-based Categorization**: Automatically identifies user-to-user references
62604. **Bidirectional Indexing**: Supports both "who links to whom" and "who is linked by whom"
62615. **Thread Discovery**: Finds conversation threads automatically
62626. **Rich Context**: Preserves surrounding text for each link
62637. **Performance**: Pre-computed indexes for fast threading queries
6264
6265### CLI Commands
6266
6267```bash
6268# Extract and categorize all links
6269thicket links --verbose
6270
6271# Build reference index for threading
6272thicket index --verbose
6273
6274# Show all conversation threads
6275thicket threads
6276
6277# Show threads for specific user
6278thicket threads --username user1
6279
6280# Show threads with minimum size
6281thicket threads --min-size 3
6282```
6283
6284### Integration with Existing Commands
6285
6286The link processing system integrates seamlessly with existing thicket commands:
6287- `thicket sync` updates entries, requiring `thicket links` to be run afterward
6288- `thicket index` uses the output from `thicket links` for improved accuracy
6289- `thicket threads` provides the user-facing threading interface
6290
6291## Current Implementation Status
6292
6293### ✅ Completed Features
62941. **Core Infrastructure**
6295 - Modern CLI with Typer and Rich
6296 - Pydantic data models for type safety
6297 - Git repository operations with GitPython
6298 - Feed parsing and normalization with feedparser
6299
63002. **User and Feed Management**
6301 - `thicket init` - Initialize git store
6302 - `thicket add` - Add users and feeds with auto-discovery
6303 - `thicket sync` - Sync feeds with progress tracking
6304 - `thicket list` - List users, feeds, and entries
6305 - `thicket duplicates` - Manage duplicate entries
6306
63073. **Link Processing and Threading**
6308 - `thicket links` - Extract and categorize all outbound links
6309 - `thicket index` - Build reference index from links
6310 - `thicket threads` - Display threaded conversation views
6311 - Proper URL resolution with base URL handling
6312 - Domain-based link categorization
6313 - Context preservation for links
6314
6315### 📊 System Performance
6316- **Link Extraction**: Successfully processes thousands of blog entries
6317- **Categorization**: Identifies internal, user, and unknown links
6318- **Threading**: Creates email-style threaded views of conversations
6319- **Storage**: Efficient JSON-based data structures for links and references
6320
6321### 🔧 Current Architecture Highlights
6322- **Modular Design**: Clear separation between CLI, core logic, and models
6323- **Type Safety**: Comprehensive Pydantic models for data validation
6324- **Rich CLI**: Beautiful progress bars, tables, and error handling
6325- **Extensible**: Easy to add new commands and features
6326- **Git Integration**: All data stored in version-controlled JSON files
6327
6328### 🎯 Proven Functionality
6329The system has been tested with real blog data and successfully:
6330- Extracted 14,396 total links from blog entries
6331- Categorized 3,994 internal links, 363 user-to-user links, and 10,039 unknown links
6332- Built comprehensive domain mappings for 16 users across 20 domains
6333- Generated threaded views showing blog conversation patterns
6334
6335### 🚀 Ready for Use
6336The thicket system is now fully functional for:
6337- Maintaining Git repositories of blog feeds
6338- Tracking cross-references between blogs
6339- Creating threaded views of blog conversations
6340- Discovering blog interaction patterns
6341- Building distributed comment systems
6342</file>
6343
6344<file path="src/thicket/cli/utils.py">
6345"""CLI utilities and helpers."""
6346
6347from pathlib import Path
6348from typing import Optional
6349
6350import typer
6351from rich.console import Console
6352from rich.progress import Progress, SpinnerColumn, TextColumn
6353from rich.table import Table
6354
6355from ..models import ThicketConfig, UserMetadata
6356from ..core.git_store import GitStore
6357
6358console = Console()
6359
6360
def get_tsv_mode() -> bool:
    """Return whether the global TSV output mode is enabled.

    The import happens lazily, at call time, to avoid a circular import
    between this module and the main CLI module.
    """
    from . import main
    return main.tsv_mode
6365
6366
def _read_config_file(path: Path) -> ThicketConfig:
    """Parse a YAML file at *path* into a validated ThicketConfig.

    Raises whatever yaml/pydantic raise on malformed input; callers are
    expected to translate that into a user-facing error.
    """
    import yaml

    with open(path) as f:
        config_data = yaml.safe_load(f)
    return ThicketConfig(**config_data)


def load_config(config_path: Optional[Path] = None) -> ThicketConfig:
    """Load thicket configuration from file or environment.

    Resolution order:
    1. An explicitly supplied ``config_path`` (if the file exists).
    2. ``thicket.yaml`` in the current working directory.
    3. Environment variables / model defaults via ``ThicketConfig()``.

    Args:
        config_path: Optional explicit path to a YAML config file.

    Returns:
        The loaded configuration.

    Raises:
        typer.Exit: If the configuration cannot be loaded or parsed.
            (Previously a malformed *explicit* config file escaped as a raw
            traceback while the default path got a friendly message; both
            paths now share the same error handling.)
    """
    try:
        if config_path and config_path.exists():
            return _read_config_file(config_path)

        # First try thicket.yaml in the current directory.
        default_config = Path("thicket.yaml")
        if default_config.exists():
            return _read_config_file(default_config)

        # Fall back to environment variables / model defaults.
        return ThicketConfig()
    except Exception as e:
        console.print(f"[red]Error loading configuration: {e}[/red]")
        console.print("[yellow]Run 'thicket init' to create a new configuration.[/yellow]")
        raise typer.Exit(1) from e
6394
6395
def save_config(config: ThicketConfig, config_path: Path) -> None:
    """Serialize *config* to a YAML file at *config_path*."""
    import yaml

    data = config.model_dump(mode="json", exclude_none=True)

    # YAML cannot serialize Path objects directly; store them as plain strings.
    for key in ("git_store", "cache_dir"):
        data[key] = str(data[key])

    with open(config_path, "w") as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False)
6408
6409
def create_progress() -> Progress:
    """Build a transient Rich progress display (spinner + description text)."""
    columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
    )
    return Progress(*columns, console=console, transient=True)
6418
6419
def print_users_table(config: ThicketConfig) -> None:
    """Render all configured users and their feeds as a Rich table.

    Delegates to TSV output when the global TSV mode is active.
    """
    if get_tsv_mode():
        print_users_tsv(config)
        return

    table = Table(title="Users and Feeds")
    for header, opts in (
        ("Username", {"style": "cyan", "no_wrap": True}),
        ("Display Name", {"style": "magenta"}),
        ("Email", {"style": "blue"}),
        ("Homepage", {"style": "green"}),
        ("Feeds", {"style": "yellow"}),
    ):
        table.add_column(header, **opts)

    for user in config.users:
        table.add_row(
            user.username,
            user.display_name or "",
            user.email or "",
            str(user.homepage) if user.homepage else "",
            "\n".join(str(feed) for feed in user.feeds),
        )

    console.print(table)
6444
6445
def print_feeds_table(config: ThicketConfig, username: Optional[str] = None) -> None:
    """Render feeds as a Rich table, optionally restricted to one user.

    Delegates to TSV output when the global TSV mode is active.
    """
    if get_tsv_mode():
        print_feeds_tsv(config, username)
        return

    suffix = f" for {username}" if username else ""
    table = Table(title=f"Eeds{suffix}".replace("Eeds", "Feeds"))
    table.add_column("Username", style="cyan", no_wrap=True)
    table.add_column("Feed URL", style="blue")
    table.add_column("Status", style="green")

    if username:
        found = config.find_user(username)
        selected = [found] if found is not None else []
    else:
        selected = list(config.users)

    for user in selected:
        for feed in user.feeds:
            # TODO: replace the hard-coded status with a real health check.
            table.add_row(user.username, str(feed), "Active")

    console.print(table)
6469
6470
def confirm_action(message: str, default: bool = False) -> bool:
    """Ask the user a yes/no question and return their answer."""
    answer = typer.confirm(message, default=default)
    return answer
6474
6475
def print_success(message: str) -> None:
    """Show *message* prefixed with a green check mark."""
    prefix = "[green]✓[/green]"
    console.print(f"{prefix} {message}")
6479
6480
def print_error(message: str) -> None:
    """Show *message* prefixed with a red cross mark."""
    prefix = "[red]✗[/red]"
    console.print(f"{prefix} {message}")
6484
6485
def print_warning(message: str) -> None:
    """Show *message* prefixed with a yellow warning sign."""
    prefix = "[yellow]⚠[/yellow]"
    console.print(f"{prefix} {message}")
6489
6490
def print_info(message: str) -> None:
    """Show *message* prefixed with a blue info sign."""
    prefix = "[blue]ℹ[/blue]"
    console.print(f"{prefix} {message}")
6494
6495
def print_users_table_from_git(users: list[UserMetadata]) -> None:
    """Render user metadata loaded from the git store as a Rich table.

    Delegates to TSV output when the global TSV mode is active.
    """
    if get_tsv_mode():
        print_users_tsv_from_git(users)
        return

    table = Table(title="Users and Feeds")
    for header, opts in (
        ("Username", {"style": "cyan", "no_wrap": True}),
        ("Display Name", {"style": "magenta"}),
        ("Email", {"style": "blue"}),
        ("Homepage", {"style": "green"}),
        ("Feeds", {"style": "yellow"}),
    ):
        table.add_column(header, **opts)

    for user in users:
        table.add_row(
            user.username,
            user.display_name or "",
            user.email or "",
            user.homepage or "",
            "\n".join(user.feeds),
        )

    console.print(table)
6520
6521
def print_feeds_table_from_git(git_store: GitStore, username: Optional[str] = None) -> None:
    """Render feeds stored in the git repository as a Rich table.

    Delegates to TSV output when the global TSV mode is active.
    """
    if get_tsv_mode():
        print_feeds_tsv_from_git(git_store, username)
        return

    suffix = f" for {username}" if username else ""
    table = Table(title=f"Feeds{suffix}")
    table.add_column("Username", style="cyan", no_wrap=True)
    table.add_column("Feed URL", style="blue")
    table.add_column("Status", style="green")

    if username:
        found = git_store.get_user(username)
        selected = [found] if found else []
    else:
        # NOTE(review): reaches into GitStore's private index loader; a public
        # accessor would be preferable if one exists.
        selected = list(git_store._load_index().users.values())

    for user in selected:
        for feed in user.feeds:
            # TODO: replace the hard-coded status with a real health check.
            table.add_row(user.username, feed, "Active")

    console.print(table)
6549
6550
def print_users_tsv(config: ThicketConfig) -> None:
    """Emit configured users as tab-separated values on stdout."""
    print("Username\tDisplay Name\tEmail\tHomepage\tFeeds")
    for user in config.users:
        row = (
            user.username,
            user.display_name or "",
            user.email or "",
            user.homepage or "",
            ",".join(str(feed) for feed in user.feeds),
        )
        print("\t".join(str(value) for value in row))
6557
6558
def print_users_tsv_from_git(users: list[UserMetadata]) -> None:
    """Emit git-store user metadata as tab-separated values on stdout."""
    print("Username\tDisplay Name\tEmail\tHomepage\tFeeds")
    for user in users:
        row = (
            user.username,
            user.display_name or "",
            user.email or "",
            user.homepage or "",
            ",".join(user.feeds),
        )
        print("\t".join(str(value) for value in row))
6565
6566
def print_feeds_tsv(config: ThicketConfig, username: Optional[str] = None) -> None:
    """Emit feeds as tab-separated values, optionally for a single user."""
    print("Username\tFeed URL\tStatus")

    if username:
        found = config.find_user(username)
        selected = [found] if found is not None else []
    else:
        selected = list(config.users)

    for user in selected:
        for feed in user.feeds:
            print(f"{user.username}\t{feed}\tActive")
6576
6577
def print_feeds_tsv_from_git(git_store: GitStore, username: Optional[str] = None) -> None:
    """Emit git-store feeds as tab-separated values on stdout."""
    print("Username\tFeed URL\tStatus")

    if username:
        found = git_store.get_user(username)
        selected = [found] if found else []
    else:
        # NOTE(review): uses GitStore's private index loader, matching the
        # table-printing counterpart in this module.
        selected = list(git_store._load_index().users.values())

    for user in selected:
        for feed in user.feeds:
            print(f"{user.username}\t{feed}\tActive")
6592
6593
def print_entries_tsv(entries_by_user: list[list], usernames: list[str]) -> None:
    """Emit entries as tab-separated values, newest first across all users.

    Args:
        entries_by_user: One list of entry objects per user, in the same
            order as *usernames*. Each entry is expected to expose
            ``id``, ``title``, ``updated`` (datetime) and ``link``.
        usernames: Usernames aligned with *entries_by_user*.
    """
    print("User\tAtom ID\tTitle\tUpdated\tURL")

    # Flatten into (username, entry) pairs across all users.
    flattened = [
        (username, entry)
        for entries, username in zip(entries_by_user, usernames)
        for entry in entries
    ]

    # Newest entries first.
    flattened.sort(key=lambda pair: pair[1].updated, reverse=True)

    # Whitespace control characters in titles would corrupt the TSV layout;
    # map them all to spaces in a single pass.
    sanitize = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

    for username, entry in flattened:
        updated_str = entry.updated.strftime("%Y-%m-%d %H:%M")
        title = entry.title.translate(sanitize)
        print(f"{username}\t{entry.id}\t{title}\t{updated_str}\t{entry.link}")
6615</file>
6616
6617</files>