···
+
This file is a merged representation of the entire codebase, combined into a single document by Repomix.
+
This section contains a summary of this file.
+
This file contains a packed representation of the entire repository's contents.
+
It is designed to be easily consumable by AI systems for analysis, code review,
+
or other automated processes.
+
The content is organized as follows:
+
1. This summary section
+
2. Repository information
+
4. Repository files (if enabled)
+
5. Multiple file entries, each consisting of:
+
- File path as an attribute
+
- Full contents of the file
+
- This file should be treated as read-only. Any changes should be made to the
+
original repository files, not this packed version.
+
- When processing this file, use the file path to distinguish
+
between different files in the repository.
+
- Be aware that this file may contain sensitive information. Handle it with
+
the same level of security as you would the original repository.
+
- Some files may have been excluded based on .gitignore rules and Repomix's configuration
+
- Binary files are not included in this packed representation. Please refer to the Repository Structure section for a complete list of file paths, including binary files
+
- Files matching patterns in .gitignore are excluded
+
- Files matching default ignore patterns are excluded
+
- Files are sorted by Git change count (files with more changes are at the bottom)
+
This section contains the contents of the repository's files.
+
<file path=".claude/settings.local.json">
+
"enableAllProjectMcpServers": false
+
<file path="src/thicket/cli/commands/generate.py">
+
"""Generate static HTML website from thicket data."""
+
from datetime import datetime
+
from pathlib import Path
+
from typing import Any, Optional, TypedDict, Union
+
from jinja2 import Environment, FileSystemLoader, select_autoescape
+
from rich.progress import Progress, SpinnerColumn, TextColumn
+
from ...core.git_store import GitStore
+
from ...models.feed import AtomEntry
+
from ...models.user import GitStoreIndex, UserMetadata
+
from ..utils import console, load_config
+
class UserData(TypedDict):
+
"""Type definition for user data structure."""
+
recent_entries: list[tuple[str, AtomEntry]]
+
def safe_anchor_id(atom_id: str) -> str:
+
"""Convert an Atom ID to a safe HTML anchor ID."""
+
# Use base64 URL-safe encoding without padding
+
encoded = base64.urlsafe_b64encode(atom_id.encode('utf-8')).decode('ascii').rstrip('=')
+
# Prefix with 'id' to ensure it starts with a letter (HTML requirement)
+
class WebsiteGenerator:
+
"""Generate static HTML website from thicket data."""
+
def __init__(self, git_store: GitStore, output_dir: Path):
+
self.git_store = git_store
+
self.output_dir = output_dir
+
self.template_dir = Path(__file__).parent.parent.parent / "templates"
+
# Initialize Jinja2 environment
+
self.env = Environment(
+
loader=FileSystemLoader(self.template_dir),
+
autoescape=select_autoescape(["html", "xml"]),
+
self.index: Optional[GitStoreIndex] = None
+
self.entries: list[tuple[str, AtomEntry]] = [] # (username, entry)
+
self.links_data: Optional[dict[str, Any]] = None
+
self.threads: list[list[dict[str, Any]]] = [] # List of threads with metadata
+
def get_display_name(self, username: str) -> str:
+
"""Get display name for a user, falling back to username."""
+
if self.index and username in self.index.users:
+
user = self.index.users[username]
+
return user.display_name or username
+
def get_user_homepage(self, username: str) -> Optional[str]:
+
"""Get homepage URL for a user."""
+
if self.index and username in self.index.users:
+
user = self.index.users[username]
+
return str(user.homepage) if user.homepage else None
+
def clean_html_summary(self, content: Optional[str], max_length: int = 200) -> str:
+
"""Clean HTML content and truncate for display in timeline."""
+
clean_text = re.sub(r"<[^>]+>", " ", content)
+
# Replace multiple whitespace with single space
+
clean_text = re.sub(r"\s+", " ", clean_text)
+
# Strip leading/trailing whitespace
+
clean_text = clean_text.strip()
+
# Truncate with ellipsis if needed
+
if len(clean_text) > max_length:
+
# Try to break at word boundary
+
truncated = clean_text[:max_length]
+
last_space = truncated.rfind(" ")
+
last_space > max_length * 0.8
+
): # If we can break reasonably close to the limit
+
clean_text = truncated[:last_space] + "..."
+
clean_text = truncated + "..."
+
def load_data(self) -> None:
+
"""Load all data from the git repository."""
+
TextColumn("[progress.description]{task.description}"),
+
task = progress.add_task("Loading repository index...", total=None)
+
self.index = self.git_store._load_index()
+
raise ValueError("No index found in repository")
+
progress.update(task, completed=True)
+
task = progress.add_task("Loading entries...", total=None)
+
for username, user_metadata in self.index.users.items():
+
user_dir = self.git_store.repo_path / user_metadata.directory
+
for entry_file in user_dir.glob("*.json"):
+
if entry_file.name not in ["index.json", "duplicates.json"]:
+
with open(entry_file) as f:
+
entry_data = json.load(f)
+
entry = AtomEntry(**entry_data)
+
self.entries.append((username, entry))
+
f"[yellow]Warning: Failed to load {entry_file}: {e}[/yellow]"
+
progress.update(task, completed=True)
+
# Sort entries by date (newest first) - prioritize updated over published
+
key=lambda x: x[1].updated or x[1].published or datetime.min, reverse=True
+
task = progress.add_task("Loading links and references...", total=None)
+
links_file = self.git_store.repo_path / "links.json"
+
if links_file.exists():
+
with open(links_file) as f:
+
self.links_data = json.load(f)
+
progress.update(task, completed=True)
+
def build_threads(self) -> None:
+
"""Build threaded conversations from references."""
+
if not self.links_data or "references" not in self.links_data:
+
# Map entry IDs to (username, entry) tuples
+
entry_map: dict[str, tuple[str, AtomEntry]] = {}
+
for username, entry in self.entries:
+
entry_map[entry.id] = (username, entry)
+
# Build adjacency lists for references
+
self.outbound_refs: dict[str, set[str]] = {}
+
self.inbound_refs: dict[str, set[str]] = {}
+
self.reference_details: dict[
+
str, list[dict[str, Any]]
+
] = {} # Store full reference info
+
for ref in self.links_data["references"]:
+
source_id = ref["source_entry_id"]
+
target_id = ref.get("target_entry_id")
+
if target_id and source_id in entry_map and target_id in entry_map:
+
self.outbound_refs.setdefault(source_id, set()).add(target_id)
+
self.inbound_refs.setdefault(target_id, set()).add(source_id)
+
# Store reference details for UI
+
self.reference_details.setdefault(source_id, []).append(
+
"target_id": target_id,
+
"target_username": ref.get("target_username"),
+
self.reference_details.setdefault(target_id, []).append(
+
"source_id": source_id,
+
"source_username": ref.get("source_username"),
+
# Find conversation threads (multi-post discussions)
+
for entry_id, (_username, _entry) in entry_map.items():
+
if entry_id in processed:
+
# Build thread starting from this entry
+
level_map: dict[str, int] = {} # Track levels for this thread
+
# First, traverse up to find the root
+
while current in self.inbound_refs:
+
parents = self.inbound_refs[current] - {
+
} # Exclude self-references
+
# Take the first parent
+
parent = next(iter(parents))
+
if parent in thread_ids: # Avoid cycles
+
to_visit.insert(0, current)
+
# Now traverse down from the root
+
current = to_visit.pop(0)
+
if current in thread_ids or current not in entry_map:
+
thread_ids.add(current)
+
username, entry = entry_map[current]
+
# Calculate thread level
+
thread_level = self._calculate_thread_level(current, level_map)
+
# Add threading metadata
+
"display_name": self.get_display_name(username),
+
"references_to": list(self.outbound_refs.get(current, [])),
+
"referenced_by": list(self.inbound_refs.get(current, [])),
+
"thread_level": thread_level,
+
thread.append(thread_entry)
+
if current in self.outbound_refs:
+
children = self.outbound_refs[current] - thread_ids # Avoid cycles
+
to_visit.extend(sorted(children))
+
if len(thread) > 1: # Only keep actual threads
+
# Sort thread by date (newest first) - prioritize updated over published
+
thread.sort(key=lambda x: x["entry"].updated or x["entry"].published or datetime.min, reverse=True) # type: ignore
+
self.threads.append(thread)
+
# Sort threads by the date of their most recent entry - prioritize updated over published
+
item["entry"].updated or item["entry"].published or datetime.min for item in t
+
def _calculate_thread_level(
+
self, entry_id: str, processed_entries: dict[str, int]
+
"""Calculate indentation level for threaded display."""
+
if entry_id in processed_entries:
+
return processed_entries[entry_id]
+
if entry_id not in self.inbound_refs:
+
processed_entries[entry_id] = 0
+
parents_in_thread = self.inbound_refs[entry_id] & set(processed_entries.keys())
+
if not parents_in_thread:
+
processed_entries[entry_id] = 0
+
# Find the deepest parent level + 1
+
for parent_id in parents_in_thread:
+
parent_level = self._calculate_thread_level(parent_id, processed_entries)
+
max_parent_level = max(max_parent_level, parent_level)
+
level = min(max_parent_level + 1, 4) # Cap at level 4
+
processed_entries[entry_id] = level
+
def get_standalone_references(self) -> list[dict[str, Any]]:
+
"""Get posts that have references but aren't part of multi-post threads."""
+
if not hasattr(self, "reference_details"):
+
threaded_entry_ids = set()
+
for thread in self.threads:
+
threaded_entry_ids.add(item["entry_id"])
+
for username, entry in self.entries:
+
entry.id in self.reference_details
+
and entry.id not in threaded_entry_ids
+
refs = self.reference_details[entry.id]
+
# Only include if it has meaningful references (not just self-references)
+
if r.get("target_id") != entry.id and r.get("source_id") != entry.id
+
standalone_refs.append(
+
"display_name": self.get_display_name(username),
+
"references": meaningful_refs,
+
def _add_cross_thread_links(self, timeline_items: list[dict[str, Any]]) -> None:
+
"""Add cross-thread linking for entries that appear in multiple threads."""
+
# Map entry IDs to their positions in the timeline
+
entry_positions: dict[str, list[int]] = {}
+
# Map URLs referenced by entries to the entries that reference them
+
url_references: dict[str, list[tuple[str, int]]] = {} # url -> [(entry_id, position)]
+
# First pass: collect all entry IDs, their positions, and referenced URLs
+
for i, item in enumerate(timeline_items):
+
if item["type"] == "post":
+
entry_id = item["content"]["entry"].id
+
entry_positions.setdefault(entry_id, []).append(i)
+
# Track URLs this entry references
+
if entry_id in self.reference_details:
+
for ref in self.reference_details[entry_id]:
+
if ref["type"] == "outbound" and "target_id" in ref:
+
# Find the target entry's URL if available
+
target_entry = self._find_entry_by_id(ref["target_id"])
+
if target_entry and target_entry.link:
+
url = str(target_entry.link)
+
url_references.setdefault(url, []).append((entry_id, i))
+
elif item["type"] == "thread":
+
for thread_item in item["content"]:
+
entry_id = thread_item["entry"].id
+
entry_positions.setdefault(entry_id, []).append(i)
+
# Track URLs this entry references
+
if entry_id in self.reference_details:
+
for ref in self.reference_details[entry_id]:
+
if ref["type"] == "outbound" and "target_id" in ref:
+
target_entry = self._find_entry_by_id(ref["target_id"])
+
if target_entry and target_entry.link:
+
url = str(target_entry.link)
+
url_references.setdefault(url, []).append((entry_id, i))
+
# Build cross-thread connections - only for entries that actually appear multiple times
+
cross_thread_connections: dict[str, set[int]] = {} # entry_id -> set of timeline positions
+
# Add connections ONLY for entries that appear multiple times in the timeline
+
for entry_id, positions in entry_positions.items():
+
cross_thread_connections[entry_id] = set(positions)
+
# Debug: uncomment to see which entries have multiple appearances
+
# print(f"Entry {entry_id[:50]}... appears at positions: {positions}")
+
# Apply cross-thread links to timeline items
+
for entry_id, positions_set in cross_thread_connections.items():
+
positions_list = list(positions_set)
+
for pos in positions_list:
+
item = timeline_items[pos]
+
other_positions = sorted([p for p in positions_list if p != pos])
+
if item["type"] == "post":
+
# Add cross-thread info to individual posts
+
item["content"]["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items)
+
# Add info about shared references
+
item["content"]["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items)
+
elif item["type"] == "thread":
+
# Add cross-thread info to thread items
+
for thread_item in item["content"]:
+
if thread_item["entry"].id == entry_id:
+
thread_item["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items)
+
thread_item["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items)
+
def _build_cross_thread_link_data(self, entry_id: str, other_positions: list[int], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]:
+
"""Build detailed cross-thread link data with anchor information."""
+
cross_thread_links = []
+
for pos in other_positions:
+
item = timeline_items[pos]
+
if item["type"] == "post":
+
safe_id = safe_anchor_id(entry_id)
+
cross_thread_links.append({
+
"anchor_id": f"post-{pos}-{safe_id}",
+
"context": "individual post",
+
"title": item["content"]["entry"].title
+
elif item["type"] == "thread":
+
# For thread items, find the specific thread item
+
for thread_idx, thread_item in enumerate(item["content"]):
+
if thread_item["entry"].id == entry_id:
+
safe_id = safe_anchor_id(entry_id)
+
cross_thread_links.append({
+
"anchor_id": f"post-{pos}-{thread_idx}-{safe_id}",
+
"context": f"thread (level {thread_item.get('thread_level', 0)})",
+
"title": thread_item["entry"].title
+
return cross_thread_links
+
def _find_entry_by_id(self, entry_id: str) -> Optional[AtomEntry]:
+
"""Find an entry by its ID."""
+
for _username, entry in self.entries:
+
if entry.id == entry_id:
+
def _get_shared_references(self, entry_id: str, positions: Union[set[int], list[int]], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]:
+
"""Get information about shared references between cross-thread entries."""
+
# Collect all referenced URLs from entries at these positions
+
url_counts: dict[str, int] = {}
+
referencing_entries: dict[str, list[str]] = {} # url -> [entry_ids]
+
item = timeline_items[pos]
+
if item["type"] == "post":
+
entries_to_check.append(item["content"]["entry"])
+
elif item["type"] == "thread":
+
entries_to_check.extend([ti["entry"] for ti in item["content"]])
+
for entry in entries_to_check:
+
if entry.id in self.reference_details:
+
for ref in self.reference_details[entry.id]:
+
if ref["type"] == "outbound" and "target_id" in ref:
+
target_entry = self._find_entry_by_id(ref["target_id"])
+
if target_entry and target_entry.link:
+
url = str(target_entry.link)
+
url_counts[url] = url_counts.get(url, 0) + 1
+
if url not in referencing_entries:
+
referencing_entries[url] = []
+
if entry.id not in referencing_entries[url]:
+
referencing_entries[url].append(entry.id)
+
# Find URLs referenced by multiple entries
+
for url, count in url_counts.items():
+
if count > 1 and len(referencing_entries[url]) > 1:
+
# Get the target entry info
+
for ref in (self.links_data or {}).get("references", []):
+
if ref.get("target_url") == url:
+
target_username = ref.get("target_username")
+
if ref.get("target_entry_id"):
+
target_entry = self._find_entry_by_id(ref["target_entry_id"])
+
"referencing_entries": referencing_entries[url],
+
"target_username": target_username,
+
"target_title": target_entry.title if target_entry else None
+
return sorted(shared_refs, key=lambda x: x["count"], reverse=True)
+
def generate_site(self) -> None:
+
"""Generate the static website."""
+
# Create output directory
+
self.output_dir.mkdir(parents=True, exist_ok=True)
+
# Create static directories
+
(self.output_dir / "css").mkdir(exist_ok=True)
+
(self.output_dir / "js").mkdir(exist_ok=True)
+
css_template = self.env.get_template("style.css")
+
css_content = css_template.render()
+
with open(self.output_dir / "css" / "style.css", "w") as f:
+
js_template = self.env.get_template("script.js")
+
js_content = js_template.render()
+
with open(self.output_dir / "js" / "script.js", "w") as f:
+
# Prepare common template data
+
"title": "Energy & Environment Group",
+
"generated_at": datetime.now().isoformat(),
+
"get_display_name": self.get_display_name,
+
"get_user_homepage": self.get_user_homepage,
+
"clean_html_summary": self.clean_html_summary,
+
"safe_anchor_id": safe_anchor_id,
+
# Build unified timeline
+
# Only consider the threads that will actually be displayed
+
displayed_threads = self.threads[:20] # Limit to 20 threads
+
# Track which entries are part of displayed threads
+
threaded_entry_ids = set()
+
for thread in displayed_threads:
+
threaded_entry_ids.add(item["entry_id"])
+
# Add threads to timeline (using the date of the most recent post)
+
for thread in displayed_threads:
+
most_recent_date = max(
+
item["entry"].updated or item["entry"].published or datetime.min
+
timeline_items.append({
+
"date": most_recent_date,
+
# Add individual posts (not in threads)
+
for username, entry in self.entries[:50]:
+
if entry.id not in threaded_entry_ids:
+
# Check if this entry has references
+
entry.id in self.reference_details
+
if hasattr(self, "reference_details")
+
refs = self.reference_details.get(entry.id, [])
+
if r.get("target_id") != entry.id
+
and r.get("source_id") != entry.id
+
timeline_items.append({
+
"date": entry.updated or entry.published or datetime.min,
+
"display_name": self.get_display_name(username),
+
"references": refs if refs else None
+
# Sort unified timeline by date (newest first)
+
timeline_items.sort(key=lambda x: x["date"], reverse=True)
+
# Limit timeline to what will actually be rendered
+
timeline_items = timeline_items[:50] # Limit to 50 items total
+
# Add cross-thread linking for repeat blog references
+
self._add_cross_thread_links(timeline_items)
+
# Prepare outgoing links data
+
if self.links_data and "links" in self.links_data:
+
for url, link_info in self.links_data["links"].items():
+
referencing_entries = []
+
for entry_id in link_info.get("referencing_entries", []):
+
for username, entry in self.entries:
+
if entry.id == entry_id:
+
referencing_entries.append(
+
(self.get_display_name(username), entry)
+
if referencing_entries:
+
# Sort by date - prioritize updated over published
+
referencing_entries.sort(
+
key=lambda x: x[1].updated or x[1].published or datetime.min, reverse=True
+
"target_username": link_info.get("target_username"),
+
"entries": referencing_entries,
+
# Sort links by most recent reference - prioritize updated over published
+
key=lambda x: x["entries"][0][1].updated
+
or x["entries"][0][1].published or datetime.min,
+
users: list[UserData] = []
+
for username, user_metadata in self.index.users.items():
+
# Get recent entries for this user with display names
+
(self.get_display_name(u), e)
+
for u, e in self.entries
+
{"metadata": user_metadata, "recent_entries": user_entries}
+
users.sort(key=lambda x: x["metadata"].entry_count, reverse=True)
+
# Generate timeline page
+
timeline_template = self.env.get_template("timeline.html")
+
timeline_content = timeline_template.render(
+
timeline_items=timeline_items, # Already limited above
+
with open(self.output_dir / "timeline.html", "w") as f:
+
f.write(timeline_content)
+
links_template = self.env.get_template("links.html")
+
links_content = links_template.render(
+
outgoing_links=outgoing_links[:100],
+
with open(self.output_dir / "links.html", "w") as f:
+
users_template = self.env.get_template("users.html")
+
users_content = users_template.render(
+
with open(self.output_dir / "users.html", "w") as f:
+
# Generate main index page (redirect to timeline)
+
index_template = self.env.get_template("index.html")
+
index_content = index_template.render(**base_data)
+
with open(self.output_dir / "index.html", "w") as f:
+
console.print(f"[green]✓[/green] Generated website at {self.output_dir}")
+
console.print(f" - {len(self.entries)} entries")
+
console.print(f" - {len(self.threads)} conversation threads")
+
console.print(f" - {len(outgoing_links)} outgoing links")
+
console.print(f" - {len(users)} users")
+
" - Generated pages: index.html, timeline.html, links.html, users.html"
+
output: Path = typer.Option(
+
Path("./thicket-site"),
+
help="Output directory for the generated website",
+
force: bool = typer.Option(
+
False, "--force", "-f", help="Overwrite existing output directory"
+
config_file: Path = typer.Option(
+
Path("thicket.yaml"), "--config", help="Configuration file path"
+
"""Generate a static HTML website from thicket data."""
+
config = load_config(config_file)
+
if not config.git_store:
+
console.print("[red]No git store path configured[/red]")
+
git_store = GitStore(config.git_store)
+
# Check if output directory exists
+
if output.exists() and not force:
+
f"[red]Output directory {output} already exists. Use --force to overwrite.[/red]"
+
# Clean output directory if forcing
+
if output.exists() and force:
+
generator = WebsiteGenerator(git_store, output)
+
console.print("[bold]Generating static website...[/bold]")
+
generator.build_threads()
+
generator.generate_site()
+
console.print(f"[red]Error generating website: {e}[/red]")
+
raise typer.Exit(1) from e
+
<file path="src/thicket/templates/base.html">
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
<title>{% block page_title %}{{ title }}{% endblock %}</title>
+
<link rel="stylesheet" href="css/style.css">
+
<header class="site-header">
+
<div class="header-content">
+
<h1 class="site-title">{{ title }}</h1>
+
<a href="timeline.html" class="nav-link {% if page == 'timeline' %}active{% endif %}">Timeline</a>
+
<a href="links.html" class="nav-link {% if page == 'links' %}active{% endif %}">Links</a>
+
<a href="users.html" class="nav-link {% if page == 'users' %}active{% endif %}">Users</a>
+
<main class="main-content">
+
{% block content %}{% endblock %}
+
<footer class="site-footer">
+
<p>Generated on {{ generated_at }} by <a href="https://github.com/avsm/thicket">Thicket</a></p>
+
<script src="js/script.js"></script>
+
<file path="src/thicket/templates/index.html">
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
<title>{{ title }}</title>
+
<meta http-equiv="refresh" content="0; url=timeline.html">
+
<link rel="canonical" href="timeline.html">
+
<p>Redirecting to <a href="timeline.html">Timeline</a>...</p>
+
<file path="src/thicket/templates/links.html">
+
{% extends "base.html" %}
+
{% block page_title %}Outgoing Links - {{ title }}{% endblock %}
+
<div class="page-content">
+
<h2>Outgoing Links</h2>
+
<p class="page-description">External links referenced in blog posts, ordered by most recent reference.</p>
+
{% for link in outgoing_links %}
+
<article class="link-group">
+
<a href="{{ link.url }}" target="_blank">{{ link.url|truncate(80) }}</a>
+
{% if link.target_username %}
+
<span class="target-user">({{ link.target_username }})</span>
+
<div class="referencing-entries">
+
<span class="ref-count">Referenced in {{ link.entries|length }} post(s):</span>
+
{% for display_name, entry in link.entries[:5] %}
+
<span class="author">{{ display_name }}</span> -
+
<a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a>
+
<time datetime="{{ entry.updated or entry.published }}">
+
({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }})
+
{% if link.entries|length > 5 %}
+
<li class="more">... and {{ link.entries|length - 5 }} more</li>
+
<file path="src/thicket/templates/script.js">
+
// Enhanced functionality for thicket website
+
document.addEventListener('DOMContentLoaded', function() {
+
// Enhance thread collapsing (optional feature)
+
const threadHeaders = document.querySelectorAll('.thread-header');
+
threadHeaders.forEach(header => {
+
header.style.cursor = 'pointer';
+
header.addEventListener('click', function() {
+
const thread = this.parentElement;
+
const entries = thread.querySelectorAll('.thread-entry');
+
// Toggle visibility of all but the first entry
+
for (let i = 1; i < entries.length; i++) {
+
entries[i].style.display = entries[i].style.display === 'none' ? 'block' : 'none';
+
// Update thread count text
+
const count = this.querySelector('.thread-count');
+
if (entries[1] && entries[1].style.display === 'none') {
+
count.textContent = count.textContent.replace('posts', 'posts (collapsed)');
+
count.textContent = count.textContent.replace(' (collapsed)', '');
+
// Add relative time display
+
const timeElements = document.querySelectorAll('time');
+
timeElements.forEach(timeEl => {
+
const datetime = new Date(timeEl.getAttribute('datetime'));
+
const now = new Date();
+
const diffMs = now - datetime;
+
const diffDays = Math.floor(diffMs / (1000 * 60 * 60 * 24));
+
const diffHours = Math.floor(diffMs / (1000 * 60 * 60));
+
const diffMinutes = Math.floor(diffMs / (1000 * 60));
+
relativeTime = diffMinutes === 0 ? 'just now' : `${diffMinutes}m ago`;
+
relativeTime = `${diffHours}h ago`;
+
} else if (diffDays === 1) {
+
relativeTime = 'yesterday';
+
} else if (diffDays < 7) {
+
relativeTime = `${diffDays}d ago`;
+
} else if (diffDays < 30) {
+
const weeks = Math.floor(diffDays / 7);
+
relativeTime = weeks === 1 ? '1w ago' : `${weeks}w ago`;
+
} else if (diffDays < 365) {
+
const months = Math.floor(diffDays / 30);
+
relativeTime = months === 1 ? '1mo ago' : `${months}mo ago`;
+
const years = Math.floor(diffDays / 365);
+
relativeTime = years === 1 ? '1y ago' : `${years}y ago`;
+
// Add relative time as title attribute
+
timeEl.setAttribute('title', timeEl.textContent);
+
timeEl.textContent = relativeTime;
+
// Enhanced anchor link scrolling for shared references
+
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
+
anchor.addEventListener('click', function (e) {
+
const target = document.querySelector(this.getAttribute('href'));
+
target.scrollIntoView({
+
// Highlight the target briefly
+
const timelineEntry = target.closest('.timeline-entry');
+
timelineEntry.style.outline = '2px solid var(--primary-color)';
+
timelineEntry.style.borderRadius = '8px';
+
timelineEntry.style.outline = '';
+
timelineEntry.style.borderRadius = '';
+
<file path="src/thicket/templates/style.css">
+
/* Modern, clean design with high-density text and readable theme */
+
--primary-color: #2c3e50;
+
--secondary-color: #3498db;
+
--accent-color: #e74c3c;
+
--text-primary: #2c3e50;
+
--text-secondary: #7f8c8d;
+
--border-color: #e0e0e0;
+
box-sizing: border-box;
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif;
+
color: var(--text-primary);
+
background-color: var(--background);
+
background-color: var(--surface);
+
border-bottom: 1px solid var(--border-color);
+
max-width: var(--max-width);
+
justify-content: space-between;
+
color: var(--primary-color);
+
color: var(--text-secondary);
+
padding: 0.5rem 0.75rem;
+
transition: all 0.2s ease;
+
color: var(--primary-color);
+
background-color: var(--background);
+
color: var(--secondary-color);
+
background-color: var(--background);
+
max-width: var(--max-width);
+
color: var(--text-secondary);
+
margin-bottom: 0.75rem;
+
color: var(--primary-color);
+
margin-bottom: 0.75rem;
+
color: var(--primary-color);
+
/* Entries and Threads */
+
background-color: var(--surface);
+
border: 1px solid var(--border-color);
+
/* Timeline-style entries */
+
padding: 0.5rem 0.75rem;
+
background: transparent;
+
transition: background-color 0.2s ease;
+
.timeline-entry:hover {
+
background-color: var(--surface);
+
color: var(--text-secondary);
+
margin-bottom: 0.25rem;
+
font-family: 'SF Mono', Monaco, Consolas, 'Courier New', monospace;
+
color: var(--text-secondary);
+
color: var(--primary-color);
+
.timeline-author:hover {
+
color: var(--secondary-color);
+
text-decoration: underline;
+
color: var(--primary-color);
+
.timeline-title a:hover {
+
color: var(--secondary-color);
+
text-decoration: underline;
+
color: var(--text-secondary);
+
/* Legacy styles for other sections */
+
.entry-meta, .thread-header {
+
color: var(--text-secondary);
+
color: var(--primary-color);
+
color: var(--primary-color);
+
color: var(--secondary-color);
+
text-decoration: underline;
+
color: var(--text-primary);
+
/* Enhanced Threading Styles */
+
/* Conversation Clusters */
+
.conversation-cluster {
+
background-color: var(--background);
+
border: 2px solid var(--border-color);
+
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
+
background: linear-gradient(135deg, var(--surface) 0%, #f1f3f4 100%);
+
border-bottom: 1px solid var(--border-color);
+
justify-content: space-between;
+
color: var(--secondary-color);
+
.conversation-participants {
+
color: var(--text-secondary);
+
/* Threaded Conversation Entries */
+
margin-bottom: 0.75rem;
+
align-items: flex-start;
+
.conversation-entry.level-0 {
+
.conversation-entry.level-1 {
+
.conversation-entry.level-2 {
+
.conversation-entry.level-3 {
+
.conversation-entry.level-4 {
+
background-color: var(--secondary-color);
+
.conversation-entry.level-0 .entry-connector {
+
background-color: var(--accent-color);
+
background-color: var(--surface);
+
border: 1px solid var(--border-color);
+
transition: all 0.2s ease;
+
border-color: var(--secondary-color);
+
box-shadow: 0 2px 8px rgba(52, 152, 219, 0.1);
+
/* Reference Indicators */
+
.reference-indicators {
+
background-color: #e8f5e8;
+
background-color: #e8f0ff;
+
/* Reference Badges for Individual Posts */
+
.timeline-entry.with-references {
+
background-color: var(--surface);
+
/* Conversation posts in unified timeline */
+
.timeline-entry.conversation-post {
+
background: transparent;
+
padding: 0.5rem 0.75rem;
+
.timeline-entry.conversation-post.level-0 {
+
border-left: 2px solid var(--accent-color);
+
.timeline-entry.conversation-post.level-1 {
+
border-left: 2px solid var(--secondary-color);
+
.timeline-entry.conversation-post.level-2 {
+
border-left: 2px solid var(--text-secondary);
+
.timeline-entry.conversation-post.level-3 {
+
border-left: 2px solid var(--text-secondary);
+
.timeline-entry.conversation-post.level-4 {
+
border-left: 2px solid var(--text-secondary);
+
/* Cross-thread linking */
+
border-top: 1px solid var(--border-color);
+
.cross-thread-indicator {
+
color: var(--text-secondary);
+
background-color: var(--surface);
+
padding: 0.25rem 0.5rem;
+
border: 1px solid var(--border-color);
+
/* Inline shared references styling */
+
color: var(--text-secondary);
+
color: var(--primary-color);
+
transition: color 0.2s ease;
+
.shared-ref-link:hover {
+
color: var(--secondary-color);
+
text-decoration: underline;
+
color: var(--text-secondary);
+
.user-anchor, .post-anchor {
+
margin-top: -60px; /* Offset for fixed header */
+
color: var(--primary-color);
+
transition: color 0.2s ease;
+
.cross-thread-link:hover {
+
color: var(--secondary-color);
+
text-decoration: underline;
+
padding: 0.1rem 0.4rem;
+
text-transform: uppercase;
+
letter-spacing: 0.05em;
+
.ref-badge.ref-outbound {
+
background-color: #e8f5e8;
+
border: 1px solid #c3e6c3;
+
.ref-badge.ref-inbound {
+
background-color: #e8f0ff;
+
border: 1px solid #b3d9ff;
+
/* Author Color Coding */
+
.timeline-author::before {
+
background-color: var(--secondary-color);
+
/* Generate consistent colors for authors */
+
.author-avsm::before { background-color: #e74c3c; }
+
.author-mort::before { background-color: #3498db; }
+
.author-mte::before { background-color: #2ecc71; }
+
.author-ryan::before { background-color: #f39c12; }
+
.author-mwd::before { background-color: #9b59b6; }
+
.author-dra::before { background-color: #1abc9c; }
+
.author-pf341::before { background-color: #34495e; }
+
.author-sadiqj::before { background-color: #e67e22; }
+
.author-martinkl::before { background-color: #8e44ad; }
+
.author-jonsterling::before { background-color: #27ae60; }
+
.author-jon::before { background-color: #f1c40f; }
+
.author-onkar::before { background-color: #e91e63; }
+
.author-gabriel::before { background-color: #00bcd4; }
+
.author-jess::before { background-color: #ff5722; }
+
.author-ibrahim::before { background-color: #607d8b; }
+
.author-andres::before { background-color: #795548; }
+
.author-eeg::before { background-color: #ff9800; }
+
.conversations-section h3,
+
.referenced-posts-section h3,
+
.individual-posts-section h3 {
+
border-bottom: 2px solid var(--border-color);
+
padding-bottom: 0.5rem;
+
.conversations-section h3::before {
+
.referenced-posts-section h3::before {
+
.individual-posts-section h3::before {
+
/* Legacy thread styles (for backward compatibility) */
+
background-color: var(--background);
+
border: 1px solid var(--border-color);
+
background-color: var(--surface);
+
padding: 0.5rem 0.75rem;
+
border-bottom: 1px solid var(--border-color);
+
color: var(--secondary-color);
+
padding: 0.5rem 0.75rem;
+
border-bottom: 1px solid var(--border-color);
+
.thread-entry:last-child {
+
margin-left: var(--thread-indent);
+
border-left: 3px solid var(--secondary-color);
+
background-color: var(--surface);
+
background-color: var(--background);
+
word-break: break-word;
+
color: var(--secondary-color);
+
text-decoration: underline;
+
color: var(--text-secondary);
+
color: var(--text-secondary);
+
.referencing-entries ul {
+
.referencing-entries li {
+
margin-bottom: 0.25rem;
+
.referencing-entries .more {
+
color: var(--text-secondary);
+
background-color: var(--background);
+
margin-bottom: 0.25rem;
+
color: var(--text-secondary);
+
color: var(--text-secondary);
+
color: var(--secondary-color);
+
text-decoration: underline;
+
color: var(--text-secondary);
+
margin-bottom: 0.25rem;
+
max-width: var(--max-width);
+
margin: 3rem auto 2rem;
+
color: var(--text-secondary);
+
border-top: 1px solid var(--border-color);
+
color: var(--secondary-color);
+
text-decoration: underline;
+
@media (max-width: 768px) {
+
flex-direction: column;
+
align-items: flex-start;
+
margin-left: calc(var(--thread-indent) / 2);
+
flex-direction: column;
+
<file path="src/thicket/templates/timeline.html">
+
{% extends "base.html" %}
+
{% block page_title %}Timeline - {{ title }}{% endblock %}
+
{% set seen_users = [] %}
+
<div class="page-content">
+
<h2>Recent Posts & Conversations</h2>
+
<section class="unified-timeline">
+
{% for item in timeline_items %}
+
{% if item.type == "post" %}
+
<!-- Individual Post -->
+
<article class="timeline-entry {% if item.content.references %}with-references{% endif %}">
+
<div class="timeline-meta">
+
<time datetime="{{ item.content.entry.updated or item.content.entry.published }}" class="timeline-time">
+
{{ (item.content.entry.updated or item.content.entry.published).strftime('%Y-%m-%d %H:%M') }}
+
{% set homepage = get_user_homepage(item.content.username) %}
+
{% if item.content.username not in seen_users %}
+
<a id="{{ item.content.username }}" class="user-anchor"></a>
+
{% set _ = seen_users.append(item.content.username) %}
+
<a id="post-{{ loop.index0 }}-{{ safe_anchor_id(item.content.entry.id) }}" class="post-anchor"></a>
+
<a href="{{ homepage }}" target="_blank" class="timeline-author">{{ item.content.display_name }}</a>
+
<span class="timeline-author">{{ item.content.display_name }}</span>
+
{% if item.content.references %}
+
<div class="reference-badges">
+
{% for ref in item.content.references %}
+
{% if ref.type == 'outbound' %}
+
<span class="ref-badge ref-outbound" title="References {{ ref.target_username or 'external post' }}">
+
→ {{ ref.target_username or 'ext' }}
+
{% elif ref.type == 'inbound' %}
+
<span class="ref-badge ref-inbound" title="Referenced by {{ ref.source_username or 'external post' }}">
+
← {{ ref.source_username or 'ext' }}
+
<div class="timeline-content">
+
<strong class="timeline-title">
+
<a href="{{ item.content.entry.link }}" target="_blank">{{ item.content.entry.title }}</a>
+
{% if item.content.entry.summary %}
+
<span class="timeline-summary">— {{ clean_html_summary(item.content.entry.summary, 250) }}</span>
+
{% if item.content.shared_references %}
+
<span class="inline-shared-refs">
+
{% for ref in item.content.shared_references[:3] %}
+
{% if ref.target_username %}
+
<a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %}
+
{% if item.content.shared_references|length > 3 %}
+
<span class="shared-ref-more">+{{ item.content.shared_references|length - 3 }} more</span>
+
{% if item.content.cross_thread_links %}
+
<div class="cross-thread-links">
+
<span class="cross-thread-indicator">🔗 Also appears: </span>
+
{% for link in item.content.cross_thread_links %}
+
<a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %}
+
{% elif item.type == "thread" %}
+
<!-- Conversation Thread -->
+
{% set outer_loop_index = loop.index0 %}
+
{% for thread_item in item.content %}
+
<article class="timeline-entry conversation-post level-{{ thread_item.thread_level }}">
+
<div class="timeline-meta">
+
<time datetime="{{ thread_item.entry.updated or thread_item.entry.published }}" class="timeline-time">
+
{{ (thread_item.entry.updated or thread_item.entry.published).strftime('%Y-%m-%d %H:%M') }}
+
{% set homepage = get_user_homepage(thread_item.username) %}
+
{% if thread_item.username not in seen_users %}
+
<a id="{{ thread_item.username }}" class="user-anchor"></a>
+
{% set _ = seen_users.append(thread_item.username) %}
+
<a id="post-{{ outer_loop_index }}-{{ loop.index0 }}-{{ safe_anchor_id(thread_item.entry.id) }}" class="post-anchor"></a>
+
<a href="{{ homepage }}" target="_blank" class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</a>
+
<span class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</span>
+
{% if thread_item.references_to or thread_item.referenced_by %}
+
<span class="reference-indicators">
+
{% if thread_item.references_to %}
+
<span class="ref-out" title="References other posts">→</span>
+
{% if thread_item.referenced_by %}
+
<span class="ref-in" title="Referenced by other posts">←</span>
+
<div class="timeline-content">
+
<strong class="timeline-title">
+
<a href="{{ thread_item.entry.link }}" target="_blank">{{ thread_item.entry.title }}</a>
+
{% if thread_item.entry.summary %}
+
<span class="timeline-summary">— {{ clean_html_summary(thread_item.entry.summary, 300) }}</span>
+
{% if thread_item.shared_references %}
+
<span class="inline-shared-refs">
+
{% for ref in thread_item.shared_references[:3] %}
+
{% if ref.target_username %}
+
<a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %}
+
{% if thread_item.shared_references|length > 3 %}
+
<span class="shared-ref-more">+{{ thread_item.shared_references|length - 3 }} more</span>
+
{% if thread_item.cross_thread_links %}
+
<div class="cross-thread-links">
+
<span class="cross-thread-indicator">🔗 Also appears: </span>
+
{% for link in thread_item.cross_thread_links %}
+
<a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %}
+
<file path="src/thicket/templates/users.html">
+
{% extends "base.html" %}
+
{% block page_title %}Users - {{ title }}{% endblock %}
+
<div class="page-content">
+
<p class="page-description">All users contributing to this thicket, ordered by post count.</p>
+
{% for user_info in users %}
+
<article class="user-card">
+
<div class="user-header">
+
{% if user_info.metadata.icon and user_info.metadata.icon != "None" %}
+
<img src="{{ user_info.metadata.icon }}" alt="{{ user_info.metadata.username }}" class="user-icon">
+
<div class="user-info">
+
{% if user_info.metadata.display_name %}
+
{{ user_info.metadata.display_name }}
+
<span class="username">({{ user_info.metadata.username }})</span>
+
{{ user_info.metadata.username }}
+
<div class="user-meta">
+
{% if user_info.metadata.homepage %}
+
<a href="{{ user_info.metadata.homepage }}" target="_blank">{{ user_info.metadata.homepage }}</a>
+
{% if user_info.metadata.email %}
+
<span class="separator">•</span>
+
<a href="mailto:{{ user_info.metadata.email }}">{{ user_info.metadata.email }}</a>
+
<span class="separator">•</span>
+
<span class="post-count">{{ user_info.metadata.entry_count }} posts</span>
+
{% if user_info.recent_entries %}
+
<div class="user-recent">
+
{% for display_name, entry in user_info.recent_entries %}
+
<a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a>
+
<time datetime="{{ entry.updated or entry.published }}">
+
({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }})
+
<file path="README.md">
+
A modern CLI tool for persisting Atom/RSS feeds in Git repositories, designed to enable distributed webblog comment structures.
+
- **Feed Auto-Discovery**: Automatically extracts user metadata from Atom/RSS feeds
+
- **Git Storage**: Stores feed entries in a Git repository with full history
+
- **Duplicate Management**: Manual curation of duplicate entries across feeds
+
- **Modern CLI**: Built with Typer and Rich for beautiful terminal output
+
- **Comprehensive Parsing**: Supports RSS 0.9x, RSS 1.0, RSS 2.0, and Atom feeds
+
- **Cron-Friendly**: Designed for scheduled execution
+
# Or install with dev dependencies
+
1. **Initialize a new thicket repository:**
+
thicket init ./my-feeds
+
2. **Add a user with their feed:**
+
thicket add user "alice" --feed "https://alice.example.com/feed.xml"
+
3. **Sync feeds to download entries:**
+
4. **List users and feeds:**
+
thicket init <git-store-path> [--cache-dir <path>] [--config <config-file>]
+
### Add Users and Feeds
+
# Add user with auto-discovery
+
thicket add user "username" --feed "https://example.com/feed.xml"
+
# Add user with manual metadata
+
thicket add user "username" \
+
--feed "https://example.com/feed.xml" \
+
--email "user@example.com" \
+
--homepage "https://example.com" \
+
--display-name "User Name"
+
# Add additional feed to existing user
+
thicket add feed "username" "https://example.com/other-feed.xml"
+
thicket sync --user "username"
+
# Dry run (preview changes)
+
thicket sync --all --dry-run
+
# List feeds for specific user
+
thicket list feeds --user "username"
+
thicket list entries --limit 20
+
# List entries for specific user
+
thicket list entries --user "username"
+
# List duplicate mappings
+
thicket duplicates list
+
# Mark entries as duplicates
+
thicket duplicates add "https://example.com/dup" "https://example.com/canonical"
+
# Remove duplicate mapping
+
thicket duplicates remove "https://example.com/dup"
+
Thicket uses a YAML configuration file (default: `thicket.yaml`):
+
git_store: ./feeds-repo
+
cache_dir: ~/.cache/thicket
+
- https://alice.example.com/feed.xml
+
email: alice@example.com
+
homepage: https://alice.example.com
+
## Git Repository Structure
+
├── index.json # User directory index
+
├── duplicates.json # Duplicate entry mappings
+
│ ├── metadata.json # User metadata
+
│ ├── entry_id_1.json # Feed entries
+
# Install in development mode
+
- **CLI**: Modern interface with Typer and Rich
+
- **Feed Processing**: Universal parsing with feedparser
+
- **Git Storage**: Structured storage with GitPython
+
- **Data Models**: Pydantic for validation and serialization
+
- **Async HTTP**: httpx for efficient feed fetching
+
- **Blog Aggregation**: Collect and archive blog posts from multiple sources
+
- **Comment Networks**: Enable distributed commenting systems
+
- **Feed Archival**: Preserve feed history beyond typical feed depth limits
+
- **Content Curation**: Manage and deduplicate content across feeds
+
MIT License - see LICENSE file for details.
+
<file path="src/thicket/cli/commands/index_cmd.py">
+
"""CLI command for building reference index from blog entries."""
+
from pathlib import Path
+
from typing import Optional
+
from rich.console import Console
+
from rich.progress import (
+
from rich.table import Table
+
from ...core.git_store import GitStore
+
from ...core.reference_parser import ReferenceIndex, ReferenceParser
+
from ..utils import get_tsv_mode, load_config
+
config_file: Optional[Path] = typer.Option(
+
help="Path to configuration file",
+
output_file: Optional[Path] = typer.Option(
+
help="Path to output index file (default: updates links.json in git store)",
+
verbose: bool = typer.Option(
+
help="Show detailed progress information",
+
"""Build a reference index showing which blog entries reference others.
+
This command analyzes all blog entries to detect cross-references between
+
different blogs, creating an index that can be used to build threaded
+
views of related content.
+
Updates the unified links.json file with reference data.
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
# Initialize reference parser
+
parser = ReferenceParser()
+
# Build user domain mapping
+
console.print("Building user domain mapping...")
+
user_domains = parser.build_user_domain_mapping(git_store)
+
console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")
+
# Initialize reference index
+
ref_index = ReferenceIndex()
+
ref_index.user_domains = user_domains
+
index = git_store._load_index()
+
users = list(index.users.keys())
+
console.print("[yellow]No users found in Git store[/yellow]")
+
TextColumn("[progress.description]{task.description}"),
+
# Count total entries first
+
counting_task = progress.add_task("Counting entries...", total=len(users))
+
entries = git_store.list_entries(username)
+
entry_counts[username] = len(entries)
+
total_entries += len(entries)
+
progress.advance(counting_task)
+
progress.remove_task(counting_task)
+
# Process entries - extract references
+
processing_task = progress.add_task(
+
f"Extracting references from {total_entries} entries...",
+
entries = git_store.list_entries(username)
+
# Extract references from this entry
+
references = parser.extract_references(entry, username, user_domains)
+
all_references.extend(references)
+
progress.advance(processing_task)
+
if verbose and references:
+
console.print(f" Found {len(references)} references in {username}:{entry.title[:50]}...")
+
progress.remove_task(processing_task)
+
# Resolve target_entry_ids for references
+
resolve_task = progress.add_task(
+
f"Resolving {len(all_references)} references...",
+
total=len(all_references)
+
console.print(f"Resolving target entry IDs for {len(all_references)} references...")
+
resolved_references = parser.resolve_target_entry_ids(all_references, git_store)
+
# Count resolved references
+
resolved_count = sum(1 for ref in resolved_references if ref.target_entry_id is not None)
+
console.print(f"Resolved {resolved_count} out of {len(all_references)} references")
+
# Add resolved references to index
+
for ref in resolved_references:
+
ref_index.add_reference(ref)
+
progress.advance(resolve_task)
+
progress.remove_task(resolve_task)
+
# Determine output path
+
output_path = output_file
+
output_path = config.git_store / "links.json"
+
# Load existing links data or create new structure
+
if output_path.exists() and not output_file:
+
# Load existing unified structure
+
with open(output_path) as f:
+
existing_data = json.load(f)
+
# Update with reference data
+
existing_data["references"] = ref_index.to_dict()["references"]
+
existing_data["user_domains"] = {k: list(v) for k, v in user_domains.items()}
+
# Save updated structure
+
with open(output_path, "w") as f:
+
json.dump(existing_data, f, indent=2, default=str)
+
console.print("\n[green]✓ Reference index built successfully[/green]")
+
# Create summary table or TSV output
+
print(f"Total Users\t{len(users)}")
+
print(f"Total Entries\t{total_entries}")
+
print(f"Total References\t{total_references}")
+
print(f"Outbound Refs\t{len(ref_index.outbound_refs)}")
+
print(f"Inbound Refs\t{len(ref_index.inbound_refs)}")
+
print(f"Output File\t{output_path}")
+
table = Table(title="Reference Index Summary")
+
table.add_column("Metric", style="cyan")
+
table.add_column("Count", style="green")
+
table.add_row("Total Users", str(len(users)))
+
table.add_row("Total Entries", str(total_entries))
+
table.add_row("Total References", str(total_references))
+
table.add_row("Outbound Refs", str(len(ref_index.outbound_refs)))
+
table.add_row("Inbound Refs", str(len(ref_index.inbound_refs)))
+
table.add_row("Output File", str(output_path))
+
# Show some interesting statistics
+
if total_references > 0:
+
console.print("\n[bold]Reference Statistics:[/bold]")
+
# Most referenced users
+
unresolved_domains = set()
+
for ref in ref_index.references:
+
if ref.target_username:
+
target_counts[ref.target_username] = target_counts.get(ref.target_username, 0) + 1
+
# Track unresolved domains
+
from urllib.parse import urlparse
+
domain = urlparse(ref.target_url).netloc.lower()
+
unresolved_domains.add(domain)
+
print("Referenced User\tReference Count")
+
for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
+
print(f"{username}\t{count}")
+
console.print("\nMost referenced users:")
+
for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
+
console.print(f" {username}: {count} references")
+
if unresolved_domains and verbose:
+
print("Unresolved Domain\tCount")
+
for domain in sorted(list(unresolved_domains)[:10]):
+
if len(unresolved_domains) > 10:
+
print(f"... and {len(unresolved_domains) - 10} more\t...")
+
console.print(f"\nUnresolved domains: {len(unresolved_domains)}")
+
for domain in sorted(list(unresolved_domains)[:10]):
+
console.print(f" {domain}")
+
if len(unresolved_domains) > 10:
+
console.print(f" ... and {len(unresolved_domains) - 10} more")
+
console.print(f"[red]Error building reference index: {e}[/red]")
+
console.print_exception()
+
config_file: Optional[Path] = typer.Option(
+
help="Path to configuration file",
+
index_file: Optional[Path] = typer.Option(
+
help="Path to reference index file (default: links.json in git store)",
+
username: Optional[str] = typer.Option(
+
help="Show threads for specific username only",
+
entry_id: Optional[str] = typer.Option(
+
help="Show thread for specific entry ID",
+
min_size: int = typer.Option(
+
help="Minimum thread size to display",
+
"""Show threaded view of related blog entries.
+
This command uses the reference index to show which blog entries
+
are connected through cross-references, creating an email-style
+
threaded view of the conversation.
+
Reads reference data from the unified links.json file.
+
config = load_config(config_file)
+
# Determine index file path
+
index_path = index_file
+
index_path = config.git_store / "links.json"
+
if not index_path.exists():
+
console.print(f"[red]Links file not found: {index_path}[/red]")
+
console.print("Run 'thicket links' and 'thicket index' first to build the reference index")
+
with open(index_path) as f:
+
unified_data = json.load(f)
+
# Check if references exist in the unified structure
+
if "references" not in unified_data:
+
console.print(f"[red]No references found in {index_path}[/red]")
+
console.print("Run 'thicket index' first to build the reference index")
+
# Extract reference data and reconstruct ReferenceIndex
+
ref_index = ReferenceIndex.from_dict({
+
"references": unified_data["references"],
+
"user_domains": unified_data.get("user_domains", {})
+
# Initialize Git store to get entry details
+
git_store = GitStore(config.git_store)
+
if entry_id and username:
+
thread_members = ref_index.get_thread_members(username, entry_id)
+
_display_thread(thread_members, ref_index, git_store, f"Thread for {username}:{entry_id}")
+
# Show all threads involving this user
+
user_index = git_store._load_index()
+
user = user_index.get_user(username)
+
console.print(f"[red]User not found: {username}[/red]")
+
entries = git_store.list_entries(username)
+
console.print(f"[bold]Threads involving {username}:[/bold]\n")
+
thread_members = ref_index.get_thread_members(username, entry.id)
+
if len(thread_members) >= min_size:
+
thread_key = tuple(sorted(thread_members))
+
if thread_key not in threads_found:
+
threads_found.add(thread_key)
+
_display_thread(thread_members, ref_index, git_store, f"Thread #{len(threads_found)}")
+
console.print("[bold]All conversation threads:[/bold]\n")
+
processed_entries = set()
+
user_index = git_store._load_index()
+
for username in user_index.users.keys():
+
entries = git_store.list_entries(username)
+
entry_key = (username, entry.id)
+
if entry_key in processed_entries:
+
thread_members = ref_index.get_thread_members(username, entry.id)
+
if len(thread_members) >= min_size:
+
thread_key = tuple(sorted(thread_members))
+
if thread_key not in all_threads:
+
all_threads.add(thread_key)
+
_display_thread(thread_members, ref_index, git_store, f"Thread #{len(all_threads)}")
+
# Mark all members as processed
+
for member in thread_members:
+
processed_entries.add(member)
+
console.print("[yellow]No conversation threads found[/yellow]")
+
console.print(f"(minimum thread size: {min_size})")
+
console.print(f"[red]Error showing threads: {e}[/red]")
+
def _display_thread(thread_members, ref_index, git_store, title):
+
"""Display a single conversation thread."""
+
console.print(f"[bold cyan]{title}[/bold cyan]")
+
console.print(f"Thread size: {len(thread_members)} entries")
+
# Get entry details for each member
+
for username, entry_id in thread_members:
+
entry = git_store.get_entry(username, entry_id)
+
thread_entries.append((username, entry))
+
# Sort by publication date
+
thread_entries.sort(key=lambda x: x[1].published or x[1].updated)
+
for i, (username, entry) in enumerate(thread_entries):
+
prefix = "├─" if i < len(thread_entries) - 1 else "└─"
+
# Get references for this entry
+
outbound = ref_index.get_outbound_refs(username, entry.id)
+
inbound = ref_index.get_inbound_refs(username, entry.id)
+
if outbound or inbound:
+
ref_info = f" ({len(outbound)} out, {len(inbound)} in)"
+
console.print(f" {prefix} [{username}] {entry.title[:60]}...{ref_info}")
+
console.print(f" Published: {entry.published.strftime('%Y-%m-%d')}")
+
console.print() # Empty line after each thread
+
<file path="src/thicket/cli/commands/info_cmd.py">
+
"""CLI command for displaying detailed information about a specific atom entry."""
+
from pathlib import Path
+
from typing import Optional
+
from rich.console import Console
+
from rich.panel import Panel
+
from rich.table import Table
+
from rich.text import Text
+
from ...core.git_store import GitStore
+
from ...core.reference_parser import ReferenceIndex
+
from ..utils import load_config, get_tsv_mode
+
identifier: str = typer.Argument(
+
help="The atom ID or URL of the entry to display information about"
+
username: Optional[str] = typer.Option(
+
help="Username to search for the entry (if not provided, searches all users)"
+
config_file: Optional[Path] = typer.Option(
+
help="Path to configuration file",
+
show_content: bool = typer.Option(
+
help="Include the full content of the entry in the output"
+
"""Display detailed information about a specific atom entry.
+
You can specify the entry using either its atom ID or URL.
+
Shows all metadata for the given entry, including title, dates, categories,
+
and summarizes all inbound and outbound links to/from other posts.
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
# Check if identifier looks like a URL
+
is_url = identifier.startswith(('http://', 'https://'))
+
# Search specific username
+
entries = git_store.list_entries(username)
+
if str(e.link) == identifier:
+
found_username = username
+
entry = git_store.get_entry(username, identifier)
+
found_username = username
+
index = git_store._load_index()
+
for user in index.users.keys():
+
entries = git_store.list_entries(user)
+
if str(e.link) == identifier:
+
entry = git_store.get_entry(user, identifier)
+
if not entry or not found_username:
+
console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found for user '{username}'[/red]")
+
console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found in any user's entries[/red]")
+
# Load reference index if available
+
links_path = config.git_store / "links.json"
+
if links_path.exists():
+
with open(links_path) as f:
+
unified_data = json.load(f)
+
# Check if references exist in the unified structure
+
if "references" in unified_data:
+
ref_index = ReferenceIndex.from_dict({
+
"references": unified_data["references"],
+
"user_domains": unified_data.get("user_domains", {})
+
_display_entry_info_tsv(entry, found_username, ref_index, show_content)
+
_display_entry_info(entry, found_username)
+
_display_link_info(entry, found_username, ref_index)
+
console.print("\n[yellow]No reference index found. Run 'thicket links' and 'thicket index' to build cross-reference data.[/yellow]")
+
# Optionally display content
+
if show_content and entry.content:
+
_display_content(entry.content)
+
console.print(f"[red]Error displaying entry info: {e}[/red]")
+
def _display_entry_info(entry, username: str) -> None:
+
"""Display basic entry information in a structured format."""
+
# Create main info panel
+
info_table = Table.grid(padding=(0, 2))
+
info_table.add_column("Field", style="cyan bold", width=15)
+
info_table.add_column("Value", style="white")
+
info_table.add_row("User", f"[green]{username}[/green]")
+
info_table.add_row("Atom ID", f"[blue]{entry.id}[/blue]")
+
info_table.add_row("Title", entry.title)
+
info_table.add_row("Link", str(entry.link))
+
info_table.add_row("Published", entry.published.strftime("%Y-%m-%d %H:%M:%S UTC"))
+
info_table.add_row("Updated", entry.updated.strftime("%Y-%m-%d %H:%M:%S UTC"))
+
# Truncate long summaries
+
summary = entry.summary[:200] + "..." if len(entry.summary) > 200 else entry.summary
+
info_table.add_row("Summary", summary)
+
categories_text = ", ".join(entry.categories)
+
info_table.add_row("Categories", categories_text)
+
if "name" in entry.author:
+
author_info.append(entry.author["name"])
+
if "email" in entry.author:
+
author_info.append(f"<{entry.author['email']}>")
+
info_table.add_row("Author", " ".join(author_info))
+
info_table.add_row("Content Type", entry.content_type)
+
info_table.add_row("Rights", entry.rights)
+
info_table.add_row("Source Feed", entry.source)
+
title=f"[bold]Entry Information[/bold]",
+
def _display_link_info(entry, username: str, ref_index: ReferenceIndex) -> None:
+
"""Display inbound and outbound link information."""
+
outbound_refs = ref_index.get_outbound_refs(username, entry.id)
+
inbound_refs = ref_index.get_inbound_refs(username, entry.id)
+
if not outbound_refs and not inbound_refs:
+
console.print("\n[dim]No cross-references found for this entry.[/dim]")
+
links_table = Table(title="Cross-References")
+
links_table.add_column("Direction", style="cyan", width=10)
+
links_table.add_column("Target/Source", style="green", width=20)
+
links_table.add_column("URL", style="blue", width=50)
+
# Add outbound references
+
for ref in outbound_refs:
+
target_info = f"{ref.target_username}:{ref.target_entry_id}" if ref.target_username and ref.target_entry_id else "External"
+
links_table.add_row("→ Out", target_info, ref.target_url)
+
# Add inbound references
+
for ref in inbound_refs:
+
source_info = f"{ref.source_username}:{ref.source_entry_id}"
+
links_table.add_row("← In", source_info, ref.target_url)
+
console.print(links_table)
+
console.print(f"\n[bold]Summary:[/bold] {len(outbound_refs)} outbound, {len(inbound_refs)} inbound references")
+
def _display_content(content: str) -> None:
+
"""Display the full content of the entry."""
+
# Truncate very long content
+
display_content = content
+
if len(content) > 5000:
+
display_content = content[:5000] + "\n\n[... content truncated ...]"
+
title="[bold]Entry Content[/bold]",
+
def _display_entry_info_tsv(entry, username: str, ref_index: Optional[ReferenceIndex], show_content: bool) -> None:
+
"""Display entry information in TSV format."""
+
print(f"User\t{username}")
+
print(f"Atom ID\t{entry.id}")
+
print(f"Title\t{entry.title.replace(chr(9), ' ').replace(chr(10), ' ').replace(chr(13), ' ')}")
+
print(f"Link\t{entry.link}")
+
print(f"Published\t{entry.published.strftime('%Y-%m-%d %H:%M:%S UTC')}")
+
print(f"Updated\t{entry.updated.strftime('%Y-%m-%d %H:%M:%S UTC')}")
+
# Escape tabs and newlines in summary
+
summary = entry.summary.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
+
print(f"Summary\t{summary}")
+
print(f"Categories\t{', '.join(entry.categories)}")
+
if "name" in entry.author:
+
author_info.append(entry.author["name"])
+
if "email" in entry.author:
+
author_info.append(f"<{entry.author['email']}>")
+
print(f"Author\t{' '.join(author_info)}")
+
print(f"Content Type\t{entry.content_type}")
+
print(f"Rights\t{entry.rights}")
+
print(f"Source Feed\t{entry.source}")
+
# Add reference info if available
+
outbound_refs = ref_index.get_outbound_refs(username, entry.id)
+
inbound_refs = ref_index.get_inbound_refs(username, entry.id)
+
print(f"Outbound References\t{len(outbound_refs)}")
+
print(f"Inbound References\t{len(inbound_refs)}")
+
for ref in outbound_refs:
+
target_info = f"{ref.target_username}:{ref.target_entry_id}" if ref.target_username and ref.target_entry_id else "External"
+
print(f"Outbound Reference\t{target_info}\t{ref.target_url}")
+
for ref in inbound_refs:
+
source_info = f"{ref.source_username}:{ref.source_entry_id}"
+
print(f"Inbound Reference\t{source_info}\t{ref.target_url}")
+
# Show content if requested
+
if show_content and entry.content:
+
# Escape tabs and newlines in content
+
content = entry.content.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
+
print(f"Content\t{content}")
+
<file path="src/thicket/cli/commands/init.py">
+
"""Initialize command for thicket."""
+
from pathlib import Path
+
from typing import Optional
+
from pydantic import ValidationError
+
from ...core.git_store import GitStore
+
from ...models import ThicketConfig
+
from ..utils import print_error, print_success, save_config
+
git_store: Path = typer.Argument(..., help="Path to Git repository for storing feeds"),
+
cache_dir: Optional[Path] = typer.Option(
+
None, "--cache-dir", "-c", help="Cache directory (default: ~/.cache/thicket)"
+
config_file: Optional[Path] = typer.Option(
+
None, "--config", help="Configuration file path (default: thicket.yaml)"
+
force: bool = typer.Option(
+
False, "--force", "-f", help="Overwrite existing configuration"
+
"""Initialize a new thicket configuration and Git store."""
+
from platformdirs import user_cache_dir
+
cache_dir = Path(user_cache_dir("thicket"))
+
if config_file is None:
+
config_file = Path("thicket.yaml")
+
# Check if config already exists
+
if config_file.exists() and not force:
+
print_error(f"Configuration file already exists: {config_file}")
+
print_error("Use --force to overwrite")
+
# Create cache directory
+
cache_dir.mkdir(parents=True, exist_ok=True)
+
print_success(f"Initialized Git store at: {git_store}")
+
print_error(f"Failed to initialize Git store: {e}")
+
raise typer.Exit(1) from e
+
config = ThicketConfig(
+
save_config(config, config_file)
+
print_success(f"Created configuration file: {config_file}")
+
except ValidationError as e:
+
print_error(f"Invalid configuration: {e}")
+
raise typer.Exit(1) from e
+
print_error(f"Failed to create configuration: {e}")
+
raise typer.Exit(1) from e
+
print_success("Thicket initialized successfully!")
+
print_success(f"Git store: {git_store}")
+
print_success(f"Cache directory: {cache_dir}")
+
print_success(f"Configuration: {config_file}")
+
print_success("Run 'thicket add user' to add your first user and feed.")
+
<file path="src/thicket/cli/__init__.py">
+
"""CLI interface for thicket."""
+
<file path="src/thicket/core/__init__.py">
+
"""Core business logic for thicket."""
+
from .feed_parser import FeedParser
+
from .git_store import GitStore
+
__all__ = ["FeedParser", "GitStore"]
+
<file path="src/thicket/core/feed_parser.py">
+
"""Feed parsing and normalization with auto-discovery."""
+
from datetime import datetime
+
from typing import Optional
+
from urllib.parse import urlparse
+
from pydantic import HttpUrl, ValidationError
+
from ..models import AtomEntry, FeedMetadata
+
"""Parser for RSS/Atom feeds with normalization and auto-discovery."""
+
def __init__(self, user_agent: str = "thicket/0.1.0"):
+
"""Initialize the feed parser."""
+
self.user_agent = user_agent
+
"a", "abbr", "acronym", "b", "blockquote", "br", "code", "em",
+
"i", "li", "ol", "p", "pre", "strong", "ul", "h1", "h2", "h3",
+
"h4", "h5", "h6", "img", "div", "span",
+
self.allowed_attributes = {
+
"a": ["href", "title"],
+
"img": ["src", "alt", "title", "width", "height"],
+
"blockquote": ["cite"],
+
async def fetch_feed(self, url: HttpUrl) -> str:
+
"""Fetch feed content from URL."""
+
async with httpx.AsyncClient() as client:
+
response = await client.get(
+
headers={"User-Agent": self.user_agent},
+
response.raise_for_status()
+
def parse_feed(self, content: str, source_url: Optional[HttpUrl] = None) -> tuple[FeedMetadata, list[AtomEntry]]:
+
"""Parse feed content and return metadata and entries."""
+
parsed = feedparser.parse(content)
+
if parsed.bozo and parsed.bozo_exception:
+
# Try to continue with potentially malformed feed
+
# Extract feed metadata
+
feed_meta = self._extract_feed_metadata(parsed.feed)
+
# Extract and normalize entries
+
for entry in parsed.entries:
+
atom_entry = self._normalize_entry(entry, source_url)
+
entries.append(atom_entry)
+
# Log error but continue processing other entries
+
print(f"Error processing entry {getattr(entry, 'id', 'unknown')}: {e}")
+
return feed_meta, entries
+
def _extract_feed_metadata(self, feed: feedparser.FeedParserDict) -> FeedMetadata:
+
"""Extract metadata from feed for auto-discovery."""
+
# Parse author information
+
if hasattr(feed, 'author_detail'):
+
author_name = feed.author_detail.get('name')
+
author_email = feed.author_detail.get('email')
+
author_uri = feed.author_detail.get('href')
+
elif hasattr(feed, 'author'):
+
author_name = feed.author
+
# Parse managing editor for RSS feeds
+
if not author_email and hasattr(feed, 'managingEditor'):
+
author_email = feed.managingEditor
+
if hasattr(feed, 'link'):
+
feed_link = HttpUrl(feed.link)
+
except ValidationError:
+
# Parse image/icon/logo
+
if hasattr(feed, 'image'):
+
image_url = HttpUrl(feed.image.get('href', feed.image.get('url', '')))
+
except (ValidationError, AttributeError):
+
if hasattr(feed, 'icon'):
+
icon = HttpUrl(feed.icon)
+
except ValidationError:
+
if hasattr(feed, 'logo'):
+
logo = HttpUrl(feed.logo)
+
except ValidationError:
+
title=getattr(feed, 'title', None),
+
author_name=author_name,
+
author_email=author_email,
+
author_uri=HttpUrl(author_uri) if author_uri else None,
+
description=getattr(feed, 'description', None),
+
def _normalize_entry(self, entry: feedparser.FeedParserDict, source_url: Optional[HttpUrl] = None) -> AtomEntry:
+
"""Normalize an entry to Atom format."""
+
updated = self._parse_timestamp(entry.get('updated_parsed') or entry.get('published_parsed'))
+
published = self._parse_timestamp(entry.get('published_parsed'))
+
content = self._extract_content(entry)
+
content_type = self._extract_content_type(entry)
+
author = self._extract_author(entry)
+
# Parse categories/tags
+
if hasattr(entry, 'tags'):
+
categories = [tag.get('term', '') for tag in entry.tags if tag.get('term')]
+
# Sanitize HTML content
+
content = self._sanitize_html(content)
+
summary = entry.get('summary', '')
+
summary = self._sanitize_html(summary)
+
id=entry.get('id', entry.get('link', '')),
+
title=entry.get('title', ''),
+
link=HttpUrl(entry.get('link', '')),
+
summary=summary or None,
+
content=content or None,
+
content_type=content_type,
+
rights=entry.get('rights', None),
+
source=str(source_url) if source_url else None,
+
def _parse_timestamp(self, time_struct) -> datetime:
+
"""Parse feedparser time struct to datetime."""
+
return datetime(*time_struct[:6])
+
def _extract_content(self, entry: feedparser.FeedParserDict) -> Optional[str]:
+
"""Extract the best content from an entry."""
+
# Prefer content over summary
+
if hasattr(entry, 'content') and entry.content:
+
# Find the best content (prefer text/html, then text/plain)
+
for content_item in entry.content:
+
if content_item.get('type') in ['text/html', 'html']:
+
return content_item.get('value', '')
+
elif content_item.get('type') in ['text/plain', 'text']:
+
return content_item.get('value', '')
+
# Fallback to first content item
+
return entry.content[0].get('value', '')
+
return entry.get('summary', '')
+
def _extract_content_type(self, entry: feedparser.FeedParserDict) -> str:
+
"""Extract content type from entry."""
+
if hasattr(entry, 'content') and entry.content:
+
content_type = entry.content[0].get('type', 'html')
+
# Normalize content type
+
if content_type in ['text/html', 'html']:
+
elif content_type in ['text/plain', 'text']:
+
elif content_type == 'xhtml':
+
def _extract_author(self, entry: feedparser.FeedParserDict) -> Optional[dict]:
+
"""Extract author information from entry."""
+
if hasattr(entry, 'author_detail'):
+
'name': entry.author_detail.get('name'),
+
'email': entry.author_detail.get('email'),
+
'uri': entry.author_detail.get('href'),
+
elif hasattr(entry, 'author'):
+
author['name'] = entry.author
+
return author if author else None
+
def _sanitize_html(self, html: str) -> str:
+
"""Sanitize HTML content to prevent XSS."""
+
tags=self.allowed_tags,
+
attributes=self.allowed_attributes,
+
def sanitize_entry_id(self, entry_id: str) -> str:
+
"""Sanitize entry ID to be a safe filename."""
+
# Parse URL to get meaningful parts
+
parsed = urlparse(entry_id)
+
# Start with the path component
+
# Remove leading slash and replace problematic characters
+
safe_id = parsed.path.lstrip('/').replace('/', '_').replace('\\', '_')
+
# Use the entire ID as fallback
+
# Replace problematic characters
+
if char.isalnum() or char in '-_.':
+
safe_chars.append(char)
+
safe_id = ''.join(safe_chars)
+
# Ensure it's not too long (max 200 chars)
+
safe_id = safe_id[:200]
+
# Ensure it's not empty
+
<file path="src/thicket/core/reference_parser.py">
+
"""Reference detection and parsing for blog entries."""
+
from typing import Optional
+
from urllib.parse import urlparse
+
from ..models import AtomEntry
+
"""Represents a reference from one blog entry to another."""
+
target_username: Optional[str] = None,
+
target_entry_id: Optional[str] = None,
+
self.source_entry_id = source_entry_id
+
self.source_username = source_username
+
self.target_url = target_url
+
self.target_username = target_username
+
self.target_entry_id = target_entry_id
+
def to_dict(self) -> dict:
+
"""Convert to dictionary for JSON serialization."""
+
"source_entry_id": self.source_entry_id,
+
"source_username": self.source_username,
+
"target_url": self.target_url,
+
# Only include optional fields if they are not None
+
if self.target_username is not None:
+
result["target_username"] = self.target_username
+
if self.target_entry_id is not None:
+
result["target_entry_id"] = self.target_entry_id
+
def from_dict(cls, data: dict) -> "BlogReference":
+
"""Create from dictionary."""
+
source_entry_id=data["source_entry_id"],
+
source_username=data["source_username"],
+
target_url=data["target_url"],
+
target_username=data.get("target_username"),
+
target_entry_id=data.get("target_entry_id"),
+
"""Index of blog-to-blog references for creating threaded views."""
+
self.references: list[BlogReference] = []
+
self.outbound_refs: dict[
+
str, list[BlogReference]
+
] = {} # entry_id -> outbound refs
+
self.inbound_refs: dict[
+
str, list[BlogReference]
+
] = {} # entry_id -> inbound refs
+
self.user_domains: dict[str, set[str]] = {} # username -> set of domains
+
def add_reference(self, ref: BlogReference) -> None:
+
"""Add a reference to the index."""
+
self.references.append(ref)
+
# Update outbound references
+
source_key = f"{ref.source_username}:{ref.source_entry_id}"
+
if source_key not in self.outbound_refs:
+
self.outbound_refs[source_key] = []
+
self.outbound_refs[source_key].append(ref)
+
# Update inbound references if we can identify the target
+
if ref.target_username and ref.target_entry_id:
+
target_key = f"{ref.target_username}:{ref.target_entry_id}"
+
if target_key not in self.inbound_refs:
+
self.inbound_refs[target_key] = []
+
self.inbound_refs[target_key].append(ref)
+
def get_outbound_refs(self, username: str, entry_id: str) -> list[BlogReference]:
+
"""Get all outbound references from an entry."""
+
key = f"{username}:{entry_id}"
+
return self.outbound_refs.get(key, [])
+
def get_inbound_refs(self, username: str, entry_id: str) -> list[BlogReference]:
+
"""Get all inbound references to an entry."""
+
key = f"{username}:{entry_id}"
+
return self.inbound_refs.get(key, [])
+
def get_thread_members(self, username: str, entry_id: str) -> set[tuple[str, str]]:
+
"""Get all entries that are part of the same thread."""
+
to_visit = [(username, entry_id)]
+
current_user, current_entry = to_visit.pop()
+
if (current_user, current_entry) in visited:
+
visited.add((current_user, current_entry))
+
thread_members.add((current_user, current_entry))
+
# Add outbound references
+
for ref in self.get_outbound_refs(current_user, current_entry):
+
if ref.target_username and ref.target_entry_id:
+
to_visit.append((ref.target_username, ref.target_entry_id))
+
# Add inbound references
+
for ref in self.get_inbound_refs(current_user, current_entry):
+
to_visit.append((ref.source_username, ref.source_entry_id))
+
def to_dict(self) -> dict:
+
"""Convert to dictionary for JSON serialization."""
+
"references": [ref.to_dict() for ref in self.references],
+
"user_domains": {k: list(v) for k, v in self.user_domains.items()},
+
def from_dict(cls, data: dict) -> "ReferenceIndex":
+
"""Create from dictionary."""
+
for ref_data in data.get("references", []):
+
ref = BlogReference.from_dict(ref_data)
+
index.add_reference(ref)
+
for username, domains in data.get("user_domains", {}).items():
+
index.user_domains[username] = set(domains)
+
"""Parses blog entries to detect references to other blogs."""
+
# Common blog platforms and patterns
+
r"https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*", # Common blog domains
+
r"https?://[^/]+\.github\.io/.*", # GitHub Pages
+
r"https?://[^/]+\.substack\.com/.*", # Substack
+
r"https?://medium\.com/.*", # Medium
+
r"https?://[^/]+\.wordpress\.com/.*", # WordPress.com
+
r"https?://[^/]+\.blogspot\.com/.*", # Blogger
+
# Compile regex patterns
+
self.link_pattern = re.compile(
+
r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL
+
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
+
def extract_links_from_html(self, html_content: str) -> list[tuple[str, str]]:
+
"""Extract all links from HTML content."""
+
# Extract links from <a> tags
+
for match in self.link_pattern.finditer(html_content):
+
r"<[^>]+>", "", match.group(2)
+
).strip() # Remove HTML tags from link text
+
links.append((url, text))
+
def is_blog_url(self, url: str) -> bool:
+
"""Check if a URL likely points to a blog post."""
+
for pattern in self.blog_patterns:
+
if re.match(pattern, url):
+
def _is_likely_blog_post_url(self, url: str) -> bool:
+
"""Check if a same-domain URL likely points to a blog post (not CSS, images, etc.)."""
+
parsed_url = urlparse(url)
+
path = parsed_url.path.lower()
+
# Skip obvious non-blog content
+
if any(path.endswith(ext) for ext in ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.pdf', '.xml', '.json']):
+
# Skip common non-blog paths
+
if any(segment in path for segment in ['/static/', '/assets/', '/css/', '/js/', '/images/', '/img/', '/media/', '/uploads/']):
+
# Skip fragment-only links (same page anchors)
+
if not path or path == '/':
+
# Look for positive indicators of blog posts
+
# Common blog post patterns: dates, slugs, post indicators
+
r'/\d{4}/', # Year in path
+
r'/\d{4}/\d{2}/', # Year/month in path
+
for pattern in blog_indicators:
+
if re.search(pattern, path):
+
# If it has a reasonable path depth and doesn't match exclusions, likely a blog post
+
path_segments = [seg for seg in path.split('/') if seg]
+
return len(path_segments) >= 1 # At least one meaningful path segment
+
def resolve_target_user(
+
self, url: str, user_domains: dict[str, set[str]]
+
"""Try to resolve a URL to a known user based on domain mapping."""
+
parsed_url = urlparse(url)
+
domain = parsed_url.netloc.lower()
+
for username, domains in user_domains.items():
+
def extract_references(
+
self, entry: AtomEntry, username: str, user_domains: dict[str, set[str]]
+
) -> list[BlogReference]:
+
"""Extract all blog references from an entry."""
+
# Combine all text content for analysis
+
content_to_search.append(entry.content)
+
content_to_search.append(entry.summary)
+
for content in content_to_search:
+
links = self.extract_links_from_html(content)
+
for url, _link_text in links:
+
urlparse(str(entry.link)).netloc.lower() if entry.link else ""
+
link_domain = urlparse(url).netloc.lower()
+
# Check if this looks like a blog URL
+
if not self.is_blog_url(url):
+
# For same-domain links, apply additional filtering to avoid non-blog content
+
if link_domain == entry_domain:
+
# Only include same-domain links that look like blog posts
+
if not self._is_likely_blog_post_url(url):
+
# Try to resolve to a known user
+
if link_domain == entry_domain:
+
# Same domain - target user is the same as source user
+
target_username: Optional[str] = username
+
# Different domain - try to resolve
+
target_username = self.resolve_target_user(url, user_domains)
+
source_entry_id=entry.id,
+
source_username=username,
+
target_username=target_username,
+
target_entry_id=None, # Will be resolved later if possible
+
def build_user_domain_mapping(self, git_store: "GitStore") -> dict[str, set[str]]:
+
"""Build mapping of usernames to their known domains."""
+
index = git_store._load_index()
+
for username, user_metadata in index.users.items():
+
# Add domains from feeds
+
for feed_url in user_metadata.feeds:
+
domain = urlparse(feed_url).netloc.lower()
+
# Add domain from homepage
+
if user_metadata.homepage:
+
domain = urlparse(str(user_metadata.homepage)).netloc.lower()
+
user_domains[username] = domains
+
def _build_url_to_entry_mapping(self, git_store: "GitStore") -> dict[str, str]:
+
"""Build a comprehensive mapping from URLs to entry IDs using git store data.
+
This creates a bidirectional mapping that handles:
+
- Entry link URLs -> Entry IDs
+
- URL variations (with/without www, http/https)
+
- Multiple URLs pointing to the same entry
+
url_to_entry: dict[str, str] = {}
+
# Load index to get all users
+
index = git_store._load_index()
+
for username in index.users.keys():
+
entries = git_store.list_entries(username)
+
link_url = str(entry.link)
+
# Map the canonical link URL
+
url_to_entry[link_url] = entry_id
+
# Handle common URL variations
+
parsed = urlparse(link_url)
+
if parsed.netloc and parsed.path:
+
# Add version without www
+
if parsed.netloc.startswith('www.'):
+
no_www_url = f"{parsed.scheme}://{parsed.netloc[4:]}{parsed.path}"
+
no_www_url += f"?{parsed.query}"
+
no_www_url += f"#{parsed.fragment}"
+
url_to_entry[no_www_url] = entry_id
+
# Add version with www if not present
+
elif not parsed.netloc.startswith('www.'):
+
www_url = f"{parsed.scheme}://www.{parsed.netloc}{parsed.path}"
+
www_url += f"?{parsed.query}"
+
www_url += f"#{parsed.fragment}"
+
url_to_entry[www_url] = entry_id
+
# Add http/https variations
+
if parsed.scheme == 'https':
+
http_url = link_url.replace('https://', 'http://', 1)
+
url_to_entry[http_url] = entry_id
+
elif parsed.scheme == 'http':
+
https_url = link_url.replace('http://', 'https://', 1)
+
url_to_entry[https_url] = entry_id
+
def _normalize_url(self, url: str) -> str:
+
"""Normalize URL for consistent matching.
+
Handles common variations like trailing slashes, fragments, etc.
+
# Remove trailing slash from path
+
path = parsed.path.rstrip('/') if parsed.path != '/' else parsed.path
+
# Reconstruct without fragment for consistent matching
+
normalized = f"{parsed.scheme}://{parsed.netloc}{path}"
+
normalized += f"?{parsed.query}"
+
def resolve_target_entry_ids(
+
self, references: list[BlogReference], git_store: "GitStore"
+
) -> list[BlogReference]:
+
"""Resolve target_entry_id for references using comprehensive URL mapping."""
+
# Build comprehensive URL to entry ID mapping
+
url_to_entry = self._build_url_to_entry_mapping(git_store)
+
# If we already have a target_entry_id, keep the reference as-is
+
if ref.target_entry_id is not None:
+
resolved_refs.append(ref)
+
# If we don't have a target_username, we can't resolve it
+
if ref.target_username is None:
+
resolved_refs.append(ref)
+
# Try to resolve using URL mapping
+
resolved_entry_id = None
+
# First, try exact match
+
if ref.target_url in url_to_entry:
+
resolved_entry_id = url_to_entry[ref.target_url]
+
# Try normalized URL matching
+
normalized_target = self._normalize_url(ref.target_url)
+
if normalized_target in url_to_entry:
+
resolved_entry_id = url_to_entry[normalized_target]
+
for mapped_url, entry_id in url_to_entry.items():
+
if self._normalize_url(mapped_url) == normalized_target:
+
resolved_entry_id = entry_id
+
# Verify the resolved entry belongs to the target username
+
# Double-check by loading the actual entry
+
entries = git_store.list_entries(ref.target_username)
+
entry_found = any(entry.id == resolved_entry_id for entry in entries)
+
resolved_entry_id = None
+
# Create a new reference with the resolved target_entry_id
+
resolved_ref = BlogReference(
+
source_entry_id=ref.source_entry_id,
+
source_username=ref.source_username,
+
target_url=ref.target_url,
+
target_username=ref.target_username,
+
target_entry_id=resolved_entry_id,
+
resolved_refs.append(resolved_ref)
+
<file path="src/thicket/models/__init__.py">
+
"""Data models for thicket."""
+
from .config import ThicketConfig, UserConfig
+
from .feed import AtomEntry, DuplicateMap, FeedMetadata
+
from .user import GitStoreIndex, UserMetadata
+
<file path="src/thicket/models/feed.py">
+
"""Feed and entry models for thicket."""
+
from datetime import datetime
+
from typing import TYPE_CHECKING, Optional
+
from pydantic import BaseModel, ConfigDict, EmailStr, HttpUrl
+
from .config import UserConfig
+
class AtomEntry(BaseModel):
+
"""Represents an Atom feed entry stored in the Git repository."""
+
model_config = ConfigDict(
+
json_encoders={datetime: lambda v: v.isoformat()},
+
str_strip_whitespace=True,
+
id: str # Original Atom ID
+
published: Optional[datetime] = None
+
summary: Optional[str] = None
+
content: Optional[str] = None # Full body content from Atom entry
+
content_type: Optional[str] = "html" # text, html, xhtml
+
author: Optional[dict] = None
+
categories: list[str] = []
+
rights: Optional[str] = None # Copyright info
+
source: Optional[str] = None # Source feed URL
+
class FeedMetadata(BaseModel):
+
"""Metadata extracted from a feed for auto-discovery."""
+
title: Optional[str] = None
+
author_name: Optional[str] = None
+
author_email: Optional[EmailStr] = None
+
author_uri: Optional[HttpUrl] = None
+
link: Optional[HttpUrl] = None
+
logo: Optional[HttpUrl] = None
+
icon: Optional[HttpUrl] = None
+
image_url: Optional[HttpUrl] = None
+
description: Optional[str] = None
+
def to_user_config(self, username: str, feed_url: HttpUrl) -> "UserConfig":
+
"""Convert discovered metadata to UserConfig with fallbacks."""
+
from .config import UserConfig
+
display_name=self.author_name or self.title,
+
email=self.author_email,
+
homepage=self.author_uri or self.link,
+
icon=self.logo or self.icon or self.image_url,
+
class DuplicateMap(BaseModel):
+
"""Maps duplicate entry IDs to canonical entry IDs."""
+
duplicates: dict[str, str] = {} # duplicate_id -> canonical_id
+
comment: str = "Entry IDs that map to the same canonical content"
+
def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
+
"""Add a duplicate mapping."""
+
self.duplicates[duplicate_id] = canonical_id
+
def remove_duplicate(self, duplicate_id: str) -> bool:
+
"""Remove a duplicate mapping. Returns True if existed."""
+
return self.duplicates.pop(duplicate_id, None) is not None
+
def get_canonical(self, entry_id: str) -> str:
+
"""Get canonical ID for an entry (returns original if not duplicate)."""
+
return self.duplicates.get(entry_id, entry_id)
+
def is_duplicate(self, entry_id: str) -> bool:
+
"""Check if entry ID is marked as duplicate."""
+
return entry_id in self.duplicates
+
def get_duplicates_for_canonical(self, canonical_id: str) -> list[str]:
+
"""Get all duplicate IDs that map to a canonical ID."""
+
for duplicate_id, canonical in self.duplicates.items()
+
if canonical == canonical_id
+
<file path="src/thicket/models/user.py">
+
"""User metadata models for thicket."""
+
from datetime import datetime
+
from typing import Optional
+
from pydantic import BaseModel, ConfigDict
+
class UserMetadata(BaseModel):
+
"""Metadata about a user stored in the Git repository."""
+
model_config = ConfigDict(
+
json_encoders={datetime: lambda v: v.isoformat()},
+
str_strip_whitespace=True,
+
display_name: Optional[str] = None
+
email: Optional[str] = None
+
homepage: Optional[str] = None
+
icon: Optional[str] = None
+
directory: str # Directory name in Git store
+
def update_timestamp(self) -> None:
+
"""Update the last_updated timestamp to now."""
+
self.last_updated = datetime.now()
+
def increment_entry_count(self, count: int = 1) -> None:
+
"""Increment the entry count by the given amount."""
+
self.entry_count += count
+
self.update_timestamp()
+
class GitStoreIndex(BaseModel):
+
"""Index of all users and their directories in the Git store."""
+
model_config = ConfigDict(
+
json_encoders={datetime: lambda v: v.isoformat()}
+
users: dict[str, UserMetadata] = {} # username -> UserMetadata
+
def add_user(self, user_metadata: UserMetadata) -> None:
+
"""Add or update a user in the index."""
+
self.users[user_metadata.username] = user_metadata
+
self.last_updated = datetime.now()
+
def remove_user(self, username: str) -> bool:
+
"""Remove a user from the index. Returns True if user existed."""
+
if username in self.users:
+
del self.users[username]
+
self.last_updated = datetime.now()
+
def get_user(self, username: str) -> Optional[UserMetadata]:
+
"""Get user metadata by username."""
+
return self.users.get(username)
+
def update_entry_count(self, username: str, count: int) -> None:
+
"""Update entry count for a user and total."""
+
user = self.get_user(username)
+
user.increment_entry_count(count)
+
self.total_entries += count
+
self.last_updated = datetime.now()
+
def recalculate_totals(self) -> None:
+
"""Recalculate total entries from all users."""
+
self.total_entries = sum(user.entry_count for user in self.users.values())
+
self.last_updated = datetime.now()
+
<file path="src/thicket/utils/__init__.py">
+
"""Utility modules for thicket."""
+
# This module will contain shared utilities
+
# For now, it's empty but can be expanded with common functions
+
<file path="src/thicket/__init__.py">
+
"""Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories."""
+
__email__ = "thicket@example.com"
+
<file path="src/thicket/__main__.py">
+
"""Entry point for running thicket as a module."""
+
from .cli.main import app
+
if __name__ == "__main__":
+
<file path=".gitignore">
+
# Byte-compiled / optimized / DLL files
+
# Distribution / packaging
+
# Usually these files are written by a python script from a template
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
+
pip-delete-this-directory.txt
+
# Unit test / coverage reports
+
# For a library or package, you might want to ignore these files since the code is
+
# intended to run in multiple environments; otherwise, check them in:
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
+
# install all needed dependencies.
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
+
# commonly ignored for libraries.
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
+
# commonly ignored for libraries.
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+
# in the .venv directory. It is recommended not to include this directory in version control.
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+
# SageMath parsed files
+
# Spyder project settings
+
# Rope project settings
+
# pytype static type analyzer
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+
# Abstra is an AI-powered process automation framework.
+
# Ignore directories containing user credentials, local state, and settings.
+
# Learn more at https://abstra.io/docs
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
+
# you could uncomment the following to ignore the entire vscode folder
+
# PyPI configuration file
+
.streamlit/secrets.toml
+
<file path="CLAUDE.md">
+
My goal is to build a CLI tool called thicket in Python that maintains a Git repository within which Atom feeds can be persisted, including their contents.
+
# Python Environment and Package Management
+
This project uses `uv` for Python package management and virtual environment handling.
+
ALWAYS use `uv run` to execute Python commands:
+
- Run the CLI: `uv run -m thicket`
+
- Run tests: `uv run pytest`
+
- Type checking: `uv run mypy src/`
+
- Linting: `uv run ruff check src/`
+
- Format code: `uv run ruff format src/`
+
- Compile check: `uv run python -m py_compile <file>`
+
- Add dependencies: `uv add <package>`
+
- Add dev dependencies: `uv add --dev <package>`
+
- Install dependencies: `uv sync`
+
- Update dependencies: `uv lock --upgrade`
+
The configuration file specifies:
+
- the location of a git store
+
- a list of usernames and target Atom/RSS feed(s) and optional metadata about the username such as their email, homepage, icon and display name
+
- a cache directory to store temporary results such as feed downloads and their last modification date that speed up operations across runs of the tool
+
The Git data store should:
+
- have a subdirectory per user
+
- within that directory, an entry per Atom entry indexed by the Atom id for that entry. The id should be sanitised consistently to be a safe filename. RSS feed should be normalized to Atom before storing it.
+
- within each entry file, the metadata of the Atom feed converted into a JSON format that preserves as much metadata as possible.
+
- have a JSON file in the Git repository that indexes the users, their associated directories within the Git repository, and any other metadata about that user from the config file
+
The CLI should be modern and use cool progress bars and any otfrom ecosystem libraries.
+
The intention behind the Git repository is that it can be queried by other websites in order to build a webblog structure of comments that link to other blogs.
+
<file path="pyproject.toml">
+
requires = ["hatchling"]
+
build-backend = "hatchling.build"
+
description = "A CLI tool for persisting Atom/RSS feeds in Git repositories"
+
requires-python = ">=3.9"
+
{name = "thicket", email = "thicket@example.com"},
+
"Development Status :: 3 - Alpha",
+
"Intended Audience :: Developers",
+
"License :: OSI Approved :: MIT License",
+
"Operating System :: OS Independent",
+
"Programming Language :: Python :: 3",
+
"Programming Language :: Python :: 3.9",
+
"Programming Language :: Python :: 3.10",
+
"Programming Language :: Python :: 3.11",
+
"Programming Language :: Python :: 3.12",
+
"Programming Language :: Python :: 3.13",
+
"Topic :: Internet :: WWW/HTTP :: Dynamic Content :: News/Diary",
+
"Topic :: Software Development :: Version Control :: Git",
+
"Topic :: Text Processing :: Markup :: XML",
+
"pydantic-settings>=2.10.0",
+
[project.optional-dependencies]
+
"pytest-asyncio>=0.24.0",
+
Homepage = "https://github.com/example/thicket"
+
Documentation = "https://github.com/example/thicket"
+
Repository = "https://github.com/example/thicket"
+
"Bug Tracker" = "https://github.com/example/thicket/issues"
+
thicket = "thicket.cli.main:app"
+
path = "src/thicket/__init__.py"
+
[tool.hatch.build.targets.wheel]
+
packages = ["src/thicket"]
+
target-version = ['py39']
+
target-version = "py39"
+
"E", # pycodestyle errors
+
"W", # pycodestyle warnings
+
"C4", # flake8-comprehensions
+
"E501", # line too long, handled by black
+
"B008", # do not perform function calls in argument defaults
+
[tool.ruff.lint.per-file-ignores]
+
"__init__.py" = ["F401"]
+
check_untyped_defs = true
+
disallow_any_generics = true
+
disallow_incomplete_defs = true
+
disallow_untyped_defs = true
+
no_implicit_optional = true
+
warn_redundant_casts = true
+
warn_unused_ignores = true
+
[[tool.mypy.overrides]]
+
ignore_missing_imports = true
+
[tool.pytest.ini_options]
+
python_files = ["test_*.py"]
+
python_classes = ["Test*"]
+
python_functions = ["test_*"]
+
"--cov-report=term-missing",
+
"ignore::DeprecationWarning",
+
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
+
"integration: marks tests as integration tests",
+
"raise AssertionError",
+
"raise NotImplementedError",
+
"if __name__ == .__main__.:",
+
"class .*\\bProtocol\\):",
+
"@(abc\\.)?abstractmethod",
+
<file path="src/thicket/cli/commands/__init__.py">
+
"""CLI commands for thicket."""
+
# Import all commands to register them with the main app
+
from . import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync
+
__all__ = ["add", "duplicates", "generate", "index_cmd", "info_cmd", "init", "links_cmd", "list_cmd", "sync"]
+
<file path="src/thicket/cli/commands/add.py">
+
"""Add command for thicket."""
+
from pathlib import Path
+
from typing import Optional
+
from pydantic import HttpUrl, ValidationError
+
from ...core.feed_parser import FeedParser
+
from ...core.git_store import GitStore
+
subcommand: str = typer.Argument(..., help="Subcommand: 'user' or 'feed'"),
+
username: str = typer.Argument(..., help="Username"),
+
feed_url: Optional[str] = typer.Argument(None, help="Feed URL (required for 'user' command)"),
+
email: Optional[str] = typer.Option(None, "--email", "-e", help="User email"),
+
homepage: Optional[str] = typer.Option(None, "--homepage", "-h", help="User homepage"),
+
icon: Optional[str] = typer.Option(None, "--icon", "-i", help="User icon URL"),
+
display_name: Optional[str] = typer.Option(None, "--display-name", "-d", help="User display name"),
+
config_file: Optional[Path] = typer.Option(
+
Path("thicket.yaml"), "--config", help="Configuration file path"
+
auto_discover: bool = typer.Option(
+
True, "--auto-discover/--no-auto-discover", help="Auto-discover user metadata from feed"
+
"""Add a user or feed to thicket."""
+
if subcommand == "user":
+
add_user(username, feed_url, email, homepage, icon, display_name, config_file, auto_discover)
+
elif subcommand == "feed":
+
add_feed(username, feed_url, config_file)
+
print_error(f"Unknown subcommand: {subcommand}")
+
print_error("Use 'user' or 'feed'")
+
feed_url: Optional[str],
+
homepage: Optional[str],
+
display_name: Optional[str],
+
"""Add a new user with feed."""
+
print_error("Feed URL is required when adding a user")
+
validated_feed_url = HttpUrl(feed_url)
+
except ValidationError:
+
print_error(f"Invalid feed URL: {feed_url}")
+
raise typer.Exit(1) from None
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
# Check if user already exists
+
existing_user = git_store.get_user(username)
+
print_error(f"User '{username}' already exists")
+
print_error("Use 'thicket add feed' to add additional feeds")
+
# Auto-discover metadata if enabled
+
discovered_metadata = None
+
discovered_metadata = asyncio.run(discover_feed_metadata(validated_feed_url))
+
# Prepare user data with manual overrides taking precedence
+
user_display_name = display_name or (discovered_metadata.author_name or discovered_metadata.title if discovered_metadata else None)
+
user_email = email or (discovered_metadata.author_email if discovered_metadata else None)
+
user_homepage = homepage or (str(discovered_metadata.author_uri or discovered_metadata.link) if discovered_metadata else None)
+
user_icon = icon or (str(discovered_metadata.logo or discovered_metadata.icon or discovered_metadata.image_url) if discovered_metadata else None)
+
# Add user to Git store
+
display_name=user_display_name,
+
homepage=user_homepage,
+
feeds=[str(validated_feed_url)],
+
git_store.commit_changes(f"Add user: {username}")
+
print_success(f"Added user '{username}' with feed: {feed_url}")
+
if discovered_metadata and auto_discover:
+
print_info("Auto-discovered metadata:")
+
print_info(f" Display name: {user_display_name}")
+
print_info(f" Email: {user_email}")
+
print_info(f" Homepage: {user_homepage}")
+
print_info(f" Icon: {user_icon}")
+
def add_feed(username: str, feed_url: Optional[str], config_file: Path) -> None:
+
"""Add a feed to an existing user."""
+
print_error("Feed URL is required")
+
validated_feed_url = HttpUrl(feed_url)
+
except ValidationError:
+
print_error(f"Invalid feed URL: {feed_url}")
+
raise typer.Exit(1) from None
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
user = git_store.get_user(username)
+
print_error(f"User '{username}' not found")
+
print_error("Use 'thicket add user' to add a new user")
+
# Check if feed already exists
+
if str(validated_feed_url) in user.feeds:
+
print_error(f"Feed already exists for user '{username}': {feed_url}")
+
updated_feeds = user.feeds + [str(validated_feed_url)]
+
if git_store.update_user(username, feeds=updated_feeds):
+
git_store.commit_changes(f"Add feed to user {username}: {feed_url}")
+
print_success(f"Added feed to user '{username}': {feed_url}")
+
print_error(f"Failed to add feed to user '{username}'")
+
async def discover_feed_metadata(feed_url: HttpUrl):
+
"""Discover metadata from a feed URL."""
+
with create_progress() as progress:
+
task = progress.add_task("Discovering feed metadata...", total=None)
+
content = await parser.fetch_feed(feed_url)
+
metadata, _ = parser.parse_feed(content, feed_url)
+
progress.update(task, completed=True)
+
print_error(f"Failed to discover feed metadata: {e}")
+
<file path="src/thicket/cli/commands/duplicates.py">
+
"""Duplicates command for thicket."""
+
from pathlib import Path
+
from typing import Optional
+
from rich.table import Table
+
from ...core.git_store import GitStore
+
@app.command("duplicates")
+
def duplicates_command(
+
action: str = typer.Argument(..., help="Action: 'list', 'add', 'remove'"),
+
duplicate_id: Optional[str] = typer.Argument(None, help="Duplicate entry ID"),
+
canonical_id: Optional[str] = typer.Argument(None, help="Canonical entry ID"),
+
config_file: Optional[Path] = typer.Option(
+
Path("thicket.yaml"), "--config", help="Configuration file path"
+
"""Manage duplicate entry mappings."""
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
list_duplicates(git_store)
+
add_duplicate(git_store, duplicate_id, canonical_id)
+
elif action == "remove":
+
remove_duplicate(git_store, duplicate_id)
+
print_error(f"Unknown action: {action}")
+
print_error("Use 'list', 'add', or 'remove'")
+
def list_duplicates(git_store: GitStore) -> None:
+
"""List all duplicate mappings."""
+
duplicates = git_store.get_duplicates()
+
if not duplicates.duplicates:
+
print("No duplicate mappings found")
+
print_info("No duplicate mappings found")
+
print("Duplicate ID\tCanonical ID")
+
for duplicate_id, canonical_id in duplicates.duplicates.items():
+
print(f"{duplicate_id}\t{canonical_id}")
+
print(f"Total duplicates: {len(duplicates.duplicates)}")
+
table = Table(title="Duplicate Entry Mappings")
+
table.add_column("Duplicate ID", style="red")
+
table.add_column("Canonical ID", style="green")
+
for duplicate_id, canonical_id in duplicates.duplicates.items():
+
table.add_row(duplicate_id, canonical_id)
+
print_info(f"Total duplicates: {len(duplicates.duplicates)}")
+
def add_duplicate(git_store: GitStore, duplicate_id: Optional[str], canonical_id: Optional[str]) -> None:
+
"""Add a duplicate mapping."""
+
print_error("Duplicate ID is required")
+
print_error("Canonical ID is required")
+
# Check if duplicate_id already exists
+
duplicates = git_store.get_duplicates()
+
if duplicates.is_duplicate(duplicate_id):
+
existing_canonical = duplicates.get_canonical(duplicate_id)
+
print_error(f"Duplicate ID already mapped to: {existing_canonical}")
+
print_error("Use 'remove' first to change the mapping")
+
# Check if we're trying to make a canonical ID point to itself
+
if duplicate_id == canonical_id:
+
print_error("Duplicate ID cannot be the same as canonical ID")
+
git_store.add_duplicate(duplicate_id, canonical_id)
+
git_store.commit_changes(f"Add duplicate mapping: {duplicate_id} -> {canonical_id}")
+
print_success(f"Added duplicate mapping: {duplicate_id} -> {canonical_id}")
+
def remove_duplicate(git_store: GitStore, duplicate_id: Optional[str]) -> None:
+
"""Remove a duplicate mapping."""
+
print_error("Duplicate ID is required")
+
# Check if mapping exists
+
duplicates = git_store.get_duplicates()
+
if not duplicates.is_duplicate(duplicate_id):
+
print_error(f"No duplicate mapping found for: {duplicate_id}")
+
canonical_id = duplicates.get_canonical(duplicate_id)
+
if git_store.remove_duplicate(duplicate_id):
+
git_store.commit_changes(f"Remove duplicate mapping: {duplicate_id} -> {canonical_id}")
+
print_success(f"Removed duplicate mapping: {duplicate_id} -> {canonical_id}")
+
print_error(f"Failed to remove duplicate mapping: {duplicate_id}")
+
<file path="src/thicket/cli/commands/sync.py">
+
"""Sync command for thicket."""
+
from pathlib import Path
+
from typing import Optional
+
from rich.progress import track
+
from ...core.feed_parser import FeedParser
+
from ...core.git_store import GitStore
+
all_users: bool = typer.Option(
+
False, "--all", "-a", help="Sync all users and feeds"
+
user: Optional[str] = typer.Option(
+
None, "--user", "-u", help="Sync specific user only"
+
config_file: Optional[Path] = typer.Option(
+
Path("thicket.yaml"), "--config", help="Configuration file path"
+
dry_run: bool = typer.Option(
+
False, "--dry-run", help="Show what would be synced without making changes"
+
"""Sync feeds and store entries in Git repository."""
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
# Determine which users to sync from git repository
+
index = git_store._load_index()
+
users_to_sync = list(index.users.values())
+
user_metadata = git_store.get_user(user)
+
print_error(f"User '{user}' not found in git repository")
+
users_to_sync = [user_metadata]
+
print_error("Specify --all to sync all users or --user to sync a specific user")
+
print_info("No users configured to sync")
+
total_updated_entries = 0
+
for user_metadata in users_to_sync:
+
print_info(f"Syncing user: {user_metadata.username}")
+
user_updated_entries = 0
+
# Sync each feed for the user
+
for feed_url in track(user_metadata.feeds, description=f"Syncing {user_metadata.username}'s feeds"):
+
new_entries, updated_entries = asyncio.run(
+
sync_feed(git_store, user_metadata.username, feed_url, dry_run)
+
user_new_entries += new_entries
+
user_updated_entries += updated_entries
+
print_error(f"Failed to sync feed {feed_url}: {e}")
+
print_info(f"User {user_metadata.username}: {user_new_entries} new, {user_updated_entries} updated")
+
total_new_entries += user_new_entries
+
total_updated_entries += user_updated_entries
+
# Commit changes if not dry run
+
if not dry_run and (total_new_entries > 0 or total_updated_entries > 0):
+
commit_message = f"Sync feeds: {total_new_entries} new entries, {total_updated_entries} updated"
+
git_store.commit_changes(commit_message)
+
print_success(f"Committed changes: {commit_message}")
+
print_info(f"Dry run complete: would sync {total_new_entries} new entries, {total_updated_entries} updated")
+
print_success(f"Sync complete: {total_new_entries} new entries, {total_updated_entries} updated")
+
async def sync_feed(git_store: GitStore, username: str, feed_url, dry_run: bool) -> tuple[int, int]:
+
"""Sync a single feed for a user."""
+
content = await parser.fetch_feed(feed_url)
+
metadata, entries = parser.parse_feed(content, feed_url)
+
# Check if entry already exists
+
existing_entry = git_store.get_entry(username, entry.id)
+
# Check if entry has been updated
+
if existing_entry.updated != entry.updated:
+
git_store.store_entry(username, entry)
+
git_store.store_entry(username, entry)
+
print_error(f"Failed to process entry {entry.id}: {e}")
+
return new_entries, updated_entries
+
print_error(f"Failed to sync feed {feed_url}: {e}")
+
<file path="src/thicket/models/config.py">
+
"""Configuration models for thicket."""
+
from pathlib import Path
+
from typing import Optional
+
from pydantic import BaseModel, EmailStr, HttpUrl
+
from pydantic_settings import BaseSettings, SettingsConfigDict
+
class UserConfig(BaseModel):
+
"""Configuration for a single user and their feeds."""
+
email: Optional[EmailStr] = None
+
homepage: Optional[HttpUrl] = None
+
icon: Optional[HttpUrl] = None
+
display_name: Optional[str] = None
+
class ThicketConfig(BaseSettings):
+
"""Main configuration for thicket."""
+
model_config = SettingsConfigDict(
+
yaml_file="thicket.yaml",
+
users: list[UserConfig] = []
+
<file path="src/thicket/cli/commands/links_cmd.py">
+
"""CLI command for extracting and categorizing all outbound links from blog entries."""
+
from pathlib import Path
+
from typing import Dict, List, Optional, Set
+
from urllib.parse import urljoin, urlparse
+
from rich.console import Console
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+
from rich.table import Table
+
from ...core.git_store import GitStore
+
from ..utils import load_config, get_tsv_mode
+
"""Represents a link found in a blog entry."""
+
def __init__(self, url: str, entry_id: str, username: str):
+
self.entry_id = entry_id
+
self.username = username
+
def to_dict(self) -> dict:
+
"""Convert to dictionary for JSON serialization."""
+
"entry_id": self.entry_id,
+
"username": self.username
+
def from_dict(cls, data: dict) -> "LinkData":
+
"""Create from dictionary."""
+
entry_id=data["entry_id"],
+
username=data["username"]
+
"""Categorizes links as internal, user, or unknown."""
+
def __init__(self, user_domains: Dict[str, Set[str]]):
+
self.user_domains = user_domains
+
# Create reverse mapping of domain -> username
+
self.domain_to_user = {}
+
for username, domains in user_domains.items():
+
self.domain_to_user[domain] = username
+
def categorize_url(self, url: str, source_username: str) -> tuple[str, Optional[str]]:
+
Categorize a URL as 'internal', 'user', or 'unknown'.
+
Returns (category, target_username).
+
domain = parsed.netloc.lower()
+
# Check if it's a link to the same user's domain (internal)
+
if domain in self.user_domains.get(source_username, set()):
+
return "internal", source_username
+
# Check if it's a link to another user's domain
+
if domain in self.domain_to_user:
+
return "user", self.domain_to_user[domain]
+
# Everything else is unknown
+
"""Extracts and resolves links from blog entries."""
+
# Pattern for extracting links from HTML
+
self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
+
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
+
def extract_links_from_html(self, html_content: str, base_url: str) -> List[tuple[str, str]]:
+
"""Extract all links from HTML content and resolve them against base URL."""
+
# Extract links from <a> tags
+
for match in self.link_pattern.finditer(html_content):
+
text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags from link text
+
# Resolve relative URLs against base URL
+
resolved_url = urljoin(base_url, url)
+
links.append((resolved_url, text))
+
def extract_links_from_entry(self, entry, username: str, base_url: str) -> List[LinkData]:
+
"""Extract all links from a blog entry."""
+
# Combine all text content for analysis
+
content_to_search.append(entry.content)
+
content_to_search.append(entry.summary)
+
for content in content_to_search:
+
extracted_links = self.extract_links_from_html(content, base_url)
+
for url, link_text in extracted_links:
+
if not url or url.startswith('#'):
+
links.append(link_data)
+
config_file: Optional[Path] = typer.Option(
+
help="Path to configuration file",
+
output_file: Optional[Path] = typer.Option(
+
help="Path to output unified links file (default: links.json in git store)",
+
verbose: bool = typer.Option(
+
help="Show detailed progress information",
+
"""Extract and categorize all outbound links from blog entries.
+
This command analyzes all blog entries to extract outbound links,
+
resolve them properly with respect to the feed's base URL, and
+
categorize them as internal, user, or unknown links.
+
Creates a unified links.json file containing all link data.
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
# Build user domain mapping
+
console.print("Building user domain mapping...")
+
index = git_store._load_index()
+
for username, user_metadata in index.users.items():
+
# Add domains from feeds
+
for feed_url in user_metadata.feeds:
+
domain = urlparse(feed_url).netloc.lower()
+
# Add domain from homepage
+
if user_metadata.homepage:
+
domain = urlparse(str(user_metadata.homepage)).netloc.lower()
+
user_domains[username] = domains
+
console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")
+
# Initialize components
+
link_extractor = LinkExtractor()
+
categorizer = LinkCategorizer(user_domains)
+
users = list(index.users.keys())
+
console.print("[yellow]No users found in Git store[/yellow]")
+
link_categories = {"internal": [], "user": [], "unknown": []}
+
link_dict = {} # Dictionary with link URL as key, maps to list of atom IDs
+
reverse_dict = {} # Dictionary with atom ID as key, maps to list of URLs
+
TextColumn("[progress.description]{task.description}"),
+
# Count total entries first
+
counting_task = progress.add_task("Counting entries...", total=len(users))
+
entries = git_store.list_entries(username)
+
total_entries += len(entries)
+
progress.advance(counting_task)
+
progress.remove_task(counting_task)
+
processing_task = progress.add_task(
+
f"Processing {total_entries} entries...",
+
entries = git_store.list_entries(username)
+
user_metadata = index.users[username]
+
# Get base URL for this user (use first feed URL)
+
base_url = str(user_metadata.feeds[0]) if user_metadata.feeds else "https://example.com"
+
# Extract links from this entry
+
entry_links = link_extractor.extract_links_from_entry(entry, username, base_url)
+
# Track unique links per entry
+
entry_urls_seen = set()
+
for link_data in entry_links:
+
# Skip if we've already seen this URL in this entry
+
if link_data.url in entry_urls_seen:
+
entry_urls_seen.add(link_data.url)
+
category, target_username = categorizer.categorize_url(link_data.url, username)
+
# Add to link dictionary (URL as key, maps to list of atom IDs)
+
if link_data.url not in link_dict:
+
link_dict[link_data.url] = []
+
if link_data.entry_id not in link_dict[link_data.url]:
+
link_dict[link_data.url].append(link_data.entry_id)
+
# Also add to reverse mapping (atom ID -> list of URLs)
+
if link_data.entry_id not in reverse_dict:
+
reverse_dict[link_data.entry_id] = []
+
if link_data.url not in reverse_dict[link_data.entry_id]:
+
reverse_dict[link_data.entry_id].append(link_data.url)
+
# Add category info to link data for categories tracking
+
link_info = link_data.to_dict()
+
link_info["category"] = category
+
link_info["target_username"] = target_username
+
all_links.append(link_info)
+
link_categories[category].append(link_info)
+
progress.advance(processing_task)
+
if verbose and entry_links:
+
console.print(f" Found {len(entry_links)} links in {username}:{entry.title[:50]}...")
+
# Determine output path
+
output_path = output_file
+
output_path = config.git_store / "links.json"
+
# Save all extracted links (not just filtered ones)
+
console.print("Preparing output data...")
+
# Build a set of all URLs that correspond to posts in the git database
+
registered_urls = set()
+
# Get all entries from all users and build URL mappings
+
entries = git_store.list_entries(username)
+
user_metadata = index.users[username]
+
# Try to match entry URLs with extracted links
+
if hasattr(entry, 'link') and entry.link:
+
registered_urls.add(str(entry.link))
+
# Also check entry alternate links if they exist
+
if hasattr(entry, 'links') and entry.links:
+
for link in entry.links:
+
if hasattr(link, 'href') and link.href:
+
registered_urls.add(str(link.href))
+
# Build unified structure with metadata
+
for url, entry_ids in link_dict.items():
+
"referencing_entries": entry_ids
+
# Find target username if this is a tracked post
+
if url in registered_urls:
+
user_domains_set = {domain for domain in user_domains.get(username, [])}
+
if any(domain in url for domain in user_domains_set):
+
unified_links[url]["target_username"] = username
+
# Build reverse mapping
+
for entry_id in entry_ids:
+
if entry_id not in reverse_mapping:
+
reverse_mapping[entry_id] = []
+
if url not in reverse_mapping[entry_id]:
+
reverse_mapping[entry_id].append(url)
+
# Create unified output data
+
"links": unified_links,
+
"reverse_mapping": reverse_mapping,
+
"user_domains": {k: list(v) for k, v in user_domains.items()}
+
console.print(f"Found {len(registered_urls)} registered post URLs")
+
console.print(f"Found {len(link_dict)} total links, {sum(1 for link in unified_links.values() if 'target_username' in link)} tracked posts")
+
with open(output_path, "w") as f:
+
json.dump(output_data, f, indent=2, default=str)
+
console.print("\n[green]✓ Links extraction completed successfully[/green]")
+
# Create summary table or TSV output
+
print("Category\tCount\tDescription")
+
print(f"Internal\t{len(link_categories['internal'])}\tLinks to same user's domain")
+
print(f"User\t{len(link_categories['user'])}\tLinks to other tracked users")
+
print(f"Unknown\t{len(link_categories['unknown'])}\tLinks to external sites")
+
print(f"Total Extracted\t{len(all_links)}\tAll extracted links")
+
print(f"Saved to Output\t{len(output_data['links'])}\tLinks saved to output file")
+
print(f"Cross-references\t{sum(1 for link in unified_links.values() if 'target_username' in link)}\tLinks to registered posts only")
+
table = Table(title="Links Summary")
+
table.add_column("Category", style="cyan")
+
table.add_column("Count", style="green")
+
table.add_column("Description", style="white")
+
table.add_row("Internal", str(len(link_categories["internal"])), "Links to same user's domain")
+
table.add_row("User", str(len(link_categories["user"])), "Links to other tracked users")
+
table.add_row("Unknown", str(len(link_categories["unknown"])), "Links to external sites")
+
table.add_row("Total Extracted", str(len(all_links)), "All extracted links")
+
table.add_row("Saved to Output", str(len(output_data['links'])), "Links saved to output file")
+
table.add_row("Cross-references", str(sum(1 for link in unified_links.values() if 'target_username' in link)), "Links to registered posts only")
+
# Show user links if verbose
+
if verbose and link_categories["user"]:
+
print("User Link Source\tUser Link Target\tLink Count")
+
for link in link_categories["user"]:
+
key = f"{link['username']} -> {link['target_username']}"
+
user_link_counts[key] = user_link_counts.get(key, 0) + 1
+
for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
+
source, target = link_pair.split(" -> ")
+
print(f"{source}\t{target}\t{count}")
+
console.print("\n[bold]User-to-user links:[/bold]")
+
for link in link_categories["user"]:
+
key = f"{link['username']} -> {link['target_username']}"
+
user_link_counts[key] = user_link_counts.get(key, 0) + 1
+
for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
+
console.print(f" {link_pair}: {count} links")
+
console.print(f"\nUnified links data saved to: {output_path}")
+
console.print(f"[red]Error extracting links: {e}[/red]")
+
console.print_exception()
+
<file path="src/thicket/cli/commands/list_cmd.py">
+
"""List command for thicket."""
+
from pathlib import Path
+
from typing import Optional
+
from rich.table import Table
+
from ...core.git_store import GitStore
+
print_feeds_table_from_git,
+
print_users_table_from_git,
+
what: str = typer.Argument(..., help="What to list: 'users', 'feeds', 'entries'"),
+
user: Optional[str] = typer.Option(
+
None, "--user", "-u", help="Filter by specific user"
+
limit: Optional[int] = typer.Option(
+
None, "--limit", "-l", help="Limit number of results"
+
config_file: Optional[Path] = typer.Option(
+
Path("thicket.yaml"), "--config", help="Configuration file path"
+
"""List users, feeds, or entries."""
+
config = load_config(config_file)
+
git_store = GitStore(config.git_store)
+
list_feeds(git_store, user)
+
elif what == "entries":
+
list_entries(git_store, user, limit)
+
print_error(f"Unknown list type: {what}")
+
print_error("Use 'users', 'feeds', or 'entries'")
+
def list_users(git_store: GitStore) -> None:
+
index = git_store._load_index()
+
users = list(index.users.values())
+
print_info("No users configured")
+
print_users_table_from_git(users)
+
def list_feeds(git_store: GitStore, username: Optional[str] = None) -> None:
+
"""List feeds, optionally filtered by user."""
+
user = git_store.get_user(username)
+
print_error(f"User '{username}' not found")
+
print_info(f"No feeds configured for user '{username}'")
+
print_feeds_table_from_git(git_store, username)
+
def list_entries(git_store: GitStore, username: Optional[str] = None, limit: Optional[int] = None) -> None:
+
"""List entries, optionally filtered by user."""
+
# List entries for specific user
+
user = git_store.get_user(username)
+
print_error(f"User '{username}' not found")
+
entries = git_store.list_entries(username, limit)
+
print_info(f"No entries found for user '{username}'")
+
print_entries_table([entries], [username])
+
# List entries for all users
+
index = git_store._load_index()
+
for user in index.users.values():
+
entries = git_store.list_entries(user.username, limit)
+
all_entries.append(entries)
+
all_usernames.append(user.username)
+
print_info("No entries found")
+
print_entries_table(all_entries, all_usernames)
+
def _clean_html_content(content: Optional[str]) -> str:
+
"""Clean HTML content for display in table."""
+
clean_text = re.sub(r'<[^>]+>', ' ', content)
+
# Replace multiple whitespace with single space
+
clean_text = re.sub(r'\s+', ' ', clean_text)
+
# Strip and limit length
+
clean_text = clean_text.strip()
+
if len(clean_text) > 100:
+
clean_text = clean_text[:97] + "..."
+
def print_entries_table(entries_by_user: list[list], usernames: list[str]) -> None:
+
"""Print a table of entries."""
+
print_entries_tsv(entries_by_user, usernames)
+
table = Table(title="Feed Entries")
+
table.add_column("User", style="cyan", no_wrap=True)
+
table.add_column("Title", style="bold")
+
table.add_column("Updated", style="blue")
+
table.add_column("URL", style="green")
+
# Combine all entries with usernames
+
for entries, username in zip(entries_by_user, usernames):
+
all_entries.append((username, entry))
+
# Sort by updated time (newest first)
+
all_entries.sort(key=lambda x: x[1].updated, reverse=True)
+
for username, entry in all_entries:
+
updated_str = entry.updated.strftime("%Y-%m-%d %H:%M")
+
# Truncate title if too long
+
title = title[:47] + "..."
+
<file path="src/thicket/cli/main.py">
+
"""Main CLI application using Typer."""
+
from rich.console import Console
+
from .. import __version__
+
help="A CLI tool for persisting Atom/RSS feeds in Git repositories",
+
rich_markup_mode="rich",
+
# Global state for TSV output mode
+
def version_callback(value: bool) -> None:
+
"""Show version and exit."""
+
console.print(f"thicket version {__version__}")
+
version: bool = typer.Option(
+
help="Show the version and exit",
+
callback=version_callback,
+
tsv: bool = typer.Option(
+
help="Output in tab-separated values format without truncation",
+
"""Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories."""
+
# Import commands to register them
+
from .commands import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync
+
if __name__ == "__main__":
+
<file path="src/thicket/core/git_store.py">
+
"""Git repository operations for thicket."""
+
from datetime import datetime
+
from pathlib import Path
+
from typing import Optional
+
from ..models import AtomEntry, DuplicateMap, GitStoreIndex, UserMetadata
+
"""Manages the Git repository for storing feed entries."""
+
def __init__(self, repo_path: Path):
+
"""Initialize the Git store."""
+
self.repo_path = repo_path
+
self.repo: Optional[Repo] = None
+
def _ensure_repo(self) -> None:
+
"""Ensure the Git repository exists and is initialized."""
+
if not self.repo_path.exists():
+
self.repo_path.mkdir(parents=True, exist_ok=True)
+
self.repo = Repo(self.repo_path)
+
except git.InvalidGitRepositoryError:
+
# Initialize new repository
+
self.repo = Repo.init(self.repo_path)
+
self._create_initial_structure()
+
def _create_initial_structure(self) -> None:
+
"""Create initial Git store structure."""
+
created=datetime.now(),
+
last_updated=datetime.now(),
+
self._save_index(index)
+
# Create duplicates.json
+
duplicates = DuplicateMap()
+
self._save_duplicates(duplicates)
+
# Create initial commit
+
self.repo.index.add(["index.json", "duplicates.json"])
+
self.repo.index.commit("Initial thicket repository structure")
+
def _save_index(self, index: GitStoreIndex) -> None:
+
"""Save the index to index.json."""
+
index_path = self.repo_path / "index.json"
+
with open(index_path, "w") as f:
+
json.dump(index.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)
+
def _load_index(self) -> GitStoreIndex:
+
"""Load the index from index.json."""
+
index_path = self.repo_path / "index.json"
+
if not index_path.exists():
+
created=datetime.now(),
+
last_updated=datetime.now(),
+
with open(index_path) as f:
+
return GitStoreIndex(**data)
+
def _save_duplicates(self, duplicates: DuplicateMap) -> None:
+
"""Save duplicates map to duplicates.json."""
+
duplicates_path = self.repo_path / "duplicates.json"
+
with open(duplicates_path, "w") as f:
+
json.dump(duplicates.model_dump(exclude_none=True), f, indent=2)
+
def _load_duplicates(self) -> DuplicateMap:
+
"""Load duplicates map from duplicates.json."""
+
duplicates_path = self.repo_path / "duplicates.json"
+
if not duplicates_path.exists():
+
with open(duplicates_path) as f:
+
return DuplicateMap(**data)
+
def add_user(self, username: str, display_name: Optional[str] = None,
+
email: Optional[str] = None, homepage: Optional[str] = None,
+
icon: Optional[str] = None, feeds: Optional[list[str]] = None) -> UserMetadata:
+
"""Add a new user to the Git store."""
+
index = self._load_index()
+
# Create user directory
+
user_dir = self.repo_path / username
+
user_dir.mkdir(exist_ok=True)
+
user_metadata = UserMetadata(
+
display_name=display_name,
+
created=datetime.now(),
+
last_updated=datetime.now(),
+
index.add_user(user_metadata)
+
self._save_index(index)
+
def get_user(self, username: str) -> Optional[UserMetadata]:
+
"""Get user metadata by username."""
+
index = self._load_index()
+
return index.get_user(username)
+
def update_user(self, username: str, **kwargs) -> bool:
+
"""Update user metadata."""
+
index = self._load_index()
+
user = index.get_user(username)
+
for key, value in kwargs.items():
+
if hasattr(user, key) and value is not None:
+
setattr(user, key, value)
+
user.update_timestamp()
+
self._save_index(index)
+
def store_entry(self, username: str, entry: AtomEntry) -> bool:
+
"""Store an entry in the user's directory."""
+
user = self.get_user(username)
+
# Sanitize entry ID for filename
+
from .feed_parser import FeedParser
+
safe_id = parser.sanitize_entry_id(entry.id)
+
user_dir = self.repo_path / user.directory
+
entry_path = user_dir / f"{safe_id}.json"
+
# Check if entry already exists
+
entry_exists = entry_path.exists()
+
with open(entry_path, "w") as f:
+
json.dump(entry.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)
+
# Update user metadata if new entry
+
index = self._load_index()
+
index.update_entry_count(username, 1)
+
self._save_index(index)
+
def get_entry(self, username: str, entry_id: str) -> Optional[AtomEntry]:
+
"""Get an entry by username and entry ID."""
+
user = self.get_user(username)
+
from .feed_parser import FeedParser
+
safe_id = parser.sanitize_entry_id(entry_id)
+
entry_path = self.repo_path / user.directory / f"{safe_id}.json"
+
if not entry_path.exists():
+
with open(entry_path) as f:
+
return AtomEntry(**data)
+
def list_entries(self, username: str, limit: Optional[int] = None) -> list[AtomEntry]:
+
"""List entries for a user."""
+
user = self.get_user(username)
+
user_dir = self.repo_path / user.directory
+
if not user_dir.exists():
+
entry_files = sorted(user_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
+
entry_files = entry_files[:limit]
+
for entry_file in entry_files:
+
with open(entry_file) as f:
+
entries.append(AtomEntry(**data))
+
def get_duplicates(self) -> DuplicateMap:
+
"""Get the duplicates map."""
+
return self._load_duplicates()
+
def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
+
"""Add a duplicate mapping."""
+
duplicates = self._load_duplicates()
+
duplicates.add_duplicate(duplicate_id, canonical_id)
+
self._save_duplicates(duplicates)
+
def remove_duplicate(self, duplicate_id: str) -> bool:
+
"""Remove a duplicate mapping."""
+
duplicates = self._load_duplicates()
+
result = duplicates.remove_duplicate(duplicate_id)
+
self._save_duplicates(duplicates)
+
def commit_changes(self, message: str) -> None:
+
"""Commit all changes to the Git repository."""
+
self.repo.git.add(A=True)
+
# Check if there are changes to commit
+
if self.repo.index.diff("HEAD"):
+
self.repo.index.commit(message)
+
def get_stats(self) -> dict:
+
"""Get statistics about the Git store."""
+
index = self._load_index()
+
duplicates = self._load_duplicates()
+
"total_users": len(index.users),
+
"total_entries": index.total_entries,
+
"total_duplicates": len(duplicates.duplicates),
+
"last_updated": index.last_updated,
+
"repository_size": sum(f.stat().st_size for f in self.repo_path.rglob("*") if f.is_file()),
+
def search_entries(self, query: str, username: Optional[str] = None,
+
limit: Optional[int] = None) -> list[tuple[str, AtomEntry]]:
+
"""Search entries by content."""
+
index = self._load_index()
+
users = [index.get_user(username)] if username else list(index.users.values())
+
users = [u for u in users if u is not None]
+
user_dir = self.repo_path / user.directory
+
if not user_dir.exists():
+
entry_files = user_dir.glob("*.json")
+
for entry_file in entry_files:
+
with open(entry_file) as f:
+
entry = AtomEntry(**data)
+
# Simple text search in title, summary, and content
+
searchable_text = " ".join(filter(None, [
+
if query.lower() in searchable_text:
+
results.append((user.username, entry))
+
if limit and len(results) >= limit:
+
# Sort by updated time (newest first)
+
results.sort(key=lambda x: x[1].updated, reverse=True)
+
return results[:limit] if limit else results
+
# Thicket Architecture Design
+
Thicket is a modern CLI tool for persisting Atom/RSS feeds in a Git repository, designed to enable distributed webblog comment structures.
+
- **Typer** (0.15.x) - Modern CLI framework with type hints
+
- **Rich** (13.x) - Beautiful terminal output, progress bars, and tables
+
- **prompt-toolkit** - Interactive prompts when needed
+
- **feedparser** (6.0.11) - Universal feed parser supporting RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0
+
- Alternative: **atoma** for stricter Atom/RSS parsing with JSON feed support
+
- Alternative: **fastfeedparser** for high-performance parsing (10x faster)
+
- **GitPython** (3.1.44) - High-level git operations, requires git CLI
+
- Alternative: **pygit2** (1.18.0) - Direct libgit2 bindings, better for authentication
+
- **httpx** (0.28.x) - Modern async/sync HTTP client with connection pooling
+
- **aiohttp** (3.11.x) - For async-only operations if needed
+
#### Configuration & Data Models
+
- **pydantic** (2.11.x) - Data validation and settings management
+
- **pydantic-settings** (2.10.x) - Configuration file handling with env var support
+
- **pendulum** (3.x) - Better datetime handling
+
- **bleach** (6.x) - HTML sanitization for feed content
+
- **platformdirs** (4.x) - Cross-platform directory paths
+
├── pyproject.toml # Modern Python packaging
+
├── README.md # Project documentation
+
├── ARCH.md # This file
+
├── CLAUDE.md # Project instructions
+
│ ├── __main__.py # Entry point for `python -m thicket`
+
│ ├── cli/ # CLI commands and interface
+
│ │ ├── main.py # Main CLI app with Typer
+
│ │ ├── commands/ # Subcommands
+
│ │ │ ├── init.py # Initialize git store
+
│ │ │ ├── add.py # Add users and feeds
+
│ │ │ ├── sync.py # Sync feeds
+
│ │ │ ├── list_cmd.py # List users/feeds
+
│ │ │ ├── duplicates.py # Manage duplicate entries
+
│ │ │ ├── links_cmd.py # Extract and categorize links
+
│ │ │ └── index_cmd.py # Build reference index and show threads
+
│ │ └── utils.py # CLI utilities (progress, formatting)
+
│ ├── core/ # Core business logic
+
│ │ ├── feed_parser.py # Feed parsing and normalization
+
│ │ ├── git_store.py # Git repository operations
+
│ │ └── reference_parser.py # Link extraction and threading
+
│ ├── models/ # Pydantic data models
+
│ │ ├── config.py # Configuration models
+
│ │ ├── feed.py # Feed/Entry models
+
│ │ └── user.py # User metadata models
+
│ └── utils/ # Shared utilities
+
│ ├── conftest.py # pytest configuration
+
│ ├── test_feed_parser.py
+
│ ├── test_git_store.py
+
│ └── fixtures/ # Test data
+
└── examples/ # Example configurations
+
### Configuration File (YAML/TOML)
+
class ThicketConfig(BaseSettings):
+
git_store: Path # Git repository location
+
cache_dir: Path # Cache directory
+
users: list[UserConfig]
+
model_config = SettingsConfigDict(
+
yaml_file="thicket.yaml"
+
class UserConfig(BaseModel):
+
email: Optional[EmailStr] = None
+
homepage: Optional[HttpUrl] = None
+
icon: Optional[HttpUrl] = None
+
display_name: Optional[str] = None
+
### Feed Storage Format
+
class AtomEntry(BaseModel):
+
id: str # Original Atom ID
+
published: Optional[datetime]
+
content: Optional[str] # Full body content from Atom entry
+
content_type: Optional[str] = "html" # text, html, xhtml
+
categories: list[str] = []
+
rights: Optional[str] = None # Copyright info
+
source: Optional[str] = None # Source feed URL
+
# Additional Atom fields preserved during RSS->Atom conversion
+
model_config = ConfigDict(
+
datetime: lambda v: v.isoformat()
+
class DuplicateMap(BaseModel):
+
"""Maps duplicate entry IDs to canonical entry IDs"""
+
duplicates: dict[str, str] = {} # duplicate_id -> canonical_id
+
comment: str = "Entry IDs that map to the same canonical content"
+
def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
+
"""Add a duplicate mapping"""
+
self.duplicates[duplicate_id] = canonical_id
+
def remove_duplicate(self, duplicate_id: str) -> bool:
+
"""Remove a duplicate mapping. Returns True if existed."""
+
return self.duplicates.pop(duplicate_id, None) is not None
+
def get_canonical(self, entry_id: str) -> str:
+
"""Get canonical ID for an entry (returns original if not duplicate)"""
+
return self.duplicates.get(entry_id, entry_id)
+
def is_duplicate(self, entry_id: str) -> bool:
+
"""Check if entry ID is marked as duplicate"""
+
return entry_id in self.duplicates
+
## Git Repository Structure
+
├── index.json # User directory index
+
├── duplicates.json # Manual curation of duplicate entries
+
├── links.json # Unified links, references, and mapping data
+
│ ├── entry_id_1.json # Sanitized entry files
+
## Key Design Decisions
+
### 1. Feed Normalization & Auto-Discovery
+
- All RSS feeds converted to Atom format before storage
+
- Preserves maximum metadata during conversion
+
- Sanitizes HTML content to prevent XSS
+
- **Auto-discovery**: Extracts user metadata from feed during `add user` command
+
- Consistent algorithm to convert Atom IDs to safe filenames
+
- Handles edge cases (very long IDs, special characters)
+
- Maintains reversibility where possible
+
- Uses GitPython for simplicity (no authentication required)
+
- Single main branch for all users and entries
+
- Atomic commits per sync operation
+
- Meaningful commit messages with feed update summaries
+
- Preserves complete history - never delete entries even if they disappear from feeds
+
### 4. Caching Strategy
+
- HTTP caching with Last-Modified/ETag support
+
- Local cache of parsed feeds with TTL
+
- Cache invalidation on configuration changes
+
- Git store serves as permanent historical archive beyond feed depth limits
+
- Graceful handling of feed parsing errors
+
- Retry logic for network failures
+
- Clear error messages with recovery suggestions
+
## CLI Command Structure
+
# Initialize a new git store
+
thicket init /path/to/store
+
# Add a user with feeds (auto-discovers metadata from feed)
+
thicket add user "alyssa" \
+
--feed "https://example.com/feed.atom"
+
# Auto-populates: email, homepage, icon, display_name from feed metadata
+
# Add a user with manual overrides
+
thicket add user "alyssa" \
+
--feed "https://example.com/feed.atom" \
+
--email "alyssa@example.com" \
+
--homepage "https://alyssa.example.com" \
+
--icon "https://example.com/avatar.png" \
+
--display-name "Alyssa P. Hacker"
+
# Add additional feed to existing user
+
thicket add feed "alyssa" "https://example.com/other-feed.rss"
+
# Sync all feeds (designed for cron usage)
+
thicket sync --user alyssa
+
# List users and their feeds
+
thicket list feeds --user alyssa
+
# Manage duplicate entries
+
thicket duplicates list
+
thicket duplicates add <entry_id_1> <entry_id_2> # Mark as duplicates
+
thicket duplicates remove <entry_id_1> <entry_id_2> # Unmark duplicates
+
# Link processing and threading
+
thicket links --verbose # Extract and categorize all links
+
thicket index --verbose # Build reference index for threading
+
thicket threads # Show conversation threads
+
thicket threads --username user1 # Show threads for specific user
+
thicket threads --min-size 3 # Show threads with minimum size
+
## Performance Considerations
+
1. **Concurrent Feed Fetching**: Use httpx with asyncio for parallel downloads
+
2. **Incremental Updates**: Only fetch/parse feeds that have changed
+
3. **Efficient Git Operations**: Batch commits, use shallow clones where appropriate
+
4. **Progress Feedback**: Rich progress bars for long operations
+
## Security Considerations
+
1. **HTML Sanitization**: Use bleach to clean feed content
+
2. **URL Validation**: Strict validation of feed URLs
+
3. **Git Security**: No credentials stored in repository
+
4. **Path Traversal**: Careful sanitization of filenames
+
1. **Web Interface**: Optional web UI for browsing the git store
+
2. **Webhooks**: Notify external services on feed updates
+
3. **Feed Discovery**: Auto-discover feeds from HTML pages
+
4. **Export Formats**: Generate static sites, OPML exports
+
5. **Federation**: P2P sync between thicket instances
+
## Requirements Clarification
+
**✓ Resolved Requirements:**
+
1. **Feed Update Frequency**: Designed for cron usage - no built-in scheduling needed
+
2. **Duplicate Handling**: Manual curation via `duplicates.json` file with CLI commands
+
3. **Git Branching**: Single main branch for all users and entries
+
4. **Authentication**: No feeds require authentication currently
+
5. **Content Storage**: Store complete Atom entry body content as provided
+
6. **Deleted Entries**: Preserve all entries in Git store permanently (historical archive)
+
7. **History Depth**: Git store maintains full history beyond feed depth limits
+
8. **Feed Auto-Discovery**: Extract user metadata from feed during `add user` command
+
## Duplicate Entry Management
+
### Duplicate Detection Strategy
+
- **Manual Curation**: Duplicates identified and managed manually via CLI
+
- **Storage**: `duplicates.json` file in Git root maps entry IDs to canonical entries
+
- **Structure**: `{"duplicate_id": "canonical_id", ...}`
+
- **CLI Commands**: Add/remove duplicate mappings with validation
+
- **Query Resolution**: Search/list commands resolve duplicates to canonical entries
+
### Duplicate File Format
+
"https://example.com/feed/entry/123": "https://canonical.com/posts/same-post",
+
"https://mirror.com/articles/456": "https://canonical.com/posts/same-post",
+
"comment": "Entry IDs that map to the same canonical content"
+
## Feed Metadata Auto-Discovery
+
### Extraction Strategy
+
When adding a new user with `thicket add user`, the system fetches and parses the feed to extract:
+
- **Display Name**: From `feed.title` or `feed.author.name`
+
- **Email**: From `feed.author.email` or `feed.managingEditor`
+
- **Homepage**: From `feed.link` or `feed.author.uri`
+
- **Icon**: From `feed.logo`, `feed.icon`, or `feed.image.url`
+
### Discovery Priority Order
+
1. **Author Information**: Prefer `feed.author.*` fields (more specific to person)
+
2. **Feed-Level**: Fall back to feed-level metadata
+
3. **Manual Override**: CLI flags always take precedence over discovered values
+
4. **Update Behavior**: Auto-discovery only runs during initial `add user`, not on sync
+
### Extracted Metadata Format
+
class FeedMetadata(BaseModel):
+
title: Optional[str] = None
+
author_name: Optional[str] = None
+
author_email: Optional[EmailStr] = None
+
author_uri: Optional[HttpUrl] = None
+
link: Optional[HttpUrl] = None
+
logo: Optional[HttpUrl] = None
+
icon: Optional[HttpUrl] = None
+
image_url: Optional[HttpUrl] = None
+
def to_user_config(self, username: str, feed_url: HttpUrl) -> UserConfig:
+
"""Convert discovered metadata to UserConfig with fallbacks"""
+
display_name=self.author_name or self.title,
+
email=self.author_email,
+
homepage=self.author_uri or self.link,
+
icon=self.logo or self.icon or self.image_url
+
## Link Processing and Threading Architecture
+
The thicket system implements a sophisticated link processing and threading system to create email-style threaded views of blog entries by tracking cross-references between different blogs.
+
### Link Processing Pipeline
+
#### 1. Link Extraction (`thicket links`)
+
The `links` command systematically extracts all outbound links from blog entries and categorizes them:
+
class LinkData(BaseModel):
+
url: str # Fully resolved URL
+
entry_id: str # Source entry ID
+
username: str # Source username
+
context: str # Surrounding text context
+
category: str # "internal", "user", or "unknown"
+
target_username: Optional[str] # Target user if applicable
+
- **Internal**: Links to the same user's domain (self-references)
+
- **User**: Links to other tracked users' domains
+
- **Unknown**: Links to external sites not tracked by thicket
+
All links are properly resolved using the Atom feed's base URL to handle:
+
- Relative URLs (converted to absolute)
+
- Protocol-relative URLs
+
- Redirects and canonical URLs
+
The system builds a comprehensive domain mapping from user configuration:
+
- Feed URLs → domain extraction
+
- Homepage URLs → domain extraction
+
- Reverse mapping: domain → username
+
#### 1. Reference Index Generation (`thicket index`)
+
Creates a bidirectional reference index from the categorized links:
+
class BlogReference(BaseModel):
+
target_username: Optional[str]
+
target_entry_id: Optional[str]
+
#### 2. Thread Detection Algorithm
+
Uses graph traversal to find connected blog entries:
+
- **Outbound references**: Links from an entry to other entries
+
- **Inbound references**: Links to an entry from other entries
+
- **Thread members**: All entries connected through references
+
#### 3. Threading Display (`thicket threads`)
+
Creates email-style threaded views:
+
- Chronological ordering within threads
+
- Reference counts (outbound/inbound)
+
- Filtering options (user, entry, minimum size)
+
#### links.json Format (Unified Structure)
+
"https://example.com/post/123": {
+
"referencing_entries": ["https://blog.user.com/entry/456"],
+
"target_username": "user2"
+
"https://external-site.com/article": {
+
"referencing_entries": ["https://blog.user.com/entry/789"]
+
"https://blog.user.com/entry/456": ["https://example.com/post/123"],
+
"https://blog.user.com/entry/789": ["https://external-site.com/article"]
+
"source_entry_id": "https://blog.user.com/entry/456",
+
"source_username": "user1",
+
"target_url": "https://example.com/post/123",
+
"target_username": "user2",
+
"target_entry_id": "https://example.com/post/123",
+
"context": "As mentioned in this post..."
+
"user1": ["blog.user.com"],
+
"user2": ["example.com"]
+
This unified structure eliminates duplication by:
+
- Storing each URL only once with minimal metadata
+
- Including all link data, reference data, and mappings in one file
+
- Using presence of `target_username` to identify tracked vs external links
+
- Providing bidirectional mappings for efficient queries
+
### Unified Structure Benefits
+
- **Eliminates Duplication**: Each URL appears only once with metadata
+
- **Single Source of Truth**: All link-related data in one file
+
- **Efficient Queries**: Fast lookups for both directions (URL→entries, entry→URLs)
+
- **Atomic Updates**: All link data changes together
+
- **Reduced I/O**: Fewer file operations
+
### Implementation Benefits
+
1. **Systematic Link Processing**: All links are extracted and categorized consistently
+
2. **Proper URL Resolution**: Handles relative URLs and base URL resolution correctly
+
3. **Domain-based Categorization**: Automatically identifies user-to-user references
+
4. **Bidirectional Indexing**: Supports both "who links to whom" and "who is linked by whom"
+
5. **Thread Discovery**: Finds conversation threads automatically
+
6. **Rich Context**: Preserves surrounding text for each link
+
7. **Performance**: Pre-computed indexes for fast threading queries
+
# Extract and categorize all links
+
thicket links --verbose
+
# Build reference index for threading
+
thicket index --verbose
+
# Show all conversation threads
+
# Show threads for specific user
+
thicket threads --username user1
+
# Show threads with minimum size
+
thicket threads --min-size 3
+
### Integration with Existing Commands
+
The link processing system integrates seamlessly with existing thicket commands:
+
- `thicket sync` updates entries, requiring `thicket links` to be run afterward
+
- `thicket index` uses the output from `thicket links` for improved accuracy
+
- `thicket threads` provides the user-facing threading interface
+
## Current Implementation Status
+
### ✅ Completed Features
+
1. **Core Infrastructure**
+
- Modern CLI with Typer and Rich
+
- Pydantic data models for type safety
+
- Git repository operations with GitPython
+
- Feed parsing and normalization with feedparser
+
2. **User and Feed Management**
+
- `thicket init` - Initialize git store
+
- `thicket add` - Add users and feeds with auto-discovery
+
- `thicket sync` - Sync feeds with progress tracking
+
- `thicket list` - List users, feeds, and entries
+
- `thicket duplicates` - Manage duplicate entries
+
3. **Link Processing and Threading**
+
- `thicket links` - Extract and categorize all outbound links
+
- `thicket index` - Build reference index from links
+
- `thicket threads` - Display threaded conversation views
+
- Proper URL resolution with base URL handling
+
- Domain-based link categorization
+
- Context preservation for links
+
### 📊 System Performance
+
- **Link Extraction**: Successfully processes thousands of blog entries
+
- **Categorization**: Identifies internal, user, and unknown links
+
- **Threading**: Creates email-style threaded views of conversations
+
- **Storage**: Efficient JSON-based data structures for links and references
+
### 🔧 Current Architecture Highlights
+
- **Modular Design**: Clear separation between CLI, core logic, and models
+
- **Type Safety**: Comprehensive Pydantic models for data validation
+
- **Rich CLI**: Beautiful progress bars, tables, and error handling
+
- **Extensible**: Easy to add new commands and features
+
- **Git Integration**: All data stored in version-controlled JSON files
+
### 🎯 Proven Functionality
+
The system has been tested with real blog data and successfully:
+
- Extracted 14,396 total links from blog entries
+
- Categorized 3,994 internal links, 363 user-to-user links, and 10,039 unknown links
+
- Built comprehensive domain mappings for 16 users across 20 domains
+
- Generated threaded views showing blog conversation patterns
+
The thicket system is now fully functional for:
+
- Maintaining Git repositories of blog feeds
+
- Tracking cross-references between blogs
+
- Creating threaded views of blog conversations
+
- Discovering blog interaction patterns
+
- Building distributed comment systems
+
<file path="src/thicket/cli/utils.py">
+
"""CLI utilities and helpers."""
+
from pathlib import Path
+
from typing import Optional
+
from rich.console import Console
+
from rich.progress import Progress, SpinnerColumn, TextColumn
+
from rich.table import Table
+
from ..models import ThicketConfig, UserMetadata
+
from ..core.git_store import GitStore
+
def get_tsv_mode() -> bool:
+
"""Get the global TSV mode setting."""
+
from .main import tsv_mode
+
def load_config(config_path: Optional[Path] = None) -> ThicketConfig:
+
"""Load thicket configuration from file or environment."""
+
if config_path and config_path.exists():
+
with open(config_path) as f:
+
config_data = yaml.safe_load(f)
+
# Convert to ThicketConfig
+
return ThicketConfig(**config_data)
+
# Try to load from default locations or environment
+
# First try to find thicket.yaml in current directory
+
default_config = Path("thicket.yaml")
+
if default_config.exists():
+
with open(default_config) as f:
+
config_data = yaml.safe_load(f)
+
return ThicketConfig(**config_data)
+
# Fall back to environment variables
+
console.print(f"[red]Error loading configuration: {e}[/red]")
+
console.print("[yellow]Run 'thicket init' to create a new configuration.[/yellow]")
+
raise typer.Exit(1) from e
+
def save_config(config: ThicketConfig, config_path: Path) -> None:
+
"""Save thicket configuration to file."""
+
config_data = config.model_dump(mode="json", exclude_none=True)
+
# Convert Path objects to strings for YAML serialization
+
config_data["git_store"] = str(config_data["git_store"])
+
config_data["cache_dir"] = str(config_data["cache_dir"])
+
with open(config_path, "w") as f:
+
yaml.dump(config_data, f, default_flow_style=False, sort_keys=False)
+
def create_progress() -> Progress:
+
"""Create a Rich progress display."""
+
TextColumn("[progress.description]{task.description}"),
+
def print_users_table(config: ThicketConfig) -> None:
+
"""Print a table of users and their feeds."""
+
print_users_tsv(config)
+
table = Table(title="Users and Feeds")
+
table.add_column("Username", style="cyan", no_wrap=True)
+
table.add_column("Display Name", style="magenta")
+
table.add_column("Email", style="blue")
+
table.add_column("Homepage", style="green")
+
table.add_column("Feeds", style="yellow")
+
for user in config.users:
+
feeds_str = "\n".join(str(feed) for feed in user.feeds)
+
user.display_name or "",
+
str(user.homepage) if user.homepage else "",
+
def print_feeds_table(config: ThicketConfig, username: Optional[str] = None) -> None:
+
"""Print a table of feeds, optionally filtered by username."""
+
print_feeds_tsv(config, username)
+
table = Table(title=f"Feeds{f' for {username}' if username else ''}")
+
table.add_column("Username", style="cyan", no_wrap=True)
+
table.add_column("Feed URL", style="blue")
+
table.add_column("Status", style="green")
+
users = [config.find_user(username)] if username else config.users
+
users = [u for u in users if u is not None]
+
for feed in user.feeds:
+
"Active", # TODO: Add actual status checking
+
def confirm_action(message: str, default: bool = False) -> bool:
+
"""Prompt for confirmation."""
+
return typer.confirm(message, default=default)
+
def print_success(message: str) -> None:
+
"""Print a success message."""
+
console.print(f"[green]✓[/green] {message}")
+
def print_error(message: str) -> None:
+
"""Print an error message."""
+
console.print(f"[red]✗[/red] {message}")
+
def print_warning(message: str) -> None:
+
"""Print a warning message."""
+
console.print(f"[yellow]⚠[/yellow] {message}")
+
def print_info(message: str) -> None:
+
"""Print an info message."""
+
console.print(f"[blue]ℹ[/blue] {message}")
+
def print_users_table_from_git(users: list[UserMetadata]) -> None:
+
"""Print a table of users from git repository."""
+
print_users_tsv_from_git(users)
+
table = Table(title="Users and Feeds")
+
table.add_column("Username", style="cyan", no_wrap=True)
+
table.add_column("Display Name", style="magenta")
+
table.add_column("Email", style="blue")
+
table.add_column("Homepage", style="green")
+
table.add_column("Feeds", style="yellow")
+
feeds_str = "\n".join(user.feeds)
+
user.display_name or "",
+
def print_feeds_table_from_git(git_store: GitStore, username: Optional[str] = None) -> None:
+
"""Print a table of feeds from git repository."""
+
print_feeds_tsv_from_git(git_store, username)
+
table = Table(title=f"Feeds{f' for {username}' if username else ''}")
+
table.add_column("Username", style="cyan", no_wrap=True)
+
table.add_column("Feed URL", style="blue")
+
table.add_column("Status", style="green")
+
user = git_store.get_user(username)
+
users = [user] if user else []
+
index = git_store._load_index()
+
users = list(index.users.values())
+
for feed in user.feeds:
+
"Active", # TODO: Add actual status checking
+
def print_users_tsv(config: ThicketConfig) -> None:
+
"""Print users in TSV format."""
+
print("Username\tDisplay Name\tEmail\tHomepage\tFeeds")
+
for user in config.users:
+
feeds_str = ",".join(str(feed) for feed in user.feeds)
+
print(f"{user.username}\t{user.display_name or ''}\t{user.email or ''}\t{user.homepage or ''}\t{feeds_str}")
+
def print_users_tsv_from_git(users: list[UserMetadata]) -> None:
+
"""Print users from git repository in TSV format."""
+
print("Username\tDisplay Name\tEmail\tHomepage\tFeeds")
+
feeds_str = ",".join(user.feeds)
+
print(f"{user.username}\t{user.display_name or ''}\t{user.email or ''}\t{user.homepage or ''}\t{feeds_str}")
+
def print_feeds_tsv(config: ThicketConfig, username: Optional[str] = None) -> None:
+
"""Print feeds in TSV format."""
+
print("Username\tFeed URL\tStatus")
+
users = [config.find_user(username)] if username else config.users
+
users = [u for u in users if u is not None]
+
for feed in user.feeds:
+
print(f"{user.username}\t{feed}\tActive")
+
def print_feeds_tsv_from_git(git_store: GitStore, username: Optional[str] = None) -> None:
+
"""Print feeds from git repository in TSV format."""
+
print("Username\tFeed URL\tStatus")
+
user = git_store.get_user(username)
+
users = [user] if user else []
+
index = git_store._load_index()
+
users = list(index.users.values())
+
for feed in user.feeds:
+
print(f"{user.username}\t{feed}\tActive")
+
def print_entries_tsv(entries_by_user: list[list], usernames: list[str]) -> None:
+
"""Print entries in TSV format."""
+
print("User\tAtom ID\tTitle\tUpdated\tURL")
+
# Combine all entries with usernames
+
for entries, username in zip(entries_by_user, usernames):
+
all_entries.append((username, entry))
+
# Sort by updated time (newest first)
+
all_entries.sort(key=lambda x: x[1].updated, reverse=True)
+
for username, entry in all_entries:
+
updated_str = entry.updated.strftime("%Y-%m-%d %H:%M")
+
# Escape tabs and newlines in title to preserve TSV format
+
title = entry.title.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
+
print(f"{username}\t{entry.id}\t{title}\t{updated_str}\t{entry.link}")