This file is a merged representation of the entire codebase, combined into a single document by Repomix.

This section contains a summary of this file.

This file contains a packed representation of the entire repository's contents.
It is designed to be easily consumable by AI systems for analysis, code review,
or other automated processes.

The content is organized as follows:

1. This summary section
2. Repository information
3. Directory structure
4. Repository files (if enabled)
5. Multiple file entries, each consisting of:
   - File path as an attribute
   - Full contents of the file

Usage guidelines:

- This file should be treated as read-only. Any changes should be made to the
  original repository files, not this packed version.
- When processing this file, use the file path to distinguish
  between different files in the repository.
- Be aware that this file may contain sensitive information. Handle it with
  the same level of security as you would the original repository.

Notes:

- Some files may have been excluded based on .gitignore rules and Repomix's configuration
- Binary files are not included in this packed representation. Please refer to the Repository Structure section for a complete list of file paths, including binary files
- Files matching patterns in .gitignore are excluded
- Files matching default ignore patterns are excluded
- Files are sorted by Git change count (files with more changes are at the bottom)
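A minimal sketch of how a consumer might split this packed file back into
individual files, assuming the <file path="..."> markers used below (the exact
delimiters can vary with Repomix configuration):

    import re
    from pathlib import Path

    def unpack(packed_text: str, out_dir: Path) -> None:
        # Each entry starts with a <file path="..."> marker; capture the
        # path and everything up to the next marker or end of input.
        pattern = re.compile(
            r'<file path="([^"]+)">\n(.*?)(?=\n<file path="|\Z)', re.DOTALL
        )
        for path, body in pattern.findall(packed_text):
            target = out_dir / path
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(body)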
This section contains the contents of the repository's files.

<file path=".claude/settings.local.json">
{
  "enableAllProjectMcpServers": false
}
<file path="src/thicket/cli/commands/generate.py">
"""Generate static HTML website from thicket data."""

import base64
import json
import re
import shutil
from datetime import datetime
from pathlib import Path
from typing import Any, Optional, TypedDict, Union

import typer
from jinja2 import Environment, FileSystemLoader, select_autoescape
from rich.progress import Progress, SpinnerColumn, TextColumn

from ...core.git_store import GitStore
from ...models.feed import AtomEntry
from ...models.user import GitStoreIndex, UserMetadata
from ..utils import console, load_config

class UserData(TypedDict):
    """Type definition for user data structure."""

    metadata: UserMetadata
    recent_entries: list[tuple[str, AtomEntry]]


def safe_anchor_id(atom_id: str) -> str:
    """Convert an Atom ID to a safe HTML anchor ID."""
    # Use base64 URL-safe encoding without padding
    encoded = base64.urlsafe_b64encode(atom_id.encode("utf-8")).decode("ascii").rstrip("=")
    # Prefix with 'id' to ensure it starts with a letter (HTML requirement)
    return f"id{encoded}"
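
# Illustration only: Atom IDs typically contain characters such as ':',
# ',' and '/' that are awkward in HTML id attributes, so they are
# round-tripped through URL-safe base64. The sketch above would yield:
#
#     >>> safe_anchor_id("tag:example.org,2024:post-1")
#     'iddGFnOmV4YW1wbGUub3JnLDIwMjQ6cG9zdC0x'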

class WebsiteGenerator:
    """Generate static HTML website from thicket data."""

    def __init__(self, git_store: GitStore, output_dir: Path):
        self.git_store = git_store
        self.output_dir = output_dir
        self.template_dir = Path(__file__).parent.parent.parent / "templates"

        # Initialize Jinja2 environment
        self.env = Environment(
            loader=FileSystemLoader(self.template_dir),
            autoescape=select_autoescape(["html", "xml"]),
        )

        self.index: Optional[GitStoreIndex] = None
        self.entries: list[tuple[str, AtomEntry]] = []  # (username, entry)
        self.links_data: Optional[dict[str, Any]] = None
        self.threads: list[list[dict[str, Any]]] = []  # List of threads with metadata

    def get_display_name(self, username: str) -> str:
        """Get display name for a user, falling back to username."""
        if self.index and username in self.index.users:
            user = self.index.users[username]
            return user.display_name or username
        return username

    def get_user_homepage(self, username: str) -> Optional[str]:
        """Get homepage URL for a user."""
        if self.index and username in self.index.users:
            user = self.index.users[username]
            return str(user.homepage) if user.homepage else None
        return None

    def clean_html_summary(self, content: Optional[str], max_length: int = 200) -> str:
        """Clean HTML content and truncate for display in timeline."""
        if not content:
            return ""
        # Strip HTML tags
        clean_text = re.sub(r"<[^>]+>", " ", content)
        # Replace multiple whitespace with single space
        clean_text = re.sub(r"\s+", " ", clean_text)
        # Strip leading/trailing whitespace
        clean_text = clean_text.strip()
        # Truncate with ellipsis if needed
        if len(clean_text) > max_length:
            # Try to break at word boundary
            truncated = clean_text[:max_length]
            last_space = truncated.rfind(" ")
            if (
                last_space > max_length * 0.8
            ):  # If we can break reasonably close to the limit
                clean_text = truncated[:last_space] + "..."
            else:
                clean_text = truncated + "..."
        return clean_text

    def load_data(self) -> None:
        """Load all data from the git repository."""
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as progress:
            task = progress.add_task("Loading repository index...", total=None)
            self.index = self.git_store._load_index()
            if not self.index:
                raise ValueError("No index found in repository")
            progress.update(task, completed=True)

            task = progress.add_task("Loading entries...", total=None)
            for username, user_metadata in self.index.users.items():
                user_dir = self.git_store.repo_path / user_metadata.directory
                for entry_file in user_dir.glob("*.json"):
                    if entry_file.name not in ["index.json", "duplicates.json"]:
                        try:
                            with open(entry_file) as f:
                                entry_data = json.load(f)
                            entry = AtomEntry(**entry_data)
                            self.entries.append((username, entry))
                        except Exception as e:
                            console.print(
                                f"[yellow]Warning: Failed to load {entry_file}: {e}[/yellow]"
                            )
            progress.update(task, completed=True)

            # Sort entries by date (newest first) - prioritize updated over published
            self.entries.sort(
                key=lambda x: x[1].updated or x[1].published or datetime.min, reverse=True
            )

            task = progress.add_task("Loading links and references...", total=None)
            links_file = self.git_store.repo_path / "links.json"
            if links_file.exists():
                with open(links_file) as f:
                    self.links_data = json.load(f)
            progress.update(task, completed=True)
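
    # The exact shape of links.json is not shown in this packed file; from
    # the reads above and in build_threads it is assumed to look roughly
    # like this (illustrative values only):
    #
    #     {
    #         "references": [
    #             {
    #                 "source_entry_id": "tag:alice.example.com,2024:post-1",
    #                 "source_username": "alice",
    #                 "target_entry_id": "tag:bob.example.com,2024:post-9",
    #                 "target_username": "bob",
    #                 "target_url": "https://bob.example.com/post-9"
    #             }
    #         ],
    #         "links": {"https://example.org/x": {"referencing_entries": ["..."]}},
    #         "user_domains": {"alice": ["alice.example.com"]}
    #     }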

    def build_threads(self) -> None:
        """Build threaded conversations from references."""
        if not self.links_data or "references" not in self.links_data:
            return

        # Map entry IDs to (username, entry) tuples
        entry_map: dict[str, tuple[str, AtomEntry]] = {}
        for username, entry in self.entries:
            entry_map[entry.id] = (username, entry)

        # Build adjacency lists for references
        self.outbound_refs: dict[str, set[str]] = {}
        self.inbound_refs: dict[str, set[str]] = {}
        self.reference_details: dict[
            str, list[dict[str, Any]]
        ] = {}  # Store full reference info

        for ref in self.links_data["references"]:
            source_id = ref["source_entry_id"]
            target_id = ref.get("target_entry_id")
            if target_id and source_id in entry_map and target_id in entry_map:
                self.outbound_refs.setdefault(source_id, set()).add(target_id)
                self.inbound_refs.setdefault(target_id, set()).add(source_id)
                # Store reference details for UI
                self.reference_details.setdefault(source_id, []).append(
                    {
                        "type": "outbound",
                        "target_id": target_id,
                        "target_username": ref.get("target_username"),
                    }
                )
                self.reference_details.setdefault(target_id, []).append(
                    {
                        "type": "inbound",
                        "source_id": source_id,
                        "source_username": ref.get("source_username"),
                    }
                )

        # Find conversation threads (multi-post discussions)
        processed: set[str] = set()
        for entry_id, (_username, _entry) in entry_map.items():
            if entry_id in processed:
                continue

            # Build thread starting from this entry
            thread: list[dict[str, Any]] = []
            thread_ids: set[str] = set()
            level_map: dict[str, int] = {}  # Track levels for this thread

            # First, traverse up to find the root
            current = entry_id
            to_visit: list[str] = [entry_id]
            while current in self.inbound_refs:
                parents = self.inbound_refs[current] - {
                    current
                }  # Exclude self-references
                if not parents:
                    break
                # Take the first parent
                parent = next(iter(parents))
                if parent in to_visit:  # Avoid cycles
                    break
                current = parent
                to_visit.insert(0, current)

            # Now traverse down from the root
            while to_visit:
                current = to_visit.pop(0)
                if current in thread_ids or current not in entry_map:
                    continue
                thread_ids.add(current)
                username, entry = entry_map[current]

                # Calculate thread level
                thread_level = self._calculate_thread_level(current, level_map)

                # Add threading metadata
                thread_entry: dict[str, Any] = {
                    "entry_id": current,
                    "username": username,
                    "entry": entry,
                    "display_name": self.get_display_name(username),
                    "references_to": list(self.outbound_refs.get(current, [])),
                    "referenced_by": list(self.inbound_refs.get(current, [])),
                    "thread_level": thread_level,
                }
                thread.append(thread_entry)

                if current in self.outbound_refs:
                    children = self.outbound_refs[current] - thread_ids  # Avoid cycles
                    to_visit.extend(sorted(children))

            processed.update(thread_ids)

            if len(thread) > 1:  # Only keep actual threads
                # Sort thread by date (newest first) - prioritize updated over published
                thread.sort(key=lambda x: x["entry"].updated or x["entry"].published or datetime.min, reverse=True)  # type: ignore
                self.threads.append(thread)

        # Sort threads by the date of their most recent entry - prioritize updated over published
        self.threads.sort(
            key=lambda t: max(
                item["entry"].updated or item["entry"].published or datetime.min for item in t
            ),
            reverse=True,
        )

    def _calculate_thread_level(
        self, entry_id: str, processed_entries: dict[str, int]
    ) -> int:
        """Calculate indentation level for threaded display."""
        if entry_id in processed_entries:
            return processed_entries[entry_id]

        if entry_id not in self.inbound_refs:
            processed_entries[entry_id] = 0
            return 0

        parents_in_thread = self.inbound_refs[entry_id] & set(processed_entries.keys())
        if not parents_in_thread:
            processed_entries[entry_id] = 0
            return 0

        # Find the deepest parent level + 1
        max_parent_level = 0
        for parent_id in parents_in_thread:
            parent_level = self._calculate_thread_level(parent_id, processed_entries)
            max_parent_level = max(max_parent_level, parent_level)

        level = min(max_parent_level + 1, 4)  # Cap at level 4
        processed_entries[entry_id] = level
        return level
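
    # Worked example (illustrative): for a reply chain a <- b <- c <- d <-
    # e <- f, where each entry replies to the previous one, the memoized
    # recursion above assigns levels 0, 1, 2, 3, 4, 4 - indentation is
    # capped at 4 so deeply nested replies stay readable in the HTML.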

    def get_standalone_references(self) -> list[dict[str, Any]]:
        """Get posts that have references but aren't part of multi-post threads."""
        if not hasattr(self, "reference_details"):
            return []

        threaded_entry_ids = set()
        for thread in self.threads:
            for item in thread:
                threaded_entry_ids.add(item["entry_id"])

        standalone_refs: list[dict[str, Any]] = []
        for username, entry in self.entries:
            if (
                entry.id in self.reference_details
                and entry.id not in threaded_entry_ids
            ):
                refs = self.reference_details[entry.id]
                # Only include if it has meaningful references (not just self-references)
                meaningful_refs = [
                    r
                    for r in refs
                    if r.get("target_id") != entry.id and r.get("source_id") != entry.id
                ]
                if meaningful_refs:
                    standalone_refs.append(
                        {
                            "username": username,
                            "entry": entry,
                            "display_name": self.get_display_name(username),
                            "references": meaningful_refs,
                        }
                    )
        return standalone_refs

    def _add_cross_thread_links(self, timeline_items: list[dict[str, Any]]) -> None:
        """Add cross-thread linking for entries that appear in multiple threads."""
        # Map entry IDs to their positions in the timeline
        entry_positions: dict[str, list[int]] = {}
        # Map URLs referenced by entries to the entries that reference them
        url_references: dict[str, list[tuple[str, int]]] = {}  # url -> [(entry_id, position)]

        # First pass: collect all entry IDs, their positions, and referenced URLs
        for i, item in enumerate(timeline_items):
            if item["type"] == "post":
                entry_id = item["content"]["entry"].id
                entry_positions.setdefault(entry_id, []).append(i)

                # Track URLs this entry references
                if entry_id in self.reference_details:
                    for ref in self.reference_details[entry_id]:
                        if ref["type"] == "outbound" and "target_id" in ref:
                            # Find the target entry's URL if available
                            target_entry = self._find_entry_by_id(ref["target_id"])
                            if target_entry and target_entry.link:
                                url = str(target_entry.link)
                                url_references.setdefault(url, []).append((entry_id, i))
            elif item["type"] == "thread":
                for thread_item in item["content"]:
                    entry_id = thread_item["entry"].id
                    entry_positions.setdefault(entry_id, []).append(i)

                    # Track URLs this entry references
                    if entry_id in self.reference_details:
                        for ref in self.reference_details[entry_id]:
                            if ref["type"] == "outbound" and "target_id" in ref:
                                target_entry = self._find_entry_by_id(ref["target_id"])
                                if target_entry and target_entry.link:
                                    url = str(target_entry.link)
                                    url_references.setdefault(url, []).append((entry_id, i))

        # Build cross-thread connections - only for entries that actually appear multiple times
        cross_thread_connections: dict[str, set[int]] = {}  # entry_id -> set of timeline positions

        # Add connections ONLY for entries that appear multiple times in the timeline
        for entry_id, positions in entry_positions.items():
            if len(positions) > 1:
                cross_thread_connections[entry_id] = set(positions)
                # Debug: uncomment to see which entries have multiple appearances
                # print(f"Entry {entry_id[:50]}... appears at positions: {positions}")

        # Apply cross-thread links to timeline items
        for entry_id, positions_set in cross_thread_connections.items():
            positions_list = list(positions_set)
            for pos in positions_list:
                item = timeline_items[pos]
                other_positions = sorted([p for p in positions_list if p != pos])

                if item["type"] == "post":
                    # Add cross-thread info to individual posts
                    item["content"]["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items)
                    # Add info about shared references
                    item["content"]["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items)
                elif item["type"] == "thread":
                    # Add cross-thread info to thread items
                    for thread_item in item["content"]:
                        if thread_item["entry"].id == entry_id:
                            thread_item["cross_thread_links"] = self._build_cross_thread_link_data(entry_id, other_positions, timeline_items)
                            thread_item["shared_references"] = self._get_shared_references(entry_id, positions_set, timeline_items)

    def _build_cross_thread_link_data(self, entry_id: str, other_positions: list[int], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Build detailed cross-thread link data with anchor information."""
        cross_thread_links = []
        for pos in other_positions:
            item = timeline_items[pos]
            if item["type"] == "post":
                safe_id = safe_anchor_id(entry_id)
                cross_thread_links.append({
                    "anchor_id": f"post-{pos}-{safe_id}",
                    "context": "individual post",
                    "title": item["content"]["entry"].title,
                })
            elif item["type"] == "thread":
                # For thread items, find the specific thread item
                for thread_idx, thread_item in enumerate(item["content"]):
                    if thread_item["entry"].id == entry_id:
                        safe_id = safe_anchor_id(entry_id)
                        cross_thread_links.append({
                            "anchor_id": f"post-{pos}-{thread_idx}-{safe_id}",
                            "context": f"thread (level {thread_item.get('thread_level', 0)})",
                            "title": thread_item["entry"].title,
                        })
        return cross_thread_links

    def _find_entry_by_id(self, entry_id: str) -> Optional[AtomEntry]:
        """Find an entry by its ID."""
        for _username, entry in self.entries:
            if entry.id == entry_id:
                return entry
        return None

    def _get_shared_references(self, entry_id: str, positions: Union[set[int], list[int]], timeline_items: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Get information about shared references between cross-thread entries."""
        # Collect all referenced URLs from entries at these positions
        url_counts: dict[str, int] = {}
        referencing_entries: dict[str, list[str]] = {}  # url -> [entry_ids]

        entries_to_check: list[AtomEntry] = []
        for pos in positions:
            item = timeline_items[pos]
            if item["type"] == "post":
                entries_to_check.append(item["content"]["entry"])
            elif item["type"] == "thread":
                entries_to_check.extend([ti["entry"] for ti in item["content"]])

        for entry in entries_to_check:
            if entry.id in self.reference_details:
                for ref in self.reference_details[entry.id]:
                    if ref["type"] == "outbound" and "target_id" in ref:
                        target_entry = self._find_entry_by_id(ref["target_id"])
                        if target_entry and target_entry.link:
                            url = str(target_entry.link)
                            url_counts[url] = url_counts.get(url, 0) + 1
                            if url not in referencing_entries:
                                referencing_entries[url] = []
                            if entry.id not in referencing_entries[url]:
                                referencing_entries[url].append(entry.id)

        # Find URLs referenced by multiple entries
        shared_refs: list[dict[str, Any]] = []
        for url, count in url_counts.items():
            if count > 1 and len(referencing_entries[url]) > 1:
                # Get the target entry info
                target_username = None
                target_entry = None
                for ref in (self.links_data or {}).get("references", []):
                    if ref.get("target_url") == url:
                        target_username = ref.get("target_username")
                        if ref.get("target_entry_id"):
                            target_entry = self._find_entry_by_id(ref["target_entry_id"])
                        break
                shared_refs.append(
                    {
                        "url": url,
                        "count": count,
                        "referencing_entries": referencing_entries[url],
                        "target_username": target_username,
                        "target_title": target_entry.title if target_entry else None,
                    }
                )

        return sorted(shared_refs, key=lambda x: x["count"], reverse=True)
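
    # Toy illustration of the "shared reference" rule above (made-up data):
    #   url_counts = {"https://example.org/x": 2, "https://example.org/y": 1}
    #   referencing_entries = {"https://example.org/x": ["a", "b"],
    #                          "https://example.org/y": ["b"]}
    # Only https://example.org/x survives the filter, because it is
    # referenced more than once *and* by more than one distinct entry.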

    def generate_site(self) -> None:
        """Generate the static website."""
        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Create static directories
        (self.output_dir / "css").mkdir(exist_ok=True)
        (self.output_dir / "js").mkdir(exist_ok=True)

        # Render static assets
        css_template = self.env.get_template("style.css")
        css_content = css_template.render()
        with open(self.output_dir / "css" / "style.css", "w") as f:
            f.write(css_content)

        js_template = self.env.get_template("script.js")
        js_content = js_template.render()
        with open(self.output_dir / "js" / "script.js", "w") as f:
            f.write(js_content)

        # Prepare common template data
        base_data = {
            "title": "Energy & Environment Group",
            "generated_at": datetime.now().isoformat(),
            "get_display_name": self.get_display_name,
            "get_user_homepage": self.get_user_homepage,
            "clean_html_summary": self.clean_html_summary,
            "safe_anchor_id": safe_anchor_id,
        }

        # Build unified timeline
        timeline_items: list[dict[str, Any]] = []

        # Only consider the threads that will actually be displayed
        displayed_threads = self.threads[:20]  # Limit to 20 threads

        # Track which entries are part of displayed threads
        threaded_entry_ids = set()
        for thread in displayed_threads:
            for item in thread:
                threaded_entry_ids.add(item["entry_id"])

        # Add threads to timeline (using the date of the most recent post)
        for thread in displayed_threads:
            most_recent_date = max(
                item["entry"].updated or item["entry"].published or datetime.min
                for item in thread
            )
            timeline_items.append({
                "type": "thread",
                "date": most_recent_date,
                "content": thread,
            })

        # Add individual posts (not in threads)
        for username, entry in self.entries[:50]:
            if entry.id not in threaded_entry_ids:
                # Check if this entry has references
                has_refs = (
                    entry.id in self.reference_details
                    if hasattr(self, "reference_details")
                    else False
                )
                refs = self.reference_details.get(entry.id, []) if has_refs else []
                refs = [
                    r
                    for r in refs
                    if r.get("target_id") != entry.id
                    and r.get("source_id") != entry.id
                ]
                timeline_items.append({
                    "type": "post",
                    "date": entry.updated or entry.published or datetime.min,
                    "content": {
                        "username": username,
                        "entry": entry,
                        "display_name": self.get_display_name(username),
                        "references": refs if refs else None,
                    },
                })

        # Sort unified timeline by date (newest first)
        timeline_items.sort(key=lambda x: x["date"], reverse=True)

        # Limit timeline to what will actually be rendered
        timeline_items = timeline_items[:50]  # Limit to 50 items total

        # Add cross-thread linking for repeat blog references
        self._add_cross_thread_links(timeline_items)

        # Prepare outgoing links data
        outgoing_links: list[dict[str, Any]] = []
        if self.links_data and "links" in self.links_data:
            for url, link_info in self.links_data["links"].items():
                referencing_entries = []
                for entry_id in link_info.get("referencing_entries", []):
                    for username, entry in self.entries:
                        if entry.id == entry_id:
                            referencing_entries.append(
                                (self.get_display_name(username), entry)
                            )
                            break
                if referencing_entries:
                    # Sort by date - prioritize updated over published
                    referencing_entries.sort(
                        key=lambda x: x[1].updated or x[1].published or datetime.min, reverse=True
                    )
                    outgoing_links.append({
                        "url": url,
                        "target_username": link_info.get("target_username"),
                        "entries": referencing_entries,
                    })

            # Sort links by most recent reference - prioritize updated over published
            outgoing_links.sort(
                key=lambda x: x["entries"][0][1].updated
                or x["entries"][0][1].published or datetime.min,
                reverse=True,
            )

        # Prepare per-user data
        users: list[UserData] = []
        for username, user_metadata in self.index.users.items():
            # Get recent entries for this user with display names
            user_entries = [
                (self.get_display_name(u), e)
                for u, e in self.entries
                if u == username
            ][:5]  # show the most recent few
            users.append(
                {"metadata": user_metadata, "recent_entries": user_entries}
            )
        users.sort(key=lambda x: x["metadata"].entry_count, reverse=True)

        # Generate timeline page
        timeline_template = self.env.get_template("timeline.html")
        timeline_content = timeline_template.render(
            timeline_items=timeline_items,  # Already limited above
            page="timeline",
            **base_data,
        )
        with open(self.output_dir / "timeline.html", "w") as f:
            f.write(timeline_content)

        # Generate links page
        links_template = self.env.get_template("links.html")
        links_content = links_template.render(
            outgoing_links=outgoing_links[:100],
            page="links",
            **base_data,
        )
        with open(self.output_dir / "links.html", "w") as f:
            f.write(links_content)

        # Generate users page
        users_template = self.env.get_template("users.html")
        users_content = users_template.render(
            users=users,
            page="users",
            **base_data,
        )
        with open(self.output_dir / "users.html", "w") as f:
            f.write(users_content)

        # Generate main index page (redirect to timeline)
        index_template = self.env.get_template("index.html")
        index_content = index_template.render(**base_data)
        with open(self.output_dir / "index.html", "w") as f:
            f.write(index_content)

        console.print(f"[green]✓[/green] Generated website at {self.output_dir}")
        console.print(f"  - {len(self.entries)} entries")
        console.print(f"  - {len(self.threads)} conversation threads")
        console.print(f"  - {len(outgoing_links)} outgoing links")
        console.print(f"  - {len(users)} users")
        console.print(
            "  - Generated pages: index.html, timeline.html, links.html, users.html"
        )


def generate(
    output: Path = typer.Option(
        Path("./thicket-site"),
        help="Output directory for the generated website",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Overwrite existing output directory"
    ),
    config_file: Path = typer.Option(
        Path("thicket.yaml"), "--config", help="Configuration file path"
    ),
) -> None:
    """Generate a static HTML website from thicket data."""
    config = load_config(config_file)
    if not config.git_store:
        console.print("[red]No git store path configured[/red]")
        raise typer.Exit(1)

    git_store = GitStore(config.git_store)

    # Check if output directory exists
    if output.exists() and not force:
        console.print(
            f"[red]Output directory {output} already exists. Use --force to overwrite.[/red]"
        )
        raise typer.Exit(1)

    # Clean output directory if forcing
    if output.exists() and force:
        shutil.rmtree(output)

    generator = WebsiteGenerator(git_store, output)
    try:
        console.print("[bold]Generating static website...[/bold]")
        generator.load_data()
        generator.build_threads()
        generator.generate_site()
    except Exception as e:
        console.print(f"[red]Error generating website: {e}[/red]")
        raise typer.Exit(1) from e

<file path="src/thicket/templates/base.html">
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block page_title %}{{ title }}{% endblock %}</title>
    <link rel="stylesheet" href="css/style.css">
</head>
<body>
    <header class="site-header">
        <div class="header-content">
            <h1 class="site-title">{{ title }}</h1>
            <nav class="site-nav">
                <a href="timeline.html" class="nav-link {% if page == 'timeline' %}active{% endif %}">Timeline</a>
                <a href="links.html" class="nav-link {% if page == 'links' %}active{% endif %}">Links</a>
                <a href="users.html" class="nav-link {% if page == 'users' %}active{% endif %}">Users</a>
            </nav>
        </div>
    </header>

    <main class="main-content">
        {% block content %}{% endblock %}
    </main>

    <footer class="site-footer">
        <p>Generated on {{ generated_at }} by <a href="https://github.com/avsm/thicket">Thicket</a></p>
    </footer>

    <script src="js/script.js"></script>
</body>
</html>

<file path="src/thicket/templates/index.html">
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{{ title }}</title>
    <meta http-equiv="refresh" content="0; url=timeline.html">
    <link rel="canonical" href="timeline.html">
</head>
<body>
    <p>Redirecting to <a href="timeline.html">Timeline</a>...</p>
</body>
</html>

<file path="src/thicket/templates/links.html">
{% extends "base.html" %}

{% block page_title %}Outgoing Links - {{ title }}{% endblock %}

{% block content %}
<div class="page-content">
    <h2>Outgoing Links</h2>
    <p class="page-description">External links referenced in blog posts, ordered by most recent reference.</p>

    {% for link in outgoing_links %}
    <article class="link-group">
        <h3>
            <a href="{{ link.url }}" target="_blank">{{ link.url|truncate(80) }}</a>
            {% if link.target_username %}
            <span class="target-user">({{ link.target_username }})</span>
            {% endif %}
        </h3>
        <div class="referencing-entries">
            <span class="ref-count">Referenced in {{ link.entries|length }} post(s):</span>
            <ul>
                {% for display_name, entry in link.entries[:5] %}
                <li>
                    <span class="author">{{ display_name }}</span> -
                    <a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a>
                    <time datetime="{{ entry.updated or entry.published }}">
                        ({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }})
                    </time>
                </li>
                {% endfor %}
                {% if link.entries|length > 5 %}
                <li class="more">... and {{ link.entries|length - 5 }} more</li>
                {% endif %}
            </ul>
        </div>
    </article>
    {% endfor %}
</div>
{% endblock %}

<file path="src/thicket/templates/script.js">
// Enhanced functionality for thicket website

document.addEventListener('DOMContentLoaded', function() {
    // Enhance thread collapsing (optional feature)
    const threadHeaders = document.querySelectorAll('.thread-header');
    threadHeaders.forEach(header => {
        header.style.cursor = 'pointer';
        header.addEventListener('click', function() {
            const thread = this.parentElement;
            const entries = thread.querySelectorAll('.thread-entry');

            // Toggle visibility of all but the first entry
            for (let i = 1; i < entries.length; i++) {
                entries[i].style.display = entries[i].style.display === 'none' ? 'block' : 'none';
            }

            // Update thread count text
            const count = this.querySelector('.thread-count');
            if (count) {
                if (entries[1] && entries[1].style.display === 'none') {
                    count.textContent = count.textContent.replace('posts', 'posts (collapsed)');
                } else {
                    count.textContent = count.textContent.replace(' (collapsed)', '');
                }
            }
        });
    });

    // Add relative time display
    const timeElements = document.querySelectorAll('time');
    timeElements.forEach(timeEl => {
        const datetime = new Date(timeEl.getAttribute('datetime'));
        const now = new Date();
        const diffMs = now - datetime;
        const diffDays = Math.floor(diffMs / (1000 * 60 * 60 * 24));
        const diffHours = Math.floor(diffMs / (1000 * 60 * 60));
        const diffMinutes = Math.floor(diffMs / (1000 * 60));

        let relativeTime;
        if (diffHours === 0) {
            relativeTime = diffMinutes === 0 ? 'just now' : `${diffMinutes}m ago`;
        } else if (diffDays === 0) {
            relativeTime = `${diffHours}h ago`;
        } else if (diffDays === 1) {
            relativeTime = 'yesterday';
        } else if (diffDays < 7) {
            relativeTime = `${diffDays}d ago`;
        } else if (diffDays < 30) {
            const weeks = Math.floor(diffDays / 7);
            relativeTime = weeks === 1 ? '1w ago' : `${weeks}w ago`;
        } else if (diffDays < 365) {
            const months = Math.floor(diffDays / 30);
            relativeTime = months === 1 ? '1mo ago' : `${months}mo ago`;
        } else {
            const years = Math.floor(diffDays / 365);
            relativeTime = years === 1 ? '1y ago' : `${years}y ago`;
        }

        // Add relative time as title attribute
        timeEl.setAttribute('title', timeEl.textContent);
        timeEl.textContent = relativeTime;
    });

    // Enhanced anchor link scrolling for shared references
    document.querySelectorAll('a[href^="#"]').forEach(anchor => {
        anchor.addEventListener('click', function (e) {
            const target = document.querySelector(this.getAttribute('href'));
            if (target) {
                e.preventDefault();
                target.scrollIntoView({ behavior: 'smooth' });

                // Highlight the target briefly
                const timelineEntry = target.closest('.timeline-entry');
                if (timelineEntry) {
                    timelineEntry.style.outline = '2px solid var(--primary-color)';
                    timelineEntry.style.borderRadius = '8px';
                    setTimeout(() => {
                        timelineEntry.style.outline = '';
                        timelineEntry.style.borderRadius = '';
                    }, 2000);
                }
            }
        });
    });
});

<file path="src/thicket/templates/style.css">
/* Modern, clean design with high-density text and readable theme */
--primary-color: #2c3e50;
--secondary-color: #3498db;
--accent-color: #e74c3c;
--text-primary: #2c3e50;
--text-secondary: #7f8c8d;
--border-color: #e0e0e0;
box-sizing: border-box;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif;
color: var(--text-primary);
background-color: var(--background);
background-color: var(--surface);
border-bottom: 1px solid var(--border-color);
max-width: var(--max-width);
justify-content: space-between;
color: var(--primary-color);
color: var(--text-secondary);
padding: 0.5rem 0.75rem;
transition: all 0.2s ease;
color: var(--primary-color);
background-color: var(--background);
color: var(--secondary-color);
background-color: var(--background);
max-width: var(--max-width);
color: var(--text-secondary);
margin-bottom: 0.75rem;
color: var(--primary-color);
margin-bottom: 0.75rem;
color: var(--primary-color);
/* Entries and Threads */
background-color: var(--surface);
border: 1px solid var(--border-color);
/* Timeline-style entries */
padding: 0.5rem 0.75rem;
background: transparent;
transition: background-color 0.2s ease;
.timeline-entry:hover {
background-color: var(--surface);
color: var(--text-secondary);
margin-bottom: 0.25rem;
font-family: 'SF Mono', Monaco, Consolas, 'Courier New', monospace;
color: var(--text-secondary);
color: var(--primary-color);
.timeline-author:hover {
color: var(--secondary-color);
text-decoration: underline;
color: var(--primary-color);
.timeline-title a:hover {
color: var(--secondary-color);
text-decoration: underline;
color: var(--text-secondary);
/* Legacy styles for other sections */
.entry-meta, .thread-header {
color: var(--text-secondary);
color: var(--primary-color);
color: var(--primary-color);
color: var(--secondary-color);
text-decoration: underline;
color: var(--text-primary);
/* Enhanced Threading Styles */
/* Conversation Clusters */
.conversation-cluster {
background-color: var(--background);
border: 2px solid var(--border-color);
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
background: linear-gradient(135deg, var(--surface) 0%, #f1f3f4 100%);
border-bottom: 1px solid var(--border-color);
justify-content: space-between;
color: var(--secondary-color);
.conversation-participants {
color: var(--text-secondary);
/* Threaded Conversation Entries */
margin-bottom: 0.75rem;
align-items: flex-start;
.conversation-entry.level-0 {
.conversation-entry.level-1 {
.conversation-entry.level-2 {
.conversation-entry.level-3 {
.conversation-entry.level-4 {
background-color: var(--secondary-color);
.conversation-entry.level-0 .entry-connector {
background-color: var(--accent-color);
background-color: var(--surface);
border: 1px solid var(--border-color);
transition: all 0.2s ease;
border-color: var(--secondary-color);
box-shadow: 0 2px 8px rgba(52, 152, 219, 0.1);
/* Reference Indicators */
.reference-indicators {
background-color: #e8f5e8;
background-color: #e8f0ff;
/* Reference Badges for Individual Posts */
.timeline-entry.with-references {
background-color: var(--surface);
/* Conversation posts in unified timeline */
.timeline-entry.conversation-post {
background: transparent;
padding: 0.5rem 0.75rem;
.timeline-entry.conversation-post.level-0 {
border-left: 2px solid var(--accent-color);
.timeline-entry.conversation-post.level-1 {
border-left: 2px solid var(--secondary-color);
.timeline-entry.conversation-post.level-2 {
border-left: 2px solid var(--text-secondary);
.timeline-entry.conversation-post.level-3 {
border-left: 2px solid var(--text-secondary);
.timeline-entry.conversation-post.level-4 {
border-left: 2px solid var(--text-secondary);
/* Cross-thread linking */
border-top: 1px solid var(--border-color);
.cross-thread-indicator {
color: var(--text-secondary);
background-color: var(--surface);
padding: 0.25rem 0.5rem;
border: 1px solid var(--border-color);
/* Inline shared references styling */
color: var(--text-secondary);
color: var(--primary-color);
transition: color 0.2s ease;
.shared-ref-link:hover {
color: var(--secondary-color);
text-decoration: underline;
color: var(--text-secondary);
.user-anchor, .post-anchor {
margin-top: -60px; /* Offset for fixed header */
color: var(--primary-color);
transition: color 0.2s ease;
.cross-thread-link:hover {
color: var(--secondary-color);
text-decoration: underline;
padding: 0.1rem 0.4rem;
text-transform: uppercase;
letter-spacing: 0.05em;
.ref-badge.ref-outbound {
background-color: #e8f5e8;
border: 1px solid #c3e6c3;
.ref-badge.ref-inbound {
background-color: #e8f0ff;
border: 1px solid #b3d9ff;
/* Author Color Coding */
.timeline-author::before {
background-color: var(--secondary-color);
/* Generate consistent colors for authors */
.author-avsm::before { background-color: #e74c3c; }
.author-mort::before { background-color: #3498db; }
.author-mte::before { background-color: #2ecc71; }
.author-ryan::before { background-color: #f39c12; }
.author-mwd::before { background-color: #9b59b6; }
.author-dra::before { background-color: #1abc9c; }
.author-pf341::before { background-color: #34495e; }
.author-sadiqj::before { background-color: #e67e22; }
.author-martinkl::before { background-color: #8e44ad; }
.author-jonsterling::before { background-color: #27ae60; }
.author-jon::before { background-color: #f1c40f; }
.author-onkar::before { background-color: #e91e63; }
.author-gabriel::before { background-color: #00bcd4; }
.author-jess::before { background-color: #ff5722; }
.author-ibrahim::before { background-color: #607d8b; }
.author-andres::before { background-color: #795548; }
.author-eeg::before { background-color: #ff9800; }
.conversations-section h3,
.referenced-posts-section h3,
.individual-posts-section h3 {
border-bottom: 2px solid var(--border-color);
padding-bottom: 0.5rem;
.conversations-section h3::before {
.referenced-posts-section h3::before {
.individual-posts-section h3::before {
/* Legacy thread styles (for backward compatibility) */
background-color: var(--background);
border: 1px solid var(--border-color);
background-color: var(--surface);
padding: 0.5rem 0.75rem;
border-bottom: 1px solid var(--border-color);
color: var(--secondary-color);
padding: 0.5rem 0.75rem;
border-bottom: 1px solid var(--border-color);
.thread-entry:last-child {
margin-left: var(--thread-indent);
border-left: 3px solid var(--secondary-color);
background-color: var(--surface);
background-color: var(--background);
word-break: break-word;
color: var(--secondary-color);
text-decoration: underline;
color: var(--text-secondary);
color: var(--text-secondary);
.referencing-entries ul {
.referencing-entries li {
margin-bottom: 0.25rem;
.referencing-entries .more {
color: var(--text-secondary);
background-color: var(--background);
margin-bottom: 0.25rem;
color: var(--text-secondary);
color: var(--text-secondary);
color: var(--secondary-color);
text-decoration: underline;
color: var(--text-secondary);
margin-bottom: 0.25rem;
max-width: var(--max-width);
margin: 3rem auto 2rem;
color: var(--text-secondary);
border-top: 1px solid var(--border-color);
color: var(--secondary-color);
text-decoration: underline;
@media (max-width: 768px) {
flex-direction: column;
align-items: flex-start;
margin-left: calc(var(--thread-indent) / 2);
flex-direction: column;

<file path="src/thicket/templates/timeline.html">
{% extends "base.html" %}

{% block page_title %}Timeline - {{ title }}{% endblock %}

{% block content %}
{% set seen_users = [] %}
<div class="page-content">
    <h2>Recent Posts & Conversations</h2>

    <section class="unified-timeline">
        {% for item in timeline_items %}
        {% if item.type == "post" %}
        <!-- Individual Post -->
        <article class="timeline-entry {% if item.content.references %}with-references{% endif %}">
            <div class="timeline-meta">
                <time datetime="{{ item.content.entry.updated or item.content.entry.published }}" class="timeline-time">
                    {{ (item.content.entry.updated or item.content.entry.published).strftime('%Y-%m-%d %H:%M') }}
                </time>
                {% set homepage = get_user_homepage(item.content.username) %}
                {% if item.content.username not in seen_users %}
                <a id="{{ item.content.username }}" class="user-anchor"></a>
                {% set _ = seen_users.append(item.content.username) %}
                {% endif %}
                <a id="post-{{ loop.index0 }}-{{ safe_anchor_id(item.content.entry.id) }}" class="post-anchor"></a>
                {% if homepage %}
                <a href="{{ homepage }}" target="_blank" class="timeline-author">{{ item.content.display_name }}</a>
                {% else %}
                <span class="timeline-author">{{ item.content.display_name }}</span>
                {% endif %}
                {% if item.content.references %}
                <div class="reference-badges">
                    {% for ref in item.content.references %}
                    {% if ref.type == 'outbound' %}
                    <span class="ref-badge ref-outbound" title="References {{ ref.target_username or 'external post' }}">
                        → {{ ref.target_username or 'ext' }}
                    </span>
                    {% elif ref.type == 'inbound' %}
                    <span class="ref-badge ref-inbound" title="Referenced by {{ ref.source_username or 'external post' }}">
                        ← {{ ref.source_username or 'ext' }}
                    </span>
                    {% endif %}
                    {% endfor %}
                </div>
                {% endif %}
            </div>
            <div class="timeline-content">
                <strong class="timeline-title">
                    <a href="{{ item.content.entry.link }}" target="_blank">{{ item.content.entry.title }}</a>
                </strong>
                {% if item.content.entry.summary %}
                <span class="timeline-summary">— {{ clean_html_summary(item.content.entry.summary, 250) }}</span>
                {% endif %}
                {% if item.content.shared_references %}
                <span class="inline-shared-refs">
                    {% for ref in item.content.shared_references[:3] %}
                    {% if ref.target_username %}
                    <a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %}
                    {% endif %}
                    {% endfor %}
                    {% if item.content.shared_references|length > 3 %}
                    <span class="shared-ref-more">+{{ item.content.shared_references|length - 3 }} more</span>
                    {% endif %}
                </span>
                {% endif %}
                {% if item.content.cross_thread_links %}
                <div class="cross-thread-links">
                    <span class="cross-thread-indicator">🔗 Also appears: </span>
                    {% for link in item.content.cross_thread_links %}
                    <a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %}
                    {% endfor %}
                </div>
                {% endif %}
            </div>
        </article>
        {% elif item.type == "thread" %}
        <!-- Conversation Thread -->
        {% set outer_loop_index = loop.index0 %}
        {% for thread_item in item.content %}
        <article class="timeline-entry conversation-post level-{{ thread_item.thread_level }}">
            <div class="timeline-meta">
                <time datetime="{{ thread_item.entry.updated or thread_item.entry.published }}" class="timeline-time">
                    {{ (thread_item.entry.updated or thread_item.entry.published).strftime('%Y-%m-%d %H:%M') }}
                </time>
                {% set homepage = get_user_homepage(thread_item.username) %}
                {% if thread_item.username not in seen_users %}
                <a id="{{ thread_item.username }}" class="user-anchor"></a>
                {% set _ = seen_users.append(thread_item.username) %}
                {% endif %}
                <a id="post-{{ outer_loop_index }}-{{ loop.index0 }}-{{ safe_anchor_id(thread_item.entry.id) }}" class="post-anchor"></a>
                {% if homepage %}
                <a href="{{ homepage }}" target="_blank" class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</a>
                {% else %}
                <span class="timeline-author author-{{ thread_item.username }}">{{ thread_item.display_name }}</span>
                {% endif %}
                {% if thread_item.references_to or thread_item.referenced_by %}
                <span class="reference-indicators">
                    {% if thread_item.references_to %}
                    <span class="ref-out" title="References other posts">→</span>
                    {% endif %}
                    {% if thread_item.referenced_by %}
                    <span class="ref-in" title="Referenced by other posts">←</span>
                    {% endif %}
                </span>
                {% endif %}
            </div>
            <div class="timeline-content">
                <strong class="timeline-title">
                    <a href="{{ thread_item.entry.link }}" target="_blank">{{ thread_item.entry.title }}</a>
                </strong>
                {% if thread_item.entry.summary %}
                <span class="timeline-summary">— {{ clean_html_summary(thread_item.entry.summary, 300) }}</span>
                {% endif %}
                {% if thread_item.shared_references %}
                <span class="inline-shared-refs">
                    {% for ref in thread_item.shared_references[:3] %}
                    {% if ref.target_username %}
                    <a href="#{{ ref.target_username }}" class="shared-ref-link" title="Referenced by {{ ref.count }} entries">@{{ ref.target_username }}</a>{% if not loop.last %}, {% endif %}
                    {% endif %}
                    {% endfor %}
                    {% if thread_item.shared_references|length > 3 %}
                    <span class="shared-ref-more">+{{ thread_item.shared_references|length - 3 }} more</span>
                    {% endif %}
                </span>
                {% endif %}
                {% if thread_item.cross_thread_links %}
                <div class="cross-thread-links">
                    <span class="cross-thread-indicator">🔗 Also appears: </span>
                    {% for link in thread_item.cross_thread_links %}
                    <a href="#{{ link.anchor_id }}" class="cross-thread-link" title="{{ link.title }}">{{ link.context }}</a>{% if not loop.last %}, {% endif %}
                    {% endfor %}
                </div>
                {% endif %}
            </div>
        </article>
        {% endfor %}
        {% endif %}
        {% endfor %}
    </section>
</div>
{% endblock %}

<file path="src/thicket/templates/users.html">
{% extends "base.html" %}

{% block page_title %}Users - {{ title }}{% endblock %}

{% block content %}
<div class="page-content">
    <h2>Users</h2>
    <p class="page-description">All users contributing to this thicket, ordered by post count.</p>

    {% for user_info in users %}
    <article class="user-card">
        <div class="user-header">
            {% if user_info.metadata.icon and user_info.metadata.icon != "None" %}
            <img src="{{ user_info.metadata.icon }}" alt="{{ user_info.metadata.username }}" class="user-icon">
            {% endif %}
            <div class="user-info">
                <h3>
                    {% if user_info.metadata.display_name %}
                    {{ user_info.metadata.display_name }}
                    <span class="username">({{ user_info.metadata.username }})</span>
                    {% else %}
                    {{ user_info.metadata.username }}
                    {% endif %}
                </h3>
                <div class="user-meta">
                    {% if user_info.metadata.homepage %}
                    <a href="{{ user_info.metadata.homepage }}" target="_blank">{{ user_info.metadata.homepage }}</a>
                    {% endif %}
                    {% if user_info.metadata.email %}
                    <span class="separator">•</span>
                    <a href="mailto:{{ user_info.metadata.email }}">{{ user_info.metadata.email }}</a>
                    {% endif %}
                    <span class="separator">•</span>
                    <span class="post-count">{{ user_info.metadata.entry_count }} posts</span>
                </div>
            </div>
        </div>
        {% if user_info.recent_entries %}
        <div class="user-recent">
            <ul>
                {% for display_name, entry in user_info.recent_entries %}
                <li>
                    <a href="{{ entry.link }}" target="_blank">{{ entry.title }}</a>
                    <time datetime="{{ entry.updated or entry.published }}">
                        ({{ (entry.updated or entry.published).strftime('%Y-%m-%d') }})
                    </time>
                </li>
                {% endfor %}
            </ul>
        </div>
        {% endif %}
    </article>
    {% endfor %}
</div>
{% endblock %}

<file path="README.md">
# Thicket

A modern CLI tool for persisting Atom/RSS feeds in Git repositories, designed to enable distributed weblog comment structures.

## Features

- **Feed Auto-Discovery**: Automatically extracts user metadata from Atom/RSS feeds
- **Git Storage**: Stores feed entries in a Git repository with full history
- **Duplicate Management**: Manual curation of duplicate entries across feeds
- **Modern CLI**: Built with Typer and Rich for beautiful terminal output
- **Comprehensive Parsing**: Supports RSS 0.9x, RSS 1.0, RSS 2.0, and Atom feeds
- **Cron-Friendly**: Designed for scheduled execution

## Installation

```bash
pip install thicket

# Or install with dev dependencies
pip install "thicket[dev]"
```

## Quick Start

1. **Initialize a new thicket repository:**

   `thicket init ./my-feeds`

2. **Add a user with their feed:**

   `thicket add user "alice" --feed "https://alice.example.com/feed.xml"`

3. **Sync feeds to download entries:**

   `thicket sync --all`

4. **List users and feeds:**

   `thicket list users`

## Commands

### Initialize

```bash
thicket init <git-store-path> [--cache-dir <path>] [--config <config-file>]
```

### Add Users and Feeds

```bash
# Add user with auto-discovery
thicket add user "username" --feed "https://example.com/feed.xml"

# Add user with manual metadata
thicket add user "username" \
    --feed "https://example.com/feed.xml" \
    --email "user@example.com" \
    --homepage "https://example.com" \
    --display-name "User Name"

# Add additional feed to existing user
thicket add feed "username" "https://example.com/other-feed.xml"
```

### Sync Feeds

```bash
# Sync a single user
thicket sync --user "username"

# Dry run (preview changes)
thicket sync --all --dry-run
```

### List

```bash
# List feeds for specific user
thicket list feeds --user "username"

# List recent entries
thicket list entries --limit 20

# List entries for specific user
thicket list entries --user "username"
```

### Duplicates

```bash
# List duplicate mappings
thicket duplicates list

# Mark entries as duplicates
thicket duplicates add "https://example.com/dup" "https://example.com/canonical"

# Remove duplicate mapping
thicket duplicates remove "https://example.com/dup"
```

## Configuration

Thicket uses a YAML configuration file (default: `thicket.yaml`):

```yaml
git_store: ./feeds-repo
cache_dir: ~/.cache/thicket
users:
  alice:
    feeds:
      - https://alice.example.com/feed.xml
    email: alice@example.com
    homepage: https://alice.example.com
```

## Git Repository Structure

```
feeds-repo/
├── index.json          # User directory index
├── duplicates.json     # Duplicate entry mappings
└── <username>/
    ├── metadata.json   # User metadata
    ├── entry_id_1.json # Feed entries
    └── entry_id_2.json
```

## Development

```bash
# Install in development mode
pip install -e ".[dev]"
```

## Architecture

- **CLI**: Modern interface with Typer and Rich
- **Feed Processing**: Universal parsing with feedparser
- **Git Storage**: Structured storage with GitPython
- **Data Models**: Pydantic for validation and serialization
- **Async HTTP**: httpx for efficient feed fetching

## Use Cases

- **Blog Aggregation**: Collect and archive blog posts from multiple sources
- **Comment Networks**: Enable distributed commenting systems
- **Feed Archival**: Preserve feed history beyond typical feed depth limits
- **Content Curation**: Manage and deduplicate content across feeds

## License

MIT License - see LICENSE file for details.

<file path="src/thicket/cli/commands/index_cmd.py">
"""CLI command for building reference index from blog entries."""

import json
from pathlib import Path
from typing import Optional

import typer
from rich.console import Console
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
)
from rich.table import Table

from ...core.git_store import GitStore
from ...core.reference_parser import ReferenceIndex, ReferenceParser
from ..utils import get_tsv_mode, load_config

console = Console()


def index(
    config_file: Optional[Path] = typer.Option(
        None,
        "--config",
        help="Path to configuration file",
    ),
    output_file: Optional[Path] = typer.Option(
        None,
        help="Path to output index file (default: updates links.json in git store)",
    ),
    verbose: bool = typer.Option(
        False,
        help="Show detailed progress information",
    ),
) -> None:
    """Build a reference index showing which blog entries reference others.

    This command analyzes all blog entries to detect cross-references between
    different blogs, creating an index that can be used to build threaded
    views of related content.

    Updates the unified links.json file with reference data.
    """
    try:
        config = load_config(config_file)
        git_store = GitStore(config.git_store)

        # Initialize reference parser
        parser = ReferenceParser()

        # Build user domain mapping
        console.print("Building user domain mapping...")
        user_domains = parser.build_user_domain_mapping(git_store)
        console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")

        # Initialize reference index
        ref_index = ReferenceIndex()
        ref_index.user_domains = user_domains

        store_index = git_store._load_index()
        users = list(store_index.users.keys())
        if not users:
            console.print("[yellow]No users found in Git store[/yellow]")
            return

        all_references = []
        total_entries = 0
        entry_counts: dict[str, int] = {}

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as progress:
            # Count total entries first
            counting_task = progress.add_task("Counting entries...", total=len(users))
            for username in users:
                entries = git_store.list_entries(username)
                entry_counts[username] = len(entries)
                total_entries += len(entries)
                progress.advance(counting_task)
            progress.remove_task(counting_task)

            # Process entries - extract references
            processing_task = progress.add_task(
                f"Extracting references from {total_entries} entries...",
                total=total_entries,
            )
            for username in users:
                entries = git_store.list_entries(username)
                for entry in entries:
                    # Extract references from this entry
                    references = parser.extract_references(entry, username, user_domains)
                    all_references.extend(references)
                    progress.advance(processing_task)
                    if verbose and references:
                        console.print(f"  Found {len(references)} references in {username}:{entry.title[:50]}...")
            progress.remove_task(processing_task)

            # Resolve target_entry_ids for references
            resolve_task = progress.add_task(
                f"Resolving {len(all_references)} references...",
                total=len(all_references)
            )
            if verbose:
                console.print(f"Resolving target entry IDs for {len(all_references)} references...")
            resolved_references = parser.resolve_target_entry_ids(all_references, git_store)

            # Count resolved references
            resolved_count = sum(1 for ref in resolved_references if ref.target_entry_id is not None)
            console.print(f"Resolved {resolved_count} out of {len(all_references)} references")

            # Add resolved references to index
            for ref in resolved_references:
                ref_index.add_reference(ref)
                progress.advance(resolve_task)
            progress.remove_task(resolve_task)

        # Determine output path
        output_path = output_file
        if output_path is None:
            output_path = config.git_store / "links.json"

        # Load existing links data or create new structure
        if output_path.exists() and not output_file:
            # Load existing unified structure
            with open(output_path) as f:
                existing_data = json.load(f)
        else:
            existing_data = {}

        # Update with reference data
        existing_data["references"] = ref_index.to_dict()["references"]
        existing_data["user_domains"] = {k: list(v) for k, v in user_domains.items()}

        # Save updated structure
        with open(output_path, "w") as f:
            json.dump(existing_data, f, indent=2, default=str)

        console.print("\n[green]✓ Reference index built successfully[/green]")

        total_references = len(ref_index.references)

        # Create summary table or TSV output
        if get_tsv_mode():
            print(f"Total Users\t{len(users)}")
            print(f"Total Entries\t{total_entries}")
            print(f"Total References\t{total_references}")
            print(f"Outbound Refs\t{len(ref_index.outbound_refs)}")
            print(f"Inbound Refs\t{len(ref_index.inbound_refs)}")
            print(f"Output File\t{output_path}")
        else:
            table = Table(title="Reference Index Summary")
            table.add_column("Metric", style="cyan")
            table.add_column("Count", style="green")
            table.add_row("Total Users", str(len(users)))
            table.add_row("Total Entries", str(total_entries))
            table.add_row("Total References", str(total_references))
            table.add_row("Outbound Refs", str(len(ref_index.outbound_refs)))
            table.add_row("Inbound Refs", str(len(ref_index.inbound_refs)))
            table.add_row("Output File", str(output_path))
            console.print(table)

        # Show some interesting statistics
        if total_references > 0:
            console.print("\n[bold]Reference Statistics:[/bold]")

            # Most referenced users
            target_counts: dict[str, int] = {}
            unresolved_domains = set()
            for ref in ref_index.references:
                if ref.target_username:
                    target_counts[ref.target_username] = target_counts.get(ref.target_username, 0) + 1
                else:
                    # Track unresolved domains
                    from urllib.parse import urlparse
                    domain = urlparse(ref.target_url).netloc.lower()
                    unresolved_domains.add(domain)

            if get_tsv_mode():
                print("Referenced User\tReference Count")
                for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
                    print(f"{username}\t{count}")
            else:
                console.print("\nMost referenced users:")
                for username, count in sorted(target_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
                    console.print(f"  {username}: {count} references")

            if unresolved_domains and verbose:
                if get_tsv_mode():
                    print("Unresolved Domain\tCount")
                    for domain in sorted(list(unresolved_domains)[:10]):
                        print(f"{domain}\t-")
                    if len(unresolved_domains) > 10:
                        print(f"... and {len(unresolved_domains) - 10} more\t...")
                else:
                    console.print(f"\nUnresolved domains: {len(unresolved_domains)}")
                    for domain in sorted(list(unresolved_domains)[:10]):
                        console.print(f"  {domain}")
                    if len(unresolved_domains) > 10:
                        console.print(f"  ... and {len(unresolved_domains) - 10} more")
    except Exception as e:
        console.print(f"[red]Error building reference index: {e}[/red]")
        console.print_exception()
        raise typer.Exit(1) from e

def threads(
    config_file: Optional[Path] = typer.Option(
        None,
        "--config",
        help="Path to configuration file",
    ),
    index_file: Optional[Path] = typer.Option(
        None,
        help="Path to reference index file (default: links.json in git store)",
    ),
    username: Optional[str] = typer.Option(
        None,
        help="Show threads for specific username only",
    ),
    entry_id: Optional[str] = typer.Option(
        None,
        help="Show thread for specific entry ID",
    ),
    min_size: int = typer.Option(
        2,
        help="Minimum thread size to display",
    ),
) -> None:
    """Show threaded view of related blog entries.

    This command uses the reference index to show which blog entries
    are connected through cross-references, creating an email-style
    threaded view of the conversation.

    Reads reference data from the unified links.json file.
    """
    try:
        config = load_config(config_file)

        # Determine index file path
        index_path = index_file
        if index_path is None:
            index_path = config.git_store / "links.json"

        if not index_path.exists():
            console.print(f"[red]Links file not found: {index_path}[/red]")
            console.print("Run 'thicket links' and 'thicket index' first to build the reference index")
            raise typer.Exit(1)

        with open(index_path) as f:
            unified_data = json.load(f)

        # Check if references exist in the unified structure
        if "references" not in unified_data:
            console.print(f"[red]No references found in {index_path}[/red]")
            console.print("Run 'thicket index' first to build the reference index")
            raise typer.Exit(1)

        # Extract reference data and reconstruct ReferenceIndex
        ref_index = ReferenceIndex.from_dict({
            "references": unified_data["references"],
            "user_domains": unified_data.get("user_domains", {})
        })

        # Initialize Git store to get entry details
        git_store = GitStore(config.git_store)

        if entry_id and username:
            thread_members = ref_index.get_thread_members(username, entry_id)
            _display_thread(thread_members, ref_index, git_store, f"Thread for {username}:{entry_id}")
        elif username:
            # Show all threads involving this user
            user_index = git_store._load_index()
            user = user_index.get_user(username)
            if not user:
                console.print(f"[red]User not found: {username}[/red]")
                raise typer.Exit(1)

            entries = git_store.list_entries(username)
            console.print(f"[bold]Threads involving {username}:[/bold]\n")
            threads_found: set = set()
            for entry in entries:
                thread_members = ref_index.get_thread_members(username, entry.id)
                if len(thread_members) >= min_size:
                    thread_key = tuple(sorted(thread_members))
                    if thread_key not in threads_found:
                        threads_found.add(thread_key)
                        _display_thread(thread_members, ref_index, git_store, f"Thread #{len(threads_found)}")
        else:
            console.print("[bold]All conversation threads:[/bold]\n")
            processed_entries = set()
            all_threads: set = set()
            user_index = git_store._load_index()
            for username in user_index.users.keys():
                entries = git_store.list_entries(username)
                for entry in entries:
                    entry_key = (username, entry.id)
                    if entry_key in processed_entries:
                        continue
                    thread_members = ref_index.get_thread_members(username, entry.id)
                    if len(thread_members) >= min_size:
                        thread_key = tuple(sorted(thread_members))
                        if thread_key not in all_threads:
                            all_threads.add(thread_key)
                            _display_thread(thread_members, ref_index, git_store, f"Thread #{len(all_threads)}")
                        # Mark all members as processed
                        for member in thread_members:
                            processed_entries.add(member)

            if not all_threads:
                console.print("[yellow]No conversation threads found[/yellow]")
                console.print(f"(minimum thread size: {min_size})")
    except Exception as e:
        console.print(f"[red]Error showing threads: {e}[/red]")
        raise typer.Exit(1) from e


def _display_thread(thread_members, ref_index, git_store, title):
    """Display a single conversation thread."""
    console.print(f"[bold cyan]{title}[/bold cyan]")
    console.print(f"Thread size: {len(thread_members)} entries")

    # Get entry details for each member
    thread_entries = []
    for username, entry_id in thread_members:
        entry = git_store.get_entry(username, entry_id)
        if entry:
            thread_entries.append((username, entry))

    # Sort by publication date
    thread_entries.sort(key=lambda x: x[1].published or x[1].updated)

    for i, (username, entry) in enumerate(thread_entries):
        prefix = "├─" if i < len(thread_entries) - 1 else "└─"

        # Get references for this entry
        outbound = ref_index.get_outbound_refs(username, entry.id)
        inbound = ref_index.get_inbound_refs(username, entry.id)
        ref_info = ""
        if outbound or inbound:
            ref_info = f" ({len(outbound)} out, {len(inbound)} in)"

        console.print(f"  {prefix} [{username}] {entry.title[:60]}...{ref_info}")
        if entry.published:
            console.print(f"      Published: {entry.published.strftime('%Y-%m-%d')}")

    console.print()  # Empty line after each thread
-
<file path="src/thicket/cli/commands/info_cmd.py">
-
"""CLI command for displaying detailed information about a specific atom entry."""
-
from pathlib import Path
-
from typing import Optional
-
from rich.console import Console
-
from rich.panel import Panel
-
from rich.table import Table
-
from rich.text import Text
-
from ...core.git_store import GitStore
-
from ...core.reference_parser import ReferenceIndex
-
from ..utils import load_config, get_tsv_mode
-
identifier: str = typer.Argument(
-
help="The atom ID or URL of the entry to display information about"
-
username: Optional[str] = typer.Option(
-
help="Username to search for the entry (if not provided, searches all users)"
-
config_file: Optional[Path] = typer.Option(
-
help="Path to configuration file",
-
show_content: bool = typer.Option(
-
help="Include the full content of the entry in the output"
-
"""Display detailed information about a specific atom entry.
-
You can specify the entry using either its atom ID or URL.
-
Shows all metadata for the given entry, including title, dates, categories,
-
and summarizes all inbound and outbound links to/from other posts.
-
config = load_config(config_file)
-
git_store = GitStore(config.git_store)
-
# Check if identifier looks like a URL
-
is_url = identifier.startswith(('http://', 'https://'))
-
# Search specific username
-
entries = git_store.list_entries(username)
-
if str(e.link) == identifier:
-
found_username = username
-
entry = git_store.get_entry(username, identifier)
-
found_username = username
-
index = git_store._load_index()
-
for user in index.users.keys():
-
entries = git_store.list_entries(user)
-
if str(e.link) == identifier:
-
entry = git_store.get_entry(user, identifier)
-
if not entry or not found_username:
-
console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found for user '{username}'[/red]")
-
console.print(f"[red]Entry with {'URL' if is_url else 'atom ID'} '{identifier}' not found in any user's entries[/red]")
-
# Load reference index if available
-
links_path = config.git_store / "links.json"
-
if links_path.exists():
-
with open(links_path) as f:
-
unified_data = json.load(f)
-
# Check if references exist in the unified structure
-
if "references" in unified_data:
-
ref_index = ReferenceIndex.from_dict({
-
"references": unified_data["references"],
-
"user_domains": unified_data.get("user_domains", {})
-
_display_entry_info_tsv(entry, found_username, ref_index, show_content)
-
_display_entry_info(entry, found_username)
-
_display_link_info(entry, found_username, ref_index)
-
console.print("\n[yellow]No reference index found. Run 'thicket links' and 'thicket index' to build cross-reference data.[/yellow]")
-
# Optionally display content
-
if show_content and entry.content:
-
_display_content(entry.content)
-
console.print(f"[red]Error displaying entry info: {e}[/red]")
-
def _display_entry_info(entry, username: str) -> None:
-
"""Display basic entry information in a structured format."""
-
# Create main info panel
-
info_table = Table.grid(padding=(0, 2))
-
info_table.add_column("Field", style="cyan bold", width=15)
-
info_table.add_column("Value", style="white")
-
info_table.add_row("User", f"[green]{username}[/green]")
-
info_table.add_row("Atom ID", f"[blue]{entry.id}[/blue]")
-
info_table.add_row("Title", entry.title)
-
info_table.add_row("Link", str(entry.link))
-
info_table.add_row("Published", entry.published.strftime("%Y-%m-%d %H:%M:%S UTC"))
-
info_table.add_row("Updated", entry.updated.strftime("%Y-%m-%d %H:%M:%S UTC"))
-
# Truncate long summaries
-
summary = (entry.summary[:200] + "...") if entry.summary and len(entry.summary) > 200 else (entry.summary or "")
-
info_table.add_row("Summary", summary)
-
categories_text = ", ".join(entry.categories)
-
info_table.add_row("Categories", categories_text)
-
if "name" in entry.author:
-
author_info.append(entry.author["name"])
-
if "email" in entry.author:
-
author_info.append(f"<{entry.author['email']}>")
-
info_table.add_row("Author", " ".join(author_info))
-
info_table.add_row("Content Type", entry.content_type)
-
info_table.add_row("Rights", entry.rights)
-
info_table.add_row("Source Feed", entry.source)
-
title=f"[bold]Entry Information[/bold]",
-
def _display_link_info(entry, username: str, ref_index: ReferenceIndex) -> None:
-
"""Display inbound and outbound link information."""
-
outbound_refs = ref_index.get_outbound_refs(username, entry.id)
-
inbound_refs = ref_index.get_inbound_refs(username, entry.id)
-
if not outbound_refs and not inbound_refs:
-
console.print("\n[dim]No cross-references found for this entry.[/dim]")
-
links_table = Table(title="Cross-References")
-
links_table.add_column("Direction", style="cyan", width=10)
-
links_table.add_column("Target/Source", style="green", width=20)
-
links_table.add_column("URL", style="blue", width=50)
-
# Add outbound references
-
for ref in outbound_refs:
-
target_info = f"{ref.target_username}:{ref.target_entry_id}" if ref.target_username and ref.target_entry_id else "External"
-
links_table.add_row("โ Out", target_info, ref.target_url)
-
# Add inbound references
-
for ref in inbound_refs:
-
source_info = f"{ref.source_username}:{ref.source_entry_id}"
-
links_table.add_row("โ In", source_info, ref.target_url)
-
console.print(links_table)
-
console.print(f"\n[bold]Summary:[/bold] {len(outbound_refs)} outbound, {len(inbound_refs)} inbound references")
-
def _display_content(content: str) -> None:
-
"""Display the full content of the entry."""
-
# Truncate very long content
-
display_content = content
-
if len(content) > 5000:
-
display_content = content[:5000] + "\n\n[... content truncated ...]"
-
title="[bold]Entry Content[/bold]",
-
def _display_entry_info_tsv(entry, username: str, ref_index: Optional[ReferenceIndex], show_content: bool) -> None:
-
"""Display entry information in TSV format."""
-
print(f"User\t{username}")
-
print(f"Atom ID\t{entry.id}")
-
print(f"Title\t{entry.title.replace(chr(9), ' ').replace(chr(10), ' ').replace(chr(13), ' ')}")
-
print(f"Link\t{entry.link}")
-
print(f"Published\t{entry.published.strftime('%Y-%m-%d %H:%M:%S UTC')}")
-
print(f"Updated\t{entry.updated.strftime('%Y-%m-%d %H:%M:%S UTC')}")
-
# Escape tabs and newlines in summary
-
summary = entry.summary.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
-
print(f"Summary\t{summary}")
-
print(f"Categories\t{', '.join(entry.categories)}")
-
if "name" in entry.author:
-
author_info.append(entry.author["name"])
-
if "email" in entry.author:
-
author_info.append(f"<{entry.author['email']}>")
-
print(f"Author\t{' '.join(author_info)}")
-
print(f"Content Type\t{entry.content_type}")
-
print(f"Rights\t{entry.rights}")
-
print(f"Source Feed\t{entry.source}")
-
# Add reference info if available
-
outbound_refs = ref_index.get_outbound_refs(username, entry.id)
-
inbound_refs = ref_index.get_inbound_refs(username, entry.id)
-
print(f"Outbound References\t{len(outbound_refs)}")
-
print(f"Inbound References\t{len(inbound_refs)}")
-
for ref in outbound_refs:
-
target_info = f"{ref.target_username}:{ref.target_entry_id}" if ref.target_username and ref.target_entry_id else "External"
-
print(f"Outbound Reference\t{target_info}\t{ref.target_url}")
-
for ref in inbound_refs:
-
source_info = f"{ref.source_username}:{ref.source_entry_id}"
-
print(f"Inbound Reference\t{source_info}\t{ref.target_url}")
-
# Show content if requested
-
if show_content and entry.content:
-
# Escape tabs and newlines in content
-
content = entry.content.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
-
print(f"Content\t{content}")
-
<file path="src/thicket/cli/commands/init.py">
-
"""Initialize command for thicket."""
-
from pathlib import Path
-
from typing import Optional
-
from pydantic import ValidationError
-
from ...core.git_store import GitStore
-
from ...models import ThicketConfig
-
from ..utils import print_error, print_success, save_config
-
git_store: Path = typer.Argument(..., help="Path to Git repository for storing feeds"),
-
cache_dir: Optional[Path] = typer.Option(
-
None, "--cache-dir", "-c", help="Cache directory (default: ~/.cache/thicket)"
-
config_file: Optional[Path] = typer.Option(
-
None, "--config", help="Configuration file path (default: thicket.yaml)"
-
force: bool = typer.Option(
-
False, "--force", "-f", help="Overwrite existing configuration"
-
"""Initialize a new thicket configuration and Git store."""
-
from platformdirs import user_cache_dir
-
cache_dir = Path(user_cache_dir("thicket"))
-
if config_file is None:
-
config_file = Path("thicket.yaml")
-
# Check if config already exists
-
if config_file.exists() and not force:
-
print_error(f"Configuration file already exists: {config_file}")
-
print_error("Use --force to overwrite")
-
# Create cache directory
-
cache_dir.mkdir(parents=True, exist_ok=True)
-
print_success(f"Initialized Git store at: {git_store}")
-
print_error(f"Failed to initialize Git store: {e}")
-
raise typer.Exit(1) from e
-
config = ThicketConfig(
-
save_config(config, config_file)
-
print_success(f"Created configuration file: {config_file}")
-
except ValidationError as e:
-
print_error(f"Invalid configuration: {e}")
-
raise typer.Exit(1) from e
-
print_error(f"Failed to create configuration: {e}")
-
raise typer.Exit(1) from e
-
print_success("Thicket initialized successfully!")
-
print_success(f"Git store: {git_store}")
-
print_success(f"Cache directory: {cache_dir}")
-
print_success(f"Configuration: {config_file}")
-
print_success("Run 'thicket add user' to add your first user and feed.")
-
<file path="src/thicket/cli/__init__.py">
-
"""CLI interface for thicket."""
-
<file path="src/thicket/core/__init__.py">
-
"""Core business logic for thicket."""
-
from .feed_parser import FeedParser
-
from .git_store import GitStore
-
__all__ = ["FeedParser", "GitStore"]
-
<file path="src/thicket/core/feed_parser.py">
-
"""Feed parsing and normalization with auto-discovery."""
-
from datetime import datetime
-
from typing import Optional
-
from urllib.parse import urlparse
-
from pydantic import HttpUrl, ValidationError
-
from ..models import AtomEntry, FeedMetadata
-
"""Parser for RSS/Atom feeds with normalization and auto-discovery."""
-
def __init__(self, user_agent: str = "thicket/0.1.0"):
-
"""Initialize the feed parser."""
-
self.user_agent = user_agent
-
"a", "abbr", "acronym", "b", "blockquote", "br", "code", "em",
-
"i", "li", "ol", "p", "pre", "strong", "ul", "h1", "h2", "h3",
-
"h4", "h5", "h6", "img", "div", "span",
-
self.allowed_attributes = {
-
"a": ["href", "title"],
-
"img": ["src", "alt", "title", "width", "height"],
-
"blockquote": ["cite"],
-
async def fetch_feed(self, url: HttpUrl) -> str:
-
"""Fetch feed content from URL."""
-
async with httpx.AsyncClient() as client:
-
response = await client.get(
-
headers={"User-Agent": self.user_agent},
-
response.raise_for_status()
-
def parse_feed(self, content: str, source_url: Optional[HttpUrl] = None) -> tuple[FeedMetadata, list[AtomEntry]]:
-
"""Parse feed content and return metadata and entries."""
-
parsed = feedparser.parse(content)
-
if parsed.bozo and parsed.bozo_exception:
-
# Try to continue with potentially malformed feed
-
# Extract feed metadata
-
feed_meta = self._extract_feed_metadata(parsed.feed)
-
# Extract and normalize entries
-
for entry in parsed.entries:
-
atom_entry = self._normalize_entry(entry, source_url)
-
entries.append(atom_entry)
-
# Log error but continue processing other entries
-
print(f"Error processing entry {getattr(entry, 'id', 'unknown')}: {e}")
-
return feed_meta, entries
-
def _extract_feed_metadata(self, feed: feedparser.FeedParserDict) -> FeedMetadata:
-
"""Extract metadata from feed for auto-discovery."""
-
# Parse author information
-
if hasattr(feed, 'author_detail'):
-
author_name = feed.author_detail.get('name')
-
author_email = feed.author_detail.get('email')
-
author_uri = feed.author_detail.get('href')
-
elif hasattr(feed, 'author'):
-
author_name = feed.author
-
# Parse managing editor for RSS feeds
-
if not author_email and hasattr(feed, 'managingEditor'):
-
author_email = feed.managingEditor
-
if hasattr(feed, 'link'):
-
feed_link = HttpUrl(feed.link)
-
except ValidationError:
-
# Parse image/icon/logo
-
if hasattr(feed, 'image'):
-
image_url = HttpUrl(feed.image.get('href', feed.image.get('url', '')))
-
except (ValidationError, AttributeError):
-
if hasattr(feed, 'icon'):
-
icon = HttpUrl(feed.icon)
-
except ValidationError:
-
if hasattr(feed, 'logo'):
-
logo = HttpUrl(feed.logo)
-
except ValidationError:
-
title=getattr(feed, 'title', None),
-
author_name=author_name,
-
author_email=author_email,
-
author_uri=HttpUrl(author_uri) if author_uri else None,
-
description=getattr(feed, 'description', None),
-
def _normalize_entry(self, entry: feedparser.FeedParserDict, source_url: Optional[HttpUrl] = None) -> AtomEntry:
-
"""Normalize an entry to Atom format."""
-
updated = self._parse_timestamp(entry.get('updated_parsed') or entry.get('published_parsed'))
-
published = self._parse_timestamp(entry.get('published_parsed'))
-
content = self._extract_content(entry)
-
content_type = self._extract_content_type(entry)
-
author = self._extract_author(entry)
-
# Parse categories/tags
-
if hasattr(entry, 'tags'):
-
categories = [tag.get('term', '') for tag in entry.tags if tag.get('term')]
-
# Sanitize HTML content
-
content = self._sanitize_html(content)
-
summary = entry.get('summary', '')
-
summary = self._sanitize_html(summary)
-
id=entry.get('id', entry.get('link', '')),
-
title=entry.get('title', ''),
-
link=HttpUrl(entry.get('link', '')),
-
summary=summary or None,
-
content=content or None,
-
content_type=content_type,
-
rights=entry.get('rights', None),
-
source=str(source_url) if source_url else None,
-
def _parse_timestamp(self, time_struct) -> datetime:
-
"""Parse feedparser time struct to datetime."""
-
return datetime(*time_struct[:6])
-
def _extract_content(self, entry: feedparser.FeedParserDict) -> Optional[str]:
-
"""Extract the best content from an entry."""
-
# Prefer content over summary
-
if hasattr(entry, 'content') and entry.content:
-
# Find the best content (prefer text/html, then text/plain)
-
for content_item in entry.content:
-
if content_item.get('type') in ['text/html', 'html']:
-
return content_item.get('value', '')
-
elif content_item.get('type') in ['text/plain', 'text']:
-
return content_item.get('value', '')
-
# Fallback to first content item
-
return entry.content[0].get('value', '')
-
return entry.get('summary', '')
-
def _extract_content_type(self, entry: feedparser.FeedParserDict) -> str:
-
"""Extract content type from entry."""
-
if hasattr(entry, 'content') and entry.content:
-
content_type = entry.content[0].get('type', 'html')
-
# Normalize content type
-
if content_type in ['text/html', 'html']:
-
elif content_type in ['text/plain', 'text']:
-
elif content_type == 'xhtml':
-
def _extract_author(self, entry: feedparser.FeedParserDict) -> Optional[dict]:
-
"""Extract author information from entry."""
-
if hasattr(entry, 'author_detail'):
-
'name': entry.author_detail.get('name'),
-
'email': entry.author_detail.get('email'),
-
'uri': entry.author_detail.get('href'),
-
elif hasattr(entry, 'author'):
-
author['name'] = entry.author
-
return author if author else None
-
def _sanitize_html(self, html: str) -> str:
-
"""Sanitize HTML content to prevent XSS."""
-
tags=self.allowed_tags,
-
attributes=self.allowed_attributes,
-
def sanitize_entry_id(self, entry_id: str) -> str:
-
"""Sanitize entry ID to be a safe filename."""
-
# Parse URL to get meaningful parts
-
parsed = urlparse(entry_id)
-
# Start with the path component
-
# Remove leading slash and replace problematic characters
-
safe_id = parsed.path.lstrip('/').replace('/', '_').replace('\\', '_')
-
# Use the entire ID as fallback
-
# Replace problematic characters
-
if char.isalnum() or char in '-_.':
-
safe_chars.append(char)
-
safe_id = ''.join(safe_chars)
-
# Ensure it's not too long (max 200 chars)
-
safe_id = safe_id[:200]
-
# Ensure it's not empty
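# Illustrative behaviour: "https://example.org/2024/05/hello-world/" has path
# "2024/05/hello-world/", which flattens to "2024_05_hello-world_" once slashes
# are replaced and unsafe characters are dropped.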
-
<file path="src/thicket/core/reference_parser.py">
-
"""Reference detection and parsing for blog entries."""
-
from typing import Optional
-
from urllib.parse import urlparse
-
from ..models import AtomEntry
-
"""Represents a reference from one blog entry to another."""
-
target_username: Optional[str] = None,
-
target_entry_id: Optional[str] = None,
-
self.source_entry_id = source_entry_id
-
self.source_username = source_username
-
self.target_url = target_url
-
self.target_username = target_username
-
self.target_entry_id = target_entry_id
-
def to_dict(self) -> dict:
-
"""Convert to dictionary for JSON serialization."""
-
"source_entry_id": self.source_entry_id,
-
"source_username": self.source_username,
-
"target_url": self.target_url,
-
# Only include optional fields if they are not None
-
if self.target_username is not None:
-
result["target_username"] = self.target_username
-
if self.target_entry_id is not None:
-
result["target_entry_id"] = self.target_entry_id
-
def from_dict(cls, data: dict) -> "BlogReference":
-
"""Create from dictionary."""
-
source_entry_id=data["source_entry_id"],
-
source_username=data["source_username"],
-
target_url=data["target_url"],
-
target_username=data.get("target_username"),
-
target_entry_id=data.get("target_entry_id"),
-
"""Index of blog-to-blog references for creating threaded views."""
-
self.references: list[BlogReference] = []
-
self.outbound_refs: dict[
-
str, list[BlogReference]
-
] = {} # entry_id -> outbound refs
-
self.inbound_refs: dict[
-
str, list[BlogReference]
-
] = {} # entry_id -> inbound refs
-
self.user_domains: dict[str, set[str]] = {} # username -> set of domains
-
def add_reference(self, ref: BlogReference) -> None:
-
"""Add a reference to the index."""
-
self.references.append(ref)
-
# Update outbound references
-
source_key = f"{ref.source_username}:{ref.source_entry_id}"
-
if source_key not in self.outbound_refs:
-
self.outbound_refs[source_key] = []
-
self.outbound_refs[source_key].append(ref)
-
# Update inbound references if we can identify the target
-
if ref.target_username and ref.target_entry_id:
-
target_key = f"{ref.target_username}:{ref.target_entry_id}"
-
if target_key not in self.inbound_refs:
-
self.inbound_refs[target_key] = []
-
self.inbound_refs[target_key].append(ref)
-
def get_outbound_refs(self, username: str, entry_id: str) -> list[BlogReference]:
-
"""Get all outbound references from an entry."""
-
key = f"{username}:{entry_id}"
-
return self.outbound_refs.get(key, [])
-
def get_inbound_refs(self, username: str, entry_id: str) -> list[BlogReference]:
-
"""Get all inbound references to an entry."""
-
key = f"{username}:{entry_id}"
-
return self.inbound_refs.get(key, [])
-
def get_thread_members(self, username: str, entry_id: str) -> set[tuple[str, str]]:
-
"""Get all entries that are part of the same thread."""
-
to_visit = [(username, entry_id)]
-
current_user, current_entry = to_visit.pop()
-
if (current_user, current_entry) in visited:
-
visited.add((current_user, current_entry))
-
thread_members.add((current_user, current_entry))
-
# Add outbound references
-
for ref in self.get_outbound_refs(current_user, current_entry):
-
if ref.target_username and ref.target_entry_id:
-
to_visit.append((ref.target_username, ref.target_entry_id))
-
# Add inbound references
-
for ref in self.get_inbound_refs(current_user, current_entry):
-
to_visit.append((ref.source_username, ref.source_entry_id))
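# The walk follows references in both directions, so the result is the full
# connected component of the reference graph around the starting entry, e.g.
# {("alice", "entry-1"), ("bob", "entry-7")} for a two-post exchange.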
-
def to_dict(self) -> dict:
-
"""Convert to dictionary for JSON serialization."""
-
"references": [ref.to_dict() for ref in self.references],
-
"user_domains": {k: list(v) for k, v in self.user_domains.items()},
-
def from_dict(cls, data: dict) -> "ReferenceIndex":
-
"""Create from dictionary."""
-
for ref_data in data.get("references", []):
-
ref = BlogReference.from_dict(ref_data)
-
index.add_reference(ref)
-
for username, domains in data.get("user_domains", {}).items():
-
index.user_domains[username] = set(domains)
-
"""Parses blog entries to detect references to other blogs."""
-
# Common blog platforms and patterns
-
r"https?://[^/]+\.(?:org|com|net|io|dev|me|co\.uk)/.*", # Common blog domains
-
r"https?://[^/]+\.github\.io/.*", # GitHub Pages
-
r"https?://[^/]+\.substack\.com/.*", # Substack
-
r"https?://medium\.com/.*", # Medium
-
r"https?://[^/]+\.wordpress\.com/.*", # WordPress.com
-
r"https?://[^/]+\.blogspot\.com/.*", # Blogger
-
# Compile regex patterns
-
self.link_pattern = re.compile(
-
r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL
-
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
-
def extract_links_from_html(self, html_content: str) -> list[tuple[str, str]]:
-
"""Extract all links from HTML content."""
-
# Extract links from <a> tags
-
for match in self.link_pattern.finditer(html_content):
-
r"<[^>]+>", "", match.group(2)
-
).strip() # Remove HTML tags from link text
-
links.append((url, text))
-
def is_blog_url(self, url: str) -> bool:
-
"""Check if a URL likely points to a blog post."""
-
for pattern in self.blog_patterns:
-
if re.match(pattern, url):
-
def _is_likely_blog_post_url(self, url: str) -> bool:
-
"""Check if a same-domain URL likely points to a blog post (not CSS, images, etc.)."""
-
parsed_url = urlparse(url)
-
path = parsed_url.path.lower()
-
# Skip obvious non-blog content
-
if any(path.endswith(ext) for ext in ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.pdf', '.xml', '.json']):
-
# Skip common non-blog paths
-
if any(segment in path for segment in ['/static/', '/assets/', '/css/', '/js/', '/images/', '/img/', '/media/', '/uploads/']):
-
# Skip fragment-only links (same page anchors)
-
if not path or path == '/':
-
# Look for positive indicators of blog posts
-
# Common blog post patterns: dates, slugs, post indicators
-
r'/\d{4}/', # Year in path
-
r'/\d{4}/\d{2}/', # Year/month in path
-
for pattern in blog_indicators:
-
if re.search(pattern, path):
-
# If it has a reasonable path depth and doesn't match exclusions, likely a blog post
-
path_segments = [seg for seg in path.split('/') if seg]
-
return len(path_segments) >= 1 # At least one meaningful path segment
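# Illustrative outcomes: "/2024/05/hello-world/" passes via the date pattern,
# "/static/style.css" is rejected by the extension and path filters, and a
# bare "/" is rejected as having no meaningful path.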
-
def resolve_target_user(
-
self, url: str, user_domains: dict[str, set[str]]
-
"""Try to resolve a URL to a known user based on domain mapping."""
-
parsed_url = urlparse(url)
-
domain = parsed_url.netloc.lower()
-
for username, domains in user_domains.items():
-
def extract_references(
-
self, entry: AtomEntry, username: str, user_domains: dict[str, set[str]]
-
) -> list[BlogReference]:
-
"""Extract all blog references from an entry."""
-
# Combine all text content for analysis
-
content_to_search.append(entry.content)
-
content_to_search.append(entry.summary)
-
for content in content_to_search:
-
links = self.extract_links_from_html(content)
-
for url, _link_text in links:
-
urlparse(str(entry.link)).netloc.lower() if entry.link else ""
-
link_domain = urlparse(url).netloc.lower()
-
# Check if this looks like a blog URL
-
if not self.is_blog_url(url):
-
# For same-domain links, apply additional filtering to avoid non-blog content
-
if link_domain == entry_domain:
-
# Only include same-domain links that look like blog posts
-
if not self._is_likely_blog_post_url(url):
-
# Try to resolve to a known user
-
if link_domain == entry_domain:
-
# Same domain - target user is the same as source user
-
target_username: Optional[str] = username
-
# Different domain - try to resolve
-
target_username = self.resolve_target_user(url, user_domains)
-
source_entry_id=entry.id,
-
source_username=username,
-
target_username=target_username,
-
target_entry_id=None, # Will be resolved later if possible
-
def build_user_domain_mapping(self, git_store: "GitStore") -> dict[str, set[str]]:
-
"""Build mapping of usernames to their known domains."""
-
index = git_store._load_index()
-
for username, user_metadata in index.users.items():
-
# Add domains from feeds
-
for feed_url in user_metadata.feeds:
-
domain = urlparse(feed_url).netloc.lower()
-
# Add domain from homepage
-
if user_metadata.homepage:
-
domain = urlparse(str(user_metadata.homepage)).netloc.lower()
-
user_domains[username] = domains
-
def _build_url_to_entry_mapping(self, git_store: "GitStore") -> dict[str, str]:
-
"""Build a comprehensive mapping from URLs to entry IDs using git store data.
-
This creates a many-to-one mapping that handles:
-
- Entry link URLs -> Entry IDs
-
- URL variations (with/without www, http/https)
-
- Multiple URLs pointing to the same entry
-
url_to_entry: dict[str, str] = {}
-
# Load index to get all users
-
index = git_store._load_index()
-
for username in index.users.keys():
-
entries = git_store.list_entries(username)
-
link_url = str(entry.link)
-
# Map the canonical link URL
-
url_to_entry[link_url] = entry_id
-
# Handle common URL variations
-
parsed = urlparse(link_url)
-
if parsed.netloc and parsed.path:
-
# Add version without www
-
if parsed.netloc.startswith('www.'):
-
no_www_url = f"{parsed.scheme}://{parsed.netloc[4:]}{parsed.path}"
-
no_www_url += f"?{parsed.query}"
-
no_www_url += f"#{parsed.fragment}"
-
url_to_entry[no_www_url] = entry_id
-
# Add version with www if not present
-
elif not parsed.netloc.startswith('www.'):
-
www_url = f"{parsed.scheme}://www.{parsed.netloc}{parsed.path}"
-
www_url += f"?{parsed.query}"
-
www_url += f"#{parsed.fragment}"
-
url_to_entry[www_url] = entry_id
-
# Add http/https variations
-
if parsed.scheme == 'https':
-
http_url = link_url.replace('https://', 'http://', 1)
-
url_to_entry[http_url] = entry_id
-
elif parsed.scheme == 'http':
-
https_url = link_url.replace('http://', 'https://', 1)
-
url_to_entry[https_url] = entry_id
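# Illustrative result: an entry linked at "https://www.example.org/post" is
# also reachable via "https://example.org/post" and the "http://" variants,
# all mapping to the same entry ID.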
-
def _normalize_url(self, url: str) -> str:
-
"""Normalize URL for consistent matching.
-
Handles common variations like trailing slashes, fragments, etc.
-
# Remove trailing slash from path
-
path = parsed.path.rstrip('/') if parsed.path != '/' else parsed.path
-
# Reconstruct without fragment for consistent matching
-
normalized = f"{parsed.scheme}://{parsed.netloc}{path}"
-
normalized += f"?{parsed.query}"
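# e.g. "https://example.org/post/#comments" normalizes to
# "https://example.org/post" (fragment dropped, trailing slash stripped),
# while a query string such as "?page=2" is kept.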
-
def resolve_target_entry_ids(
-
self, references: list[BlogReference], git_store: "GitStore"
-
) -> list[BlogReference]:
-
"""Resolve target_entry_id for references using comprehensive URL mapping."""
-
# Build comprehensive URL to entry ID mapping
-
url_to_entry = self._build_url_to_entry_mapping(git_store)
-
# If we already have a target_entry_id, keep the reference as-is
-
if ref.target_entry_id is not None:
-
resolved_refs.append(ref)
-
# If we don't have a target_username, we can't resolve it
-
if ref.target_username is None:
-
resolved_refs.append(ref)
-
# Try to resolve using URL mapping
-
resolved_entry_id = None
-
# First, try exact match
-
if ref.target_url in url_to_entry:
-
resolved_entry_id = url_to_entry[ref.target_url]
-
# Try normalized URL matching
-
normalized_target = self._normalize_url(ref.target_url)
-
if normalized_target in url_to_entry:
-
resolved_entry_id = url_to_entry[normalized_target]
-
for mapped_url, entry_id in url_to_entry.items():
-
if self._normalize_url(mapped_url) == normalized_target:
-
resolved_entry_id = entry_id
-
# Verify the resolved entry belongs to the target username
-
# Double-check by loading the actual entry
-
entries = git_store.list_entries(ref.target_username)
-
entry_found = any(entry.id == resolved_entry_id for entry in entries)
-
resolved_entry_id = None
-
# Create a new reference with the resolved target_entry_id
-
resolved_ref = BlogReference(
-
source_entry_id=ref.source_entry_id,
-
source_username=ref.source_username,
-
target_url=ref.target_url,
-
target_username=ref.target_username,
-
target_entry_id=resolved_entry_id,
-
resolved_refs.append(resolved_ref)
-
<file path="src/thicket/models/__init__.py">
-
"""Data models for thicket."""
-
from .config import ThicketConfig, UserConfig
-
from .feed import AtomEntry, DuplicateMap, FeedMetadata
-
from .user import GitStoreIndex, UserMetadata
-
<file path="src/thicket/models/feed.py">
-
"""Feed and entry models for thicket."""
-
from datetime import datetime
-
from typing import TYPE_CHECKING, Optional
-
from pydantic import BaseModel, ConfigDict, EmailStr, HttpUrl
-
from .config import UserConfig
-
class AtomEntry(BaseModel):
-
"""Represents an Atom feed entry stored in the Git repository."""
-
model_config = ConfigDict(
-
json_encoders={datetime: lambda v: v.isoformat()},
-
str_strip_whitespace=True,
-
id: str # Original Atom ID
-
published: Optional[datetime] = None
-
summary: Optional[str] = None
-
content: Optional[str] = None # Full body content from Atom entry
-
content_type: Optional[str] = "html" # text, html, xhtml
-
author: Optional[dict] = None
-
categories: list[str] = []
-
rights: Optional[str] = None # Copyright info
-
source: Optional[str] = None # Source feed URL
-
class FeedMetadata(BaseModel):
-
"""Metadata extracted from a feed for auto-discovery."""
-
title: Optional[str] = None
-
author_name: Optional[str] = None
-
author_email: Optional[EmailStr] = None
-
author_uri: Optional[HttpUrl] = None
-
link: Optional[HttpUrl] = None
-
logo: Optional[HttpUrl] = None
-
icon: Optional[HttpUrl] = None
-
image_url: Optional[HttpUrl] = None
-
description: Optional[str] = None
-
def to_user_config(self, username: str, feed_url: HttpUrl) -> "UserConfig":
-
"""Convert discovered metadata to UserConfig with fallbacks."""
-
from .config import UserConfig
-
display_name=self.author_name or self.title,
-
email=self.author_email,
-
homepage=self.author_uri or self.link,
-
icon=self.logo or self.icon or self.image_url,
-
class DuplicateMap(BaseModel):
-
"""Maps duplicate entry IDs to canonical entry IDs."""
-
duplicates: dict[str, str] = {} # duplicate_id -> canonical_id
-
comment: str = "Entry IDs that map to the same canonical content"
-
def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
-
"""Add a duplicate mapping."""
-
self.duplicates[duplicate_id] = canonical_id
-
def remove_duplicate(self, duplicate_id: str) -> bool:
-
"""Remove a duplicate mapping. Returns True if existed."""
-
return self.duplicates.pop(duplicate_id, None) is not None
-
def get_canonical(self, entry_id: str) -> str:
-
"""Get canonical ID for an entry (returns original if not duplicate)."""
-
return self.duplicates.get(entry_id, entry_id)
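# e.g. with {"http://old.example/post": "https://example.org/post"} recorded,
# get_canonical("http://old.example/post") returns the canonical ID, while
# IDs with no mapping are returned unchanged.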
-
def is_duplicate(self, entry_id: str) -> bool:
-
"""Check if entry ID is marked as duplicate."""
-
return entry_id in self.duplicates
-
def get_duplicates_for_canonical(self, canonical_id: str) -> list[str]:
-
"""Get all duplicate IDs that map to a canonical ID."""
-
for duplicate_id, canonical in self.duplicates.items()
-
if canonical == canonical_id
-
<file path="src/thicket/models/user.py">
-
"""User metadata models for thicket."""
-
from datetime import datetime
-
from typing import Optional
-
from pydantic import BaseModel, ConfigDict
-
class UserMetadata(BaseModel):
-
"""Metadata about a user stored in the Git repository."""
-
model_config = ConfigDict(
-
json_encoders={datetime: lambda v: v.isoformat()},
-
str_strip_whitespace=True,
-
display_name: Optional[str] = None
-
email: Optional[str] = None
-
homepage: Optional[str] = None
-
icon: Optional[str] = None
-
directory: str # Directory name in Git store
-
def update_timestamp(self) -> None:
-
"""Update the last_updated timestamp to now."""
-
self.last_updated = datetime.now()
-
def increment_entry_count(self, count: int = 1) -> None:
-
"""Increment the entry count by the given amount."""
-
self.entry_count += count
-
self.update_timestamp()
-
class GitStoreIndex(BaseModel):
-
"""Index of all users and their directories in the Git store."""
-
model_config = ConfigDict(
-
json_encoders={datetime: lambda v: v.isoformat()}
-
users: dict[str, UserMetadata] = {} # username -> UserMetadata
-
def add_user(self, user_metadata: UserMetadata) -> None:
-
"""Add or update a user in the index."""
-
self.users[user_metadata.username] = user_metadata
-
self.last_updated = datetime.now()
-
def remove_user(self, username: str) -> bool:
-
"""Remove a user from the index. Returns True if user existed."""
-
if username in self.users:
-
del self.users[username]
-
self.last_updated = datetime.now()
-
def get_user(self, username: str) -> Optional[UserMetadata]:
-
"""Get user metadata by username."""
-
return self.users.get(username)
-
def update_entry_count(self, username: str, count: int) -> None:
-
"""Update entry count for a user and total."""
-
user = self.get_user(username)
-
user.increment_entry_count(count)
-
self.total_entries += count
-
self.last_updated = datetime.now()
-
def recalculate_totals(self) -> None:
-
"""Recalculate total entries from all users."""
-
self.total_entries = sum(user.entry_count for user in self.users.values())
-
self.last_updated = datetime.now()
-
<file path="src/thicket/utils/__init__.py">
-
"""Utility modules for thicket."""
-
# This module will contain shared utilities
-
# For now, it's empty but can be expanded with common functions
-
<file path="src/thicket/__init__.py">
-
"""Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories."""
-
__email__ = "thicket@example.com"
-
<file path="src/thicket/__main__.py">
-
"""Entry point for running thicket as a module."""
-
from .cli.main import app
-
if __name__ == "__main__":
-
<file path=".gitignore">
-
# Byte-compiled / optimized / DLL files
-
# Distribution / packaging
-
# Usually these files are written by a python script from a template
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
-
pip-delete-this-directory.txt
-
# Unit test / coverage reports
-
# For a library or package, you might want to ignore these files since the code is
-
# intended to run in multiple environments; otherwise, check them in:
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
-
# install all needed dependencies.
-
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
-
# commonly ignored for libraries.
-
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
-
# commonly ignored for libraries.
-
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
-
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
-
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
-
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
-
# in the .venv directory. It is recommended not to include this directory in version control.
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-
# SageMath parsed files
-
# Spyder project settings
-
# Rope project settings
-
# pytype static type analyzer
-
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-
# and can be added to the global gitignore or merged into this file. For a more nuclear
-
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-
# Abstra is an AI-powered process automation framework.
-
# Ignore directories containing user credentials, local state, and settings.
-
# Learn more at https://abstra.io/docs
-
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
-
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
-
# and can be added to the global gitignore or merged into this file. However, if you prefer,
-
# you could uncomment the following to ignore the entire vscode folder
-
# PyPI configuration file
-
.streamlit/secrets.toml
-
<file path="CLAUDE.md">
-
My goal is to build a CLI tool called thicket in Python that maintains a Git repository within which Atom feeds can be persisted, including their contents.
-
# Python Environment and Package Management
-
This project uses `uv` for Python package management and virtual environment handling.
-
ALWAYS use `uv run` to execute Python commands:
-
- Run the CLI: `uv run -m thicket`
-
- Run tests: `uv run pytest`
-
- Type checking: `uv run mypy src/`
-
- Linting: `uv run ruff check src/`
-
- Format code: `uv run ruff format src/`
-
- Compile check: `uv run python -m py_compile <file>`
-
- Add dependencies: `uv add <package>`
-
- Add dev dependencies: `uv add --dev <package>`
-
- Install dependencies: `uv sync`
-
- Update dependencies: `uv lock --upgrade`
-
The configuration file specifies:
-
- the location of a git store
-
- a list of usernames and target Atom/RSS feed(s) and optional metadata about the username such as their email, homepage, icon and display name
-
- a cache directory to store temporary results such as feed downloads and their last modification date that speed up operations across runs of the tool
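As a sketch, the same settings can be constructed programmatically with the pydantic models under `src/thicket/models` (values are illustrative; this assumes `UserConfig` also carries `username` and `feeds` fields, which the packed excerpt elides):

```python
from pathlib import Path

from thicket.models import ThicketConfig, UserConfig

# Hypothetical example configuration; field names follow the models above.
config = ThicketConfig(
    git_store=Path("feeds-repo"),
    cache_dir=Path.home() / ".cache" / "thicket",
    users=[
        UserConfig(
            username="alice",
            feeds=["https://alice.example.org/atom.xml"],
            display_name="Alice",
            homepage="https://alice.example.org",
        )
    ],
)
```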
-
The Git data store should:
-
- have a subdirectory per user
-
- within that directory, an entry per Atom entry, indexed by the Atom id for that entry. The id should be sanitised consistently to be a safe filename. RSS feeds should be normalized to Atom before being stored.
-
- within each entry file, the metadata of the Atom entry converted into a JSON format that preserves as much metadata as possible.
-
- have a JSON file in the Git repository that indexes the users, their associated directories within the Git repository, and any other metadata about that user from the config file
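For illustration, a store with one user might be laid out like this (hypothetical file names):

```
feeds-repo/
  index.json          # users, their directories, and per-user metadata
  duplicates.json     # duplicate entry IDs mapped to canonical IDs
  alice/
    tag_alice.example.org_2024_42.json   # one sanitised Atom entry as JSON
```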
-
The CLI should be modern and use cool progress bars and other niceties from ecosystem libraries.
-
The intention behind the Git repository is that it can be queried by other websites in order to build a weblog structure of comments that link to other blogs.
-
<file path="pyproject.toml">
-
requires = ["hatchling"]
-
build-backend = "hatchling.build"
-
description = "A CLI tool for persisting Atom/RSS feeds in Git repositories"
-
requires-python = ">=3.9"
-
{name = "thicket", email = "thicket@example.com"},
-
"Development Status :: 3 - Alpha",
-
"Intended Audience :: Developers",
-
"License :: OSI Approved :: MIT License",
-
"Operating System :: OS Independent",
-
"Programming Language :: Python :: 3",
-
"Programming Language :: Python :: 3.9",
-
"Programming Language :: Python :: 3.10",
-
"Programming Language :: Python :: 3.11",
-
"Programming Language :: Python :: 3.12",
-
"Programming Language :: Python :: 3.13",
-
"Topic :: Internet :: WWW/HTTP :: Dynamic Content :: News/Diary",
-
"Topic :: Software Development :: Version Control :: Git",
-
"Topic :: Text Processing :: Markup :: XML",
-
"pydantic-settings>=2.10.0",
-
[project.optional-dependencies]
-
"pytest-asyncio>=0.24.0",
-
Homepage = "https://github.com/example/thicket"
-
Documentation = "https://github.com/example/thicket"
-
Repository = "https://github.com/example/thicket"
-
"Bug Tracker" = "https://github.com/example/thicket/issues"
-
thicket = "thicket.cli.main:app"
-
path = "src/thicket/__init__.py"
-
[tool.hatch.build.targets.wheel]
-
packages = ["src/thicket"]
-
target-version = ['py39']
-
target-version = "py39"
-
"E", # pycodestyle errors
-
"W", # pycodestyle warnings
-
"C4", # flake8-comprehensions
-
"E501", # line too long, handled by black
-
"B008", # do not perform function calls in argument defaults
-
[tool.ruff.lint.per-file-ignores]
-
"__init__.py" = ["F401"]
-
check_untyped_defs = true
-
disallow_any_generics = true
-
disallow_incomplete_defs = true
-
disallow_untyped_defs = true
-
no_implicit_optional = true
-
warn_redundant_casts = true
-
warn_unused_ignores = true
-
[[tool.mypy.overrides]]
-
ignore_missing_imports = true
-
[tool.pytest.ini_options]
-
python_files = ["test_*.py"]
-
python_classes = ["Test*"]
-
python_functions = ["test_*"]
-
"--cov-report=term-missing",
-
"ignore::DeprecationWarning",
-
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
-
"integration: marks tests as integration tests",
-
"raise AssertionError",
-
"raise NotImplementedError",
-
"if __name__ == .__main__.:",
-
"class .*\\bProtocol\\):",
-
"@(abc\\.)?abstractmethod",
-
<file path="src/thicket/cli/commands/__init__.py">
-
"""CLI commands for thicket."""
-
# Import all commands to register them with the main app
-
from . import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync
-
__all__ = ["add", "duplicates", "generate", "index_cmd", "info_cmd", "init", "links_cmd", "list_cmd", "sync"]
-
<file path="src/thicket/cli/commands/add.py">
-
"""Add command for thicket."""
-
from pathlib import Path
-
from typing import Optional
-
from pydantic import HttpUrl, ValidationError
-
from ...core.feed_parser import FeedParser
-
from ...core.git_store import GitStore
-
subcommand: str = typer.Argument(..., help="Subcommand: 'user' or 'feed'"),
-
username: str = typer.Argument(..., help="Username"),
-
feed_url: Optional[str] = typer.Argument(None, help="Feed URL (required for 'user' command)"),
-
email: Optional[str] = typer.Option(None, "--email", "-e", help="User email"),
-
homepage: Optional[str] = typer.Option(None, "--homepage", "-h", help="User homepage"),
-
icon: Optional[str] = typer.Option(None, "--icon", "-i", help="User icon URL"),
-
display_name: Optional[str] = typer.Option(None, "--display-name", "-d", help="User display name"),
-
config_file: Optional[Path] = typer.Option(
-
Path("thicket.yaml"), "--config", help="Configuration file path"
-
auto_discover: bool = typer.Option(
-
True, "--auto-discover/--no-auto-discover", help="Auto-discover user metadata from feed"
-
"""Add a user or feed to thicket."""
-
if subcommand == "user":
-
add_user(username, feed_url, email, homepage, icon, display_name, config_file, auto_discover)
-
elif subcommand == "feed":
-
add_feed(username, feed_url, config_file)
-
print_error(f"Unknown subcommand: {subcommand}")
-
print_error("Use 'user' or 'feed'")
-
feed_url: Optional[str],
-
homepage: Optional[str],
-
display_name: Optional[str],
-
"""Add a new user with feed."""
-
print_error("Feed URL is required when adding a user")
-
validated_feed_url = HttpUrl(feed_url)
-
except ValidationError:
-
print_error(f"Invalid feed URL: {feed_url}")
-
raise typer.Exit(1) from None
-
config = load_config(config_file)
-
git_store = GitStore(config.git_store)
-
# Check if user already exists
-
existing_user = git_store.get_user(username)
-
print_error(f"User '{username}' already exists")
-
print_error("Use 'thicket add feed' to add additional feeds")
-
# Auto-discover metadata if enabled
-
discovered_metadata = None
-
discovered_metadata = asyncio.run(discover_feed_metadata(validated_feed_url))
-
# Prepare user data with manual overrides taking precedence
-
user_display_name = display_name or (discovered_metadata.author_name or discovered_metadata.title if discovered_metadata else None)
-
user_email = email or (discovered_metadata.author_email if discovered_metadata else None)
-
user_homepage = homepage or (str(discovered_metadata.author_uri or discovered_metadata.link) if discovered_metadata else None)
-
user_icon = icon or (str(discovered_metadata.logo or discovered_metadata.icon or discovered_metadata.image_url) if discovered_metadata else None)
-
# Add user to Git store
-
display_name=user_display_name,
-
homepage=user_homepage,
-
feeds=[str(validated_feed_url)],
-
git_store.commit_changes(f"Add user: {username}")
-
print_success(f"Added user '{username}' with feed: {feed_url}")
-
if discovered_metadata and auto_discover:
-
print_info("Auto-discovered metadata:")
-
print_info(f" Display name: {user_display_name}")
-
print_info(f" Email: {user_email}")
-
print_info(f" Homepage: {user_homepage}")
-
print_info(f" Icon: {user_icon}")
-
def add_feed(username: str, feed_url: Optional[str], config_file: Path) -> None:
-
"""Add a feed to an existing user."""
-
print_error("Feed URL is required")
-
validated_feed_url = HttpUrl(feed_url)
-
except ValidationError:
-
print_error(f"Invalid feed URL: {feed_url}")
-
raise typer.Exit(1) from None
-
config = load_config(config_file)
-
git_store = GitStore(config.git_store)
-
user = git_store.get_user(username)
-
print_error(f"User '{username}' not found")
-
print_error("Use 'thicket add user' to add a new user")
-
# Check if feed already exists
-
if str(validated_feed_url) in user.feeds:
-
print_error(f"Feed already exists for user '{username}': {feed_url}")
-
updated_feeds = user.feeds + [str(validated_feed_url)]
-
if git_store.update_user(username, feeds=updated_feeds):
-
git_store.commit_changes(f"Add feed to user {username}: {feed_url}")
-
print_success(f"Added feed to user '{username}': {feed_url}")
-
print_error(f"Failed to add feed to user '{username}'")
-
async def discover_feed_metadata(feed_url: HttpUrl):
-
"""Discover metadata from a feed URL."""
-
with create_progress() as progress:
-
task = progress.add_task("Discovering feed metadata...", total=None)
-
content = await parser.fetch_feed(feed_url)
-
metadata, _ = parser.parse_feed(content, feed_url)
-
progress.update(task, completed=True)
-
print_error(f"Failed to discover feed metadata: {e}")
-
<file path="src/thicket/cli/commands/duplicates.py">
-
"""Duplicates command for thicket."""
-
from pathlib import Path
-
from typing import Optional
-
from rich.table import Table
-
from ...core.git_store import GitStore
-
@app.command("duplicates")
-
def duplicates_command(
-
action: str = typer.Argument(..., help="Action: 'list', 'add', 'remove'"),
-
duplicate_id: Optional[str] = typer.Argument(None, help="Duplicate entry ID"),
-
canonical_id: Optional[str] = typer.Argument(None, help="Canonical entry ID"),
-
config_file: Optional[Path] = typer.Option(
-
Path("thicket.yaml"), "--config", help="Configuration file path"
-
"""Manage duplicate entry mappings."""
-
config = load_config(config_file)
-
git_store = GitStore(config.git_store)
-
list_duplicates(git_store)
-
add_duplicate(git_store, duplicate_id, canonical_id)
-
elif action == "remove":
-
remove_duplicate(git_store, duplicate_id)
-
print_error(f"Unknown action: {action}")
-
print_error("Use 'list', 'add', or 'remove'")
-
def list_duplicates(git_store: GitStore) -> None:
-
"""List all duplicate mappings."""
-
duplicates = git_store.get_duplicates()
-
if not duplicates.duplicates:
-
print("No duplicate mappings found")
-
print_info("No duplicate mappings found")
-
print("Duplicate ID\tCanonical ID")
-
for duplicate_id, canonical_id in duplicates.duplicates.items():
-
print(f"{duplicate_id}\t{canonical_id}")
-
print(f"Total duplicates: {len(duplicates.duplicates)}")
-
table = Table(title="Duplicate Entry Mappings")
-
table.add_column("Duplicate ID", style="red")
-
table.add_column("Canonical ID", style="green")
-
for duplicate_id, canonical_id in duplicates.duplicates.items():
-
table.add_row(duplicate_id, canonical_id)
-
print_info(f"Total duplicates: {len(duplicates.duplicates)}")
-
def add_duplicate(git_store: GitStore, duplicate_id: Optional[str], canonical_id: Optional[str]) -> None:
-
"""Add a duplicate mapping."""
-
print_error("Duplicate ID is required")
-
print_error("Canonical ID is required")
-
# Check if duplicate_id already exists
-
duplicates = git_store.get_duplicates()
-
if duplicates.is_duplicate(duplicate_id):
-
existing_canonical = duplicates.get_canonical(duplicate_id)
-
print_error(f"Duplicate ID already mapped to: {existing_canonical}")
-
print_error("Use 'remove' first to change the mapping")
-
# Check if we're trying to make a canonical ID point to itself
-
if duplicate_id == canonical_id:
-
print_error("Duplicate ID cannot be the same as canonical ID")
-
git_store.add_duplicate(duplicate_id, canonical_id)
-
git_store.commit_changes(f"Add duplicate mapping: {duplicate_id} -> {canonical_id}")
-
print_success(f"Added duplicate mapping: {duplicate_id} -> {canonical_id}")
-
def remove_duplicate(git_store: GitStore, duplicate_id: Optional[str]) -> None:
-
"""Remove a duplicate mapping."""
-
print_error("Duplicate ID is required")
-
# Check if mapping exists
-
duplicates = git_store.get_duplicates()
-
if not duplicates.is_duplicate(duplicate_id):
-
print_error(f"No duplicate mapping found for: {duplicate_id}")
-
canonical_id = duplicates.get_canonical(duplicate_id)
-
if git_store.remove_duplicate(duplicate_id):
-
git_store.commit_changes(f"Remove duplicate mapping: {duplicate_id} -> {canonical_id}")
-
print_success(f"Removed duplicate mapping: {duplicate_id} -> {canonical_id}")
-
print_error(f"Failed to remove duplicate mapping: {duplicate_id}")
-
<file path="src/thicket/cli/commands/sync.py">
-
"""Sync command for thicket."""
-
from pathlib import Path
-
from typing import Optional
-
from rich.progress import track
-
from ...core.feed_parser import FeedParser
-
from ...core.git_store import GitStore
-
all_users: bool = typer.Option(
-
False, "--all", "-a", help="Sync all users and feeds"
-
user: Optional[str] = typer.Option(
-
None, "--user", "-u", help="Sync specific user only"
-
config_file: Optional[Path] = typer.Option(
-
Path("thicket.yaml"), "--config", help="Configuration file path"
-
dry_run: bool = typer.Option(
-
False, "--dry-run", help="Show what would be synced without making changes"
-
"""Sync feeds and store entries in Git repository."""
-
config = load_config(config_file)
-
git_store = GitStore(config.git_store)
-
# Determine which users to sync from git repository
-
index = git_store._load_index()
-
users_to_sync = list(index.users.values())
-
user_metadata = git_store.get_user(user)
-
print_error(f"User '{user}' not found in git repository")
-
users_to_sync = [user_metadata]
-
print_error("Specify --all to sync all users or --user to sync a specific user")
-
print_info("No users configured to sync")
-
total_updated_entries = 0
-
for user_metadata in users_to_sync:
-
print_info(f"Syncing user: {user_metadata.username}")
-
user_updated_entries = 0
-
# Sync each feed for the user
-
for feed_url in track(user_metadata.feeds, description=f"Syncing {user_metadata.username}'s feeds"):
-
new_entries, updated_entries = asyncio.run(
-
sync_feed(git_store, user_metadata.username, feed_url, dry_run)
-
user_new_entries += new_entries
-
user_updated_entries += updated_entries
-
print_error(f"Failed to sync feed {feed_url}: {e}")
-
print_info(f"User {user_metadata.username}: {user_new_entries} new, {user_updated_entries} updated")
-
total_new_entries += user_new_entries
-
total_updated_entries += user_updated_entries
-
# Commit changes if not dry run
-
if not dry_run and (total_new_entries > 0 or total_updated_entries > 0):
-
commit_message = f"Sync feeds: {total_new_entries} new entries, {total_updated_entries} updated"
-
git_store.commit_changes(commit_message)
-
print_success(f"Committed changes: {commit_message}")
-
print_info(f"Dry run complete: would sync {total_new_entries} new entries, {total_updated_entries} updated")
-
print_success(f"Sync complete: {total_new_entries} new entries, {total_updated_entries} updated")
-
async def sync_feed(git_store: GitStore, username: str, feed_url, dry_run: bool) -> tuple[int, int]:
-
"""Sync a single feed for a user."""
-
content = await parser.fetch_feed(feed_url)
-
metadata, entries = parser.parse_feed(content, feed_url)
-
# Check if entry already exists
-
existing_entry = git_store.get_entry(username, entry.id)
-
# Check if entry has been updated
-
if existing_entry.updated != entry.updated:
-
git_store.store_entry(username, entry)
-
git_store.store_entry(username, entry)
-
print_error(f"Failed to process entry {entry.id}: {e}")
-
return new_entries, updated_entries
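# An entry counts as "updated" when its `updated` timestamp differs from the
# stored copy; entries with an unchanged timestamp are skipped.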
-
print_error(f"Failed to sync feed {feed_url}: {e}")
-
<file path="src/thicket/models/config.py">
-
"""Configuration models for thicket."""
-
from pathlib import Path
-
from typing import Optional
-
from pydantic import BaseModel, EmailStr, HttpUrl
-
from pydantic_settings import BaseSettings, SettingsConfigDict
-
class UserConfig(BaseModel):
-
"""Configuration for a single user and their feeds."""
-
email: Optional[EmailStr] = None
-
homepage: Optional[HttpUrl] = None
-
icon: Optional[HttpUrl] = None
-
display_name: Optional[str] = None
-
class ThicketConfig(BaseSettings):
-
"""Main configuration for thicket."""
-
model_config = SettingsConfigDict(
-
yaml_file="thicket.yaml",
-
users: list[UserConfig] = []
-
<file path="src/thicket/cli/commands/links_cmd.py">
-
"""CLI command for extracting and categorizing all outbound links from blog entries."""
-
from pathlib import Path
-
from typing import Dict, List, Optional, Set
-
from urllib.parse import urljoin, urlparse
-
from rich.console import Console
-
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
-
from rich.table import Table
-
from ...core.git_store import GitStore
-
from ..utils import load_config, get_tsv_mode
-
"""Represents a link found in a blog entry."""
-
def __init__(self, url: str, entry_id: str, username: str):
-
self.entry_id = entry_id
-
self.username = username
-
def to_dict(self) -> dict:
-
"""Convert to dictionary for JSON serialization."""
-
"entry_id": self.entry_id,
-
"username": self.username
-
def from_dict(cls, data: dict) -> "LinkData":
-
"""Create from dictionary."""
-
entry_id=data["entry_id"],
-
username=data["username"]
-
"""Categorizes links as internal, user, or unknown."""
-
def __init__(self, user_domains: Dict[str, Set[str]]):
-
self.user_domains = user_domains
-
# Create reverse mapping of domain -> username
-
self.domain_to_user = {}
-
for username, domains in user_domains.items():
-
self.domain_to_user[domain] = username
-
def categorize_url(self, url: str, source_username: str) -> tuple[str, Optional[str]]:
-
Categorize a URL as 'internal', 'user', or 'unknown'.
-
Returns (category, target_username).
-
domain = parsed.netloc.lower()
-
# Check if it's a link to the same user's domain (internal)
-
if domain in self.user_domains.get(source_username, set()):
-
return "internal", source_username
-
# Check if it's a link to another user's domain
-
if domain in self.domain_to_user:
-
return "user", self.domain_to_user[domain]
-
# Everything else is unknown
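# Illustrative outcomes: a link to the author's own domain categorizes as
# ("internal", source_username); a link to another tracked user's domain as
# ("user", that_username); anything else as ("unknown", None).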
-
"""Extracts and resolves links from blog entries."""
-
# Pattern for extracting links from HTML
-
self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
-
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
-
def extract_links_from_html(self, html_content: str, base_url: str) -> List[tuple[str, str]]:
-
"""Extract all links from HTML content and resolve them against base URL."""
-
# Extract links from <a> tags
-
for match in self.link_pattern.finditer(html_content):
-
text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags from link text
-
# Resolve relative URLs against base URL
-
resolved_url = urljoin(base_url, url)
-
links.append((resolved_url, text))
-
def extract_links_from_entry(self, entry, username: str, base_url: str) -> List[LinkData]:
-
"""Extract all links from a blog entry."""
-
# Combine all text content for analysis
-
content_to_search.append(entry.content)
-
content_to_search.append(entry.summary)
-
for content in content_to_search:
-
extracted_links = self.extract_links_from_html(content, base_url)
-
for url, link_text in extracted_links:
-
if not url or url.startswith('#'):
-
links.append(link_data)
-
config_file: Optional[Path] = typer.Option(
-
help="Path to configuration file",
-
output_file: Optional[Path] = typer.Option(
-
help="Path to output unified links file (default: links.json in git store)",
-
verbose: bool = typer.Option(
-
help="Show detailed progress information",
-
"""Extract and categorize all outbound links from blog entries.
-
This command analyzes all blog entries to extract outbound links,
-
resolve them properly with respect to the feed's base URL, and
-
categorize them as internal, user, or unknown links.
-
Creates a unified links.json file containing all link data.
-
config = load_config(config_file)
-
git_store = GitStore(config.git_store)
-
# Build user domain mapping
-
console.print("Building user domain mapping...")
-
index = git_store._load_index()
-
for username, user_metadata in index.users.items():
-
# Add domains from feeds
-
for feed_url in user_metadata.feeds:
-
domain = urlparse(feed_url).netloc.lower()
-
# Add domain from homepage
-
if user_metadata.homepage:
-
domain = urlparse(str(user_metadata.homepage)).netloc.lower()
-
user_domains[username] = domains
-
console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")
-
# Initialize components
-
link_extractor = LinkExtractor()
-
categorizer = LinkCategorizer(user_domains)
-
users = list(index.users.keys())
-
console.print("[yellow]No users found in Git store[/yellow]")
-
link_categories = {"internal": [], "user": [], "unknown": []}
-
link_dict = {} # Dictionary with link URL as key, maps to list of atom IDs
-
reverse_dict = {} # Dictionary with atom ID as key, maps to list of URLs
-
TextColumn("[progress.description]{task.description}"),
-
# Count total entries first
-
counting_task = progress.add_task("Counting entries...", total=len(users))
-
entries = git_store.list_entries(username)
-
total_entries += len(entries)
-
progress.advance(counting_task)
-
progress.remove_task(counting_task)
-
processing_task = progress.add_task(
-
f"Processing {total_entries} entries...",
-
entries = git_store.list_entries(username)
-
user_metadata = index.users[username]
-
# Get base URL for this user (use first feed URL)
-
base_url = str(user_metadata.feeds[0]) if user_metadata.feeds else "https://example.com"
-
# Extract links from this entry
-
entry_links = link_extractor.extract_links_from_entry(entry, username, base_url)
-
# Track unique links per entry
-
entry_urls_seen = set()
-
for link_data in entry_links:
-
# Skip if we've already seen this URL in this entry
-
if link_data.url in entry_urls_seen:
-
entry_urls_seen.add(link_data.url)
-
category, target_username = categorizer.categorize_url(link_data.url, username)
-
# Add to link dictionary (URL as key, maps to list of atom IDs)
-
if link_data.url not in link_dict:
-
link_dict[link_data.url] = []
-
if link_data.entry_id not in link_dict[link_data.url]:
-
link_dict[link_data.url].append(link_data.entry_id)
-
# Also add to reverse mapping (atom ID -> list of URLs)
-
if link_data.entry_id not in reverse_dict:
-
reverse_dict[link_data.entry_id] = []
-
if link_data.url not in reverse_dict[link_data.entry_id]:
-
reverse_dict[link_data.entry_id].append(link_data.url)
-
# Add category info to link data for categories tracking
-
link_info = link_data.to_dict()
-
link_info["category"] = category
-
link_info["target_username"] = target_username
-
all_links.append(link_info)
-
link_categories[category].append(link_info)
-
progress.advance(processing_task)
-
if verbose and entry_links:
-
console.print(f" Found {len(entry_links)} links in {username}:{entry.title[:50]}...")
-
# Determine output path
-
output_path = output_file
-
if output_path is None:
-
output_path = config.git_store / "links.json"
-
# Save all extracted links (not just filtered ones)
-
console.print("Preparing output data...")
-
# Build a set of all URLs that correspond to posts in the git database
-
registered_urls = set()
-
# Get all entries from all users and build URL mappings
-
for username in users:
-
entries = git_store.list_entries(username)
-
user_metadata = index.users[username]
-
for entry in entries:
-
# Try to match entry URLs with extracted links
-
if hasattr(entry, 'link') and entry.link:
-
registered_urls.add(str(entry.link))
-
# Also check entry alternate links if they exist
-
if hasattr(entry, 'links') and entry.links:
-
for link in entry.links:
-
if hasattr(link, 'href') and link.href:
-
registered_urls.add(str(link.href))
-
# Build unified structure with metadata
-
unified_links = {}
-
reverse_mapping = {}
-
for url, entry_ids in link_dict.items():
-
unified_links[url] = {
-
"referencing_entries": entry_ids
-
}
-
# Find target username if this is a tracked post
-
if url in registered_urls:
-
user_domains_set = {domain for domain in user_domains.get(username, [])}
-
if any(domain in url for domain in user_domains_set):
-
unified_links[url]["target_username"] = username
-
# Build reverse mapping
-
for entry_id in entry_ids:
-
if entry_id not in reverse_mapping:
-
reverse_mapping[entry_id] = []
-
if url not in reverse_mapping[entry_id]:
-
reverse_mapping[entry_id].append(url)
-
# Create unified output data
-
"links": unified_links,
-
"reverse_mapping": reverse_mapping,
-
"user_domains": {k: list(v) for k, v in user_domains.items()}
-
console.print(f"Found {len(registered_urls)} registered post URLs")
-
console.print(f"Found {len(link_dict)} total links, {sum(1 for link in unified_links.values() if 'target_username' in link)} tracked posts")
-
with open(output_path, "w") as f:
-
json.dump(output_data, f, indent=2, default=str)
-
console.print("\n[green]โ Links extraction completed successfully[/green]")
-
# Create summary table or TSV output
-
print("Category\tCount\tDescription")
-
print(f"Internal\t{len(link_categories['internal'])}\tLinks to same user's domain")
-
print(f"User\t{len(link_categories['user'])}\tLinks to other tracked users")
-
print(f"Unknown\t{len(link_categories['unknown'])}\tLinks to external sites")
-
print(f"Total Extracted\t{len(all_links)}\tAll extracted links")
-
print(f"Saved to Output\t{len(output_data['links'])}\tLinks saved to output file")
-
print(f"Cross-references\t{sum(1 for link in unified_links.values() if 'target_username' in link)}\tLinks to registered posts only")
-
table = Table(title="Links Summary")
-
table.add_column("Category", style="cyan")
-
table.add_column("Count", style="green")
-
table.add_column("Description", style="white")
-
table.add_row("Internal", str(len(link_categories["internal"])), "Links to same user's domain")
-
table.add_row("User", str(len(link_categories["user"])), "Links to other tracked users")
-
table.add_row("Unknown", str(len(link_categories["unknown"])), "Links to external sites")
-
table.add_row("Total Extracted", str(len(all_links)), "All extracted links")
-
table.add_row("Saved to Output", str(len(output_data['links'])), "Links saved to output file")
-
table.add_row("Cross-references", str(sum(1 for link in unified_links.values() if 'target_username' in link)), "Links to registered posts only")
-
# Show user links if verbose
-
if verbose and link_categories["user"]:
-
print("User Link Source\tUser Link Target\tLink Count")
-
for link in link_categories["user"]:
-
key = f"{link['username']} -> {link['target_username']}"
-
user_link_counts[key] = user_link_counts.get(key, 0) + 1
-
for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
-
source, target = link_pair.split(" -> ")
-
print(f"{source}\t{target}\t{count}")
-
console.print("\n[bold]User-to-user links:[/bold]")
-
for link in link_categories["user"]:
-
key = f"{link['username']} -> {link['target_username']}"
-
user_link_counts[key] = user_link_counts.get(key, 0) + 1
-
for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
-
console.print(f" {link_pair}: {count} links")
-
console.print(f"\nUnified links data saved to: {output_path}")
-
console.print(f"[red]Error extracting links: {e}[/red]")
-
console.print_exception()
-
<file path="src/thicket/cli/commands/list_cmd.py">
-
"""List command for thicket."""
-
from pathlib import Path
-
from typing import Optional
-
from rich.table import Table
-
from ...core.git_store import GitStore
-
print_feeds_table_from_git,
-
print_users_table_from_git,
-
what: str = typer.Argument(..., help="What to list: 'users', 'feeds', 'entries'"),
-
user: Optional[str] = typer.Option(
-
None, "--user", "-u", help="Filter by specific user"
-
limit: Optional[int] = typer.Option(
-
None, "--limit", "-l", help="Limit number of results"
-
config_file: Optional[Path] = typer.Option(
-
Path("thicket.yaml"), "--config", help="Configuration file path"
-
"""List users, feeds, or entries."""
-
config = load_config(config_file)
-
git_store = GitStore(config.git_store)
-
list_feeds(git_store, user)
-
elif what == "entries":
-
list_entries(git_store, user, limit)
-
print_error(f"Unknown list type: {what}")
-
print_error("Use 'users', 'feeds', or 'entries'")
-
def list_users(git_store: GitStore) -> None:
-
index = git_store._load_index()
-
users = list(index.users.values())
-
print_info("No users configured")
-
print_users_table_from_git(users)
-
def list_feeds(git_store: GitStore, username: Optional[str] = None) -> None:
-
"""List feeds, optionally filtered by user."""
-
user = git_store.get_user(username)
-
print_error(f"User '{username}' not found")
-
print_info(f"No feeds configured for user '{username}'")
-
print_feeds_table_from_git(git_store, username)
-
def list_entries(git_store: GitStore, username: Optional[str] = None, limit: Optional[int] = None) -> None:
-
"""List entries, optionally filtered by user."""
-
# List entries for specific user
-
user = git_store.get_user(username)
-
print_error(f"User '{username}' not found")
-
entries = git_store.list_entries(username, limit)
-
print_info(f"No entries found for user '{username}'")
-
print_entries_table([entries], [username])
-
# List entries for all users
-
index = git_store._load_index()
-
for user in index.users.values():
-
entries = git_store.list_entries(user.username, limit)
-
all_entries.append(entries)
-
all_usernames.append(user.username)
-
print_info("No entries found")
-
print_entries_table(all_entries, all_usernames)
-
def _clean_html_content(content: Optional[str]) -> str:
-
"""Clean HTML content for display in table."""
-
clean_text = re.sub(r'<[^>]+>', ' ', content)
-
# Replace multiple whitespace with single space
-
clean_text = re.sub(r'\s+', ' ', clean_text)
-
# Strip and limit length
-
clean_text = clean_text.strip()
-
if len(clean_text) > 100:
-
clean_text = clean_text[:97] + "..."
-
return clean_text
-
def print_entries_table(entries_by_user: list[list], usernames: list[str]) -> None:
-
"""Print a table of entries."""
-
print_entries_tsv(entries_by_user, usernames)
-
table = Table(title="Feed Entries")
-
table.add_column("User", style="cyan", no_wrap=True)
-
table.add_column("Title", style="bold")
-
table.add_column("Updated", style="blue")
-
table.add_column("URL", style="green")
-
# Combine all entries with usernames
-
all_entries = []
-
for entries, username in zip(entries_by_user, usernames):
-
for entry in entries:
-
all_entries.append((username, entry))
-
# Sort by updated time (newest first)
-
all_entries.sort(key=lambda x: x[1].updated, reverse=True)
-
for username, entry in all_entries:
-
updated_str = entry.updated.strftime("%Y-%m-%d %H:%M")
-
# Truncate title if too long
-
title = title[:47] + "..."
-
<file path="src/thicket/cli/main.py">
-
"""Main CLI application using Typer."""
-
from rich.console import Console
-
from .. import __version__
-
help="A CLI tool for persisting Atom/RSS feeds in Git repositories",
-
rich_markup_mode="rich",
-
# Global state for TSV output mode
-
def version_callback(value: bool) -> None:
-
"""Show version and exit."""
-
console.print(f"thicket version {__version__}")
-
version: bool = typer.Option(
-
help="Show the version and exit",
-
callback=version_callback,
-
tsv: bool = typer.Option(
-
help="Output in tab-separated values format without truncation",
-
"""Thicket: A CLI tool for persisting Atom/RSS feeds in Git repositories."""
-
# Import commands to register them
-
from .commands import add, duplicates, generate, index_cmd, info_cmd, init, links_cmd, list_cmd, sync
-
if __name__ == "__main__":
-
<file path="src/thicket/core/git_store.py">
-
"""Git repository operations for thicket."""
-
from datetime import datetime
-
from pathlib import Path
-
from typing import Optional
-
from ..models import AtomEntry, DuplicateMap, GitStoreIndex, UserMetadata
-
"""Manages the Git repository for storing feed entries."""
-
def __init__(self, repo_path: Path):
-
"""Initialize the Git store."""
-
self.repo_path = repo_path
-
self.repo: Optional[Repo] = None
-
def _ensure_repo(self) -> None:
-
"""Ensure the Git repository exists and is initialized."""
-
if not self.repo_path.exists():
-
self.repo_path.mkdir(parents=True, exist_ok=True)
-
self.repo = Repo(self.repo_path)
-
except git.InvalidGitRepositoryError:
-
# Initialize new repository
-
self.repo = Repo.init(self.repo_path)
-
self._create_initial_structure()
-
def _create_initial_structure(self) -> None:
-
"""Create initial Git store structure."""
-
created=datetime.now(),
-
last_updated=datetime.now(),
-
self._save_index(index)
-
# Create duplicates.json
-
duplicates = DuplicateMap()
-
self._save_duplicates(duplicates)
-
# Create initial commit
-
self.repo.index.add(["index.json", "duplicates.json"])
-
self.repo.index.commit("Initial thicket repository structure")
-
def _save_index(self, index: GitStoreIndex) -> None:
-
"""Save the index to index.json."""
-
index_path = self.repo_path / "index.json"
-
with open(index_path, "w") as f:
-
json.dump(index.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)
-
def _load_index(self) -> GitStoreIndex:
-
"""Load the index from index.json."""
-
index_path = self.repo_path / "index.json"
-
if not index_path.exists():
-
created=datetime.now(),
-
last_updated=datetime.now(),
-
with open(index_path) as f:
-
data = json.load(f)
-
return GitStoreIndex(**data)
-
def _save_duplicates(self, duplicates: DuplicateMap) -> None:
-
"""Save duplicates map to duplicates.json."""
-
duplicates_path = self.repo_path / "duplicates.json"
-
with open(duplicates_path, "w") as f:
-
json.dump(duplicates.model_dump(exclude_none=True), f, indent=2)
-
def _load_duplicates(self) -> DuplicateMap:
-
"""Load duplicates map from duplicates.json."""
-
duplicates_path = self.repo_path / "duplicates.json"
-
if not duplicates_path.exists():
-
with open(duplicates_path) as f:
-
data = json.load(f)
-
return DuplicateMap(**data)
-
def add_user(self, username: str, display_name: Optional[str] = None,
-
email: Optional[str] = None, homepage: Optional[str] = None,
-
icon: Optional[str] = None, feeds: Optional[list[str]] = None) -> UserMetadata:
-
"""Add a new user to the Git store."""
-
index = self._load_index()
-
# Create user directory
-
user_dir = self.repo_path / username
-
user_dir.mkdir(exist_ok=True)
-
user_metadata = UserMetadata(
-
display_name=display_name,
-
created=datetime.now(),
-
last_updated=datetime.now(),
-
index.add_user(user_metadata)
-
self._save_index(index)
-
def get_user(self, username: str) -> Optional[UserMetadata]:
-
"""Get user metadata by username."""
-
index = self._load_index()
-
return index.get_user(username)
-
def update_user(self, username: str, **kwargs) -> bool:
-
"""Update user metadata."""
-
index = self._load_index()
-
user = index.get_user(username)
-
for key, value in kwargs.items():
-
if hasattr(user, key) and value is not None:
-
setattr(user, key, value)
-
user.update_timestamp()
-
self._save_index(index)
-
def store_entry(self, username: str, entry: AtomEntry) -> bool:
-
"""Store an entry in the user's directory."""
-
user = self.get_user(username)
-
# Sanitize entry ID for filename
-
from .feed_parser import FeedParser
-
safe_id = parser.sanitize_entry_id(entry.id)
-
user_dir = self.repo_path / user.directory
-
entry_path = user_dir / f"{safe_id}.json"
-
# Check if entry already exists
-
entry_exists = entry_path.exists()
-
with open(entry_path, "w") as f:
-
json.dump(entry.model_dump(mode="json", exclude_none=True), f, indent=2, default=str)
-
# Update user metadata if new entry
-
index = self._load_index()
-
index.update_entry_count(username, 1)
-
self._save_index(index)
-
def get_entry(self, username: str, entry_id: str) -> Optional[AtomEntry]:
-
"""Get an entry by username and entry ID."""
-
user = self.get_user(username)
-
from .feed_parser import FeedParser
-
safe_id = parser.sanitize_entry_id(entry_id)
-
entry_path = self.repo_path / user.directory / f"{safe_id}.json"
-
if not entry_path.exists():
-
with open(entry_path) as f:
-
data = json.load(f)
-
return AtomEntry(**data)
-
def list_entries(self, username: str, limit: Optional[int] = None) -> list[AtomEntry]:
-
"""List entries for a user."""
-
user = self.get_user(username)
-
user_dir = self.repo_path / user.directory
-
if not user_dir.exists():
-
entry_files = sorted(user_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
-
entry_files = entry_files[:limit]
-
for entry_file in entry_files:
-
with open(entry_file) as f:
-
data = json.load(f)
-
entries.append(AtomEntry(**data))
-
def get_duplicates(self) -> DuplicateMap:
-
"""Get the duplicates map."""
-
return self._load_duplicates()
-
def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
-
"""Add a duplicate mapping."""
-
duplicates = self._load_duplicates()
-
duplicates.add_duplicate(duplicate_id, canonical_id)
-
self._save_duplicates(duplicates)
-
def remove_duplicate(self, duplicate_id: str) -> bool:
-
"""Remove a duplicate mapping."""
-
duplicates = self._load_duplicates()
-
result = duplicates.remove_duplicate(duplicate_id)
-
self._save_duplicates(duplicates)
-
def commit_changes(self, message: str) -> None:
-
"""Commit all changes to the Git repository."""
-
self.repo.git.add(A=True)
-
# Check if there are changes to commit
-
if self.repo.index.diff("HEAD"):
-
self.repo.index.commit(message)
-
def get_stats(self) -> dict:
-
"""Get statistics about the Git store."""
-
index = self._load_index()
-
duplicates = self._load_duplicates()
-
"total_users": len(index.users),
-
"total_entries": index.total_entries,
-
"total_duplicates": len(duplicates.duplicates),
-
"last_updated": index.last_updated,
-
"repository_size": sum(f.stat().st_size for f in self.repo_path.rglob("*") if f.is_file()),
-
def search_entries(self, query: str, username: Optional[str] = None,
-
limit: Optional[int] = None) -> list[tuple[str, AtomEntry]]:
-
"""Search entries by content."""
-
index = self._load_index()
-
users = [index.get_user(username)] if username else list(index.users.values())
-
users = [u for u in users if u is not None]
-
user_dir = self.repo_path / user.directory
-
if not user_dir.exists():
-
entry_files = user_dir.glob("*.json")
-
for entry_file in entry_files:
-
with open(entry_file) as f:
-
data = json.load(f)
-
entry = AtomEntry(**data)
-
# Simple text search in title, summary, and content
-
searchable_text = " ".join(filter(None, [
-
if query.lower() in searchable_text:
-
results.append((user.username, entry))
-
if limit and len(results) >= limit:
-
# Sort by updated time (newest first)
-
results.sort(key=lambda x: x[1].updated, reverse=True)
-
return results[:limit] if limit else results
-
# Thicket Architecture Design
-
Thicket is a modern CLI tool for persisting Atom/RSS feeds in a Git repository, designed to enable distributed weblog comment structures.
-
- **Typer** (0.15.x) - Modern CLI framework with type hints
-
- **Rich** (13.x) - Beautiful terminal output, progress bars, and tables
-
- **prompt-toolkit** - Interactive prompts when needed
-
- **feedparser** (6.0.11) - Universal feed parser supporting RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0
-
- Alternative: **atoma** for stricter Atom/RSS parsing with JSON feed support
-
- Alternative: **fastfeedparser** for high-performance parsing (10x faster)
-
- **GitPython** (3.1.44) - High-level git operations, requires git CLI
-
- Alternative: **pygit2** (1.18.0) - Direct libgit2 bindings, better for authentication
-
- **httpx** (0.28.x) - Modern async/sync HTTP client with connection pooling
-
- **aiohttp** (3.11.x) - For async-only operations if needed
-
#### Configuration & Data Models
-
- **pydantic** (2.11.x) - Data validation and settings management
-
- **pydantic-settings** (2.10.x) - Configuration file handling with env var support
-
- **pendulum** (3.x) - Better datetime handling
-
- **bleach** (6.x) - HTML sanitization for feed content
-
- **platformdirs** (4.x) - Cross-platform directory paths
-
├── pyproject.toml              # Modern Python packaging
-
├── README.md                   # Project documentation
-
├── ARCH.md                     # This file
-
├── CLAUDE.md                   # Project instructions
-
│   ├── __init__.py
-
│   ├── __main__.py             # Entry point for `python -m thicket`
-
│   ├── cli/                    # CLI commands and interface
-
│   │   ├── __init__.py
-
│   │   ├── main.py             # Main CLI app with Typer
-
│   │   ├── commands/           # Subcommands
-
│   │   │   ├── __init__.py
-
│   │   │   ├── init.py         # Initialize git store
-
│   │   │   ├── add.py          # Add users and feeds
-
│   │   │   ├── sync.py         # Sync feeds
-
│   │   │   ├── list_cmd.py     # List users/feeds
-
│   │   │   ├── duplicates.py   # Manage duplicate entries
-
│   │   │   ├── links_cmd.py    # Extract and categorize links
-
│   │   │   └── index_cmd.py    # Build reference index and show threads
-
│   │   └── utils.py            # CLI utilities (progress, formatting)
-
│   ├── core/                   # Core business logic
-
│   │   ├── __init__.py
-
│   │   ├── feed_parser.py      # Feed parsing and normalization
-
│   │   ├── git_store.py        # Git repository operations
-
│   │   └── reference_parser.py # Link extraction and threading
-
│   ├── models/                 # Pydantic data models
-
│   │   ├── __init__.py
-
│   │   ├── config.py           # Configuration models
-
│   │   ├── feed.py             # Feed/Entry models
-
│   │   └── user.py             # User metadata models
-
│   └── utils/                  # Shared utilities
-
│       └── __init__.py
-
│   ├── __init__.py
-
│   ├── conftest.py             # pytest configuration
-
│   ├── test_feed_parser.py
-
│   ├── test_git_store.py
-
│   └── fixtures/               # Test data
-
└── examples/                   # Example configurations
-
### Configuration File (YAML/TOML)
-
class ThicketConfig(BaseSettings):
-
git_store: Path # Git repository location
-
cache_dir: Path # Cache directory
-
users: list[UserConfig]
-
model_config = SettingsConfigDict(
-
yaml_file="thicket.yaml"
-
class UserConfig(BaseModel):
-
email: Optional[EmailStr] = None
-
homepage: Optional[HttpUrl] = None
-
icon: Optional[HttpUrl] = None
-
display_name: Optional[str] = None
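-
For concreteness, a minimal `thicket.yaml` matching these models might look like the following. This is a hypothetical sketch: the paths, user, and feed URL are placeholders, and field names follow `ThicketConfig`/`UserConfig` above.

```yaml
git_store: /home/alyssa/thicket-store    # Git repository location
cache_dir: /home/alyssa/.cache/thicket   # HTTP/feed cache
users:
  - username: alyssa
    display_name: "Alyssa P. Hacker"
    email: alyssa@example.com
    homepage: https://alyssa.example.com
    feeds:
      - https://alyssa.example.com/feed.atom
```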
-
### Feed Storage Format
-
class AtomEntry(BaseModel):
-
id: str # Original Atom ID
-
published: Optional[datetime]
-
content: Optional[str] # Full body content from Atom entry
-
content_type: Optional[str] = "html" # text, html, xhtml
-
categories: list[str] = []
-
rights: Optional[str] = None # Copyright info
-
source: Optional[str] = None # Source feed URL
-
# Additional Atom fields preserved during RSS->Atom conversion
-
model_config = ConfigDict(
-
json_encoders={datetime: lambda v: v.isoformat()}
-
)
-
class DuplicateMap(BaseModel):
-
"""Maps duplicate entry IDs to canonical entry IDs"""
-
duplicates: dict[str, str] = {} # duplicate_id -> canonical_id
-
comment: str = "Entry IDs that map to the same canonical content"
-
def add_duplicate(self, duplicate_id: str, canonical_id: str) -> None:
-
"""Add a duplicate mapping"""
-
self.duplicates[duplicate_id] = canonical_id
-
def remove_duplicate(self, duplicate_id: str) -> bool:
-
"""Remove a duplicate mapping. Returns True if existed."""
-
return self.duplicates.pop(duplicate_id, None) is not None
-
def get_canonical(self, entry_id: str) -> str:
-
"""Get canonical ID for an entry (returns original if not duplicate)"""
-
return self.duplicates.get(entry_id, entry_id)
-
def is_duplicate(self, entry_id: str) -> bool:
-
"""Check if entry ID is marked as duplicate"""
-
return entry_id in self.duplicates
-
## Git Repository Structure
-
├── index.json        # User directory index
-
├── duplicates.json   # Manual curation of duplicate entries
-
├── links.json        # Unified links, references, and mapping data
-
│   ├── entry_id_1.json   # Sanitized entry files
-
│   └── entry_id_2.json
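-
As an illustration only (not a normative schema), an `index.json` built from the `GitStoreIndex`/`UserMetadata` models could look like this; the concrete values are invented:

```json
{
  "created": "2025-01-01T00:00:00",
  "last_updated": "2025-06-01T12:00:00",
  "total_entries": 42,
  "users": {
    "alyssa": {
      "username": "alyssa",
      "display_name": "Alyssa P. Hacker",
      "homepage": "https://alyssa.example.com",
      "feeds": ["https://alyssa.example.com/feed.atom"],
      "directory": "alyssa",
      "created": "2025-01-01T00:00:00",
      "last_updated": "2025-06-01T12:00:00"
    }
  }
}
```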
-
## Key Design Decisions
-
### 1. Feed Normalization & Auto-Discovery
-
- All RSS feeds converted to Atom format before storage
-
- Preserves maximum metadata during conversion
-
- Sanitizes HTML content to prevent XSS (a sketch follows this list)
-
- **Auto-discovery**: Extracts user metadata from feed during `add user` command
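-
A minimal sketch of the sanitization step with bleach; the allowlist below is illustrative, not the project's actual policy:

```python
import bleach

# Hypothetical allowlist: which tags/attributes survive is a project decision.
ALLOWED_TAGS = ["a", "blockquote", "code", "em", "li", "ol", "p", "pre", "strong", "ul"]
ALLOWED_ATTRS = {"a": ["href", "title"]}

def sanitize_content(html: str) -> str:
    """Strip script/style and unknown markup from feed-provided HTML."""
    return bleach.clean(html, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRS, strip=True)
```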
-
- Consistent algorithm to convert Atom IDs to safe filenames (hypothetical sketch below)
-
- Handles edge cases (very long IDs, special characters)
-
- Maintains reversibility where possible
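-
The repository implements this in `FeedParser.sanitize_entry_id`; a hypothetical version of the idea (not the actual algorithm) might be:

```python
import hashlib
import re

def sanitize_entry_id(entry_id: str, max_len: int = 100) -> str:
    """Map an Atom ID to a filesystem-safe filename stem."""
    # Conservative character set; everything else becomes '_'.
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", entry_id)
    if len(safe) <= max_len:
        return safe
    # Very long IDs: keep a readable prefix plus a stable hash suffix.
    # (A fully reversible scheme would need an encoding such as URL-safe base64.)
    digest = hashlib.sha256(entry_id.encode("utf-8")).hexdigest()[:16]
    return f"{safe[:max_len - 17]}_{digest}"
```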
-
- Uses GitPython for simplicity (no authentication required)
-
- Single main branch for all users and entries
-
- Atomic commits per sync operation
-
- Meaningful commit messages with feed update summaries
-
- Preserves complete history - never delete entries even if they disappear from feeds
-
### 4. Caching Strategy
-
- HTTP caching with Last-Modified/ETag support (sketched after this list)
-
- Local cache of parsed feeds with TTL
-
- Cache invalidation on configuration changes
-
- Git store serves as permanent historical archive beyond feed depth limits
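-
The conditional-fetch idea in sketch form, assuming httpx and a simple per-feed cache dict (the cache shape is an assumption):

```python
from typing import Optional

import httpx

def fetch_if_changed(url: str, cache: dict) -> Optional[bytes]:
    """Return new feed bytes, or None when the server answers 304 Not Modified."""
    headers = {}
    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]
    resp = httpx.get(url, headers=headers, follow_redirects=True, timeout=30.0)
    if resp.status_code == 304:
        return None  # cached copy is still current
    resp.raise_for_status()
    cache["etag"] = resp.headers.get("ETag")
    cache["last_modified"] = resp.headers.get("Last-Modified")
    return resp.content
```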
-
- Graceful handling of feed parsing errors
-
- Retry logic for network failures (sketched after this list)
-
- Clear error messages with recovery suggestions
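-
A sketch of the retry behavior, assuming httpx and simple exponential backoff (attempt counts and delays are placeholders):

```python
import time

import httpx

def fetch_with_retries(url: str, attempts: int = 3) -> httpx.Response:
    """Retry transient network failures before surfacing a clear error.

    Assumes attempts >= 1.
    """
    for attempt in range(attempts):
        try:
            resp = httpx.get(url, follow_redirects=True, timeout=30.0)
            resp.raise_for_status()
            return resp
        except (httpx.TransportError, httpx.HTTPStatusError):
            if attempt == attempts - 1:
                raise  # let the CLI report the failure with recovery hints
            time.sleep(2 ** attempt)  # back off: 1s, 2s, ...
```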
-
## CLI Command Structure
-
# Initialize a new git store
-
thicket init /path/to/store
-
# Add a user with feeds (auto-discovers metadata from feed)
-
thicket add user "alyssa" \
-
--feed "https://example.com/feed.atom"
-
# Auto-populates: email, homepage, icon, display_name from feed metadata
-
# Add a user with manual overrides
-
thicket add user "alyssa" \
-
--feed "https://example.com/feed.atom" \
-
--email "alyssa@example.com" \
-
--homepage "https://alyssa.example.com" \
-
--icon "https://example.com/avatar.png" \
-
--display-name "Alyssa P. Hacker"
-
# Add additional feed to existing user
-
thicket add feed "alyssa" "https://example.com/other-feed.rss"
-
# Sync all feeds (designed for cron usage)
-
thicket sync --user alyssa
-
# List users and their feeds
-
thicket list feeds --user alyssa
-
# Manage duplicate entries
-
thicket duplicates list
-
thicket duplicates add <entry_id_1> <entry_id_2> # Mark as duplicates
-
thicket duplicates remove <entry_id_1> <entry_id_2> # Unmark duplicates
-
# Link processing and threading
-
thicket links --verbose # Extract and categorize all links
-
thicket index --verbose # Build reference index for threading
-
thicket threads # Show conversation threads
-
thicket threads --username user1 # Show threads for specific user
-
thicket threads --min-size 3 # Show threads with minimum size
-
## Performance Considerations
-
1. **Concurrent Feed Fetching**: Use httpx with asyncio for parallel downloads (sketched below)
-
2. **Incremental Updates**: Only fetch/parse feeds that have changed
-
3. **Efficient Git Operations**: Batch commits, use shallow clones where appropriate
-
4. **Progress Feedback**: Rich progress bars for long operations
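-
Item 1 in sketch form, assuming httpx's async client (the URL list is a placeholder):

```python
import asyncio

import httpx

async def fetch_all(urls: list[str]) -> dict[str, str]:
    """Download many feeds concurrently over a shared connection pool."""
    async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:

        async def fetch(url: str) -> tuple[str, str]:
            resp = await client.get(url)
            resp.raise_for_status()
            return url, resp.text

        pairs = await asyncio.gather(*(fetch(u) for u in urls))
    return dict(pairs)

# bodies = asyncio.run(fetch_all(["https://example.com/feed.atom"]))
```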
-
## Security Considerations
-
1. **HTML Sanitization**: Use bleach to clean feed content
-
2. **URL Validation**: Strict validation of feed URLs
-
3. **Git Security**: No credentials stored in repository
-
4. **Path Traversal**: Careful sanitization of filenames
-
1. **Web Interface**: Optional web UI for browsing the git store
-
2. **Webhooks**: Notify external services on feed updates
-
3. **Feed Discovery**: Auto-discover feeds from HTML pages
-
4. **Export Formats**: Generate static sites, OPML exports
-
5. **Federation**: P2P sync between thicket instances
-
## Requirements Clarification
-
**✅ Resolved Requirements:**
-
1. **Feed Update Frequency**: Designed for cron usage - no built-in scheduling needed
-
2. **Duplicate Handling**: Manual curation via `duplicates.json` file with CLI commands
-
3. **Git Branching**: Single main branch for all users and entries
-
4. **Authentication**: No feeds require authentication currently
-
5. **Content Storage**: Store complete Atom entry body content as provided
-
6. **Deleted Entries**: Preserve all entries in Git store permanently (historical archive)
-
7. **History Depth**: Git store maintains full history beyond feed depth limits
-
8. **Feed Auto-Discovery**: Extract user metadata from feed during `add user` command
-
## Duplicate Entry Management
-
### Duplicate Detection Strategy
-
- **Manual Curation**: Duplicates identified and managed manually via CLI
-
- **Storage**: `duplicates.json` file in Git root maps entry IDs to canonical entries
-
- **Structure**: `{"duplicate_id": "canonical_id", ...}`
-
- **CLI Commands**: Add/remove duplicate mappings with validation
-
- **Query Resolution**: Search/list commands resolve duplicates to canonical entries
-
### Duplicate File Format
-
"https://example.com/feed/entry/123": "https://canonical.com/posts/same-post",
-
"https://mirror.com/articles/456": "https://canonical.com/posts/same-post",
-
"comment": "Entry IDs that map to the same canonical content"
-
## Feed Metadata Auto-Discovery
-
### Extraction Strategy
-
When adding a new user with `thicket add user`, the system fetches and parses the feed to extract:
-
- **Display Name**: From `feed.title` or `feed.author.name`
-
- **Email**: From `feed.author.email` or `feed.managingEditor`
-
- **Homepage**: From `feed.link` or `feed.author.uri`
-
- **Icon**: From `feed.logo`, `feed.icon`, or `feed.image.url`
-
### Discovery Priority Order
-
1. **Author Information**: Prefer `feed.author.*` fields (more specific to person)
-
2. **Feed-Level**: Fall back to feed-level metadata
-
3. **Manual Override**: CLI flags always take precedence over discovered values
-
4. **Update Behavior**: Auto-discovery only runs during initial `add user`, not on sync (see the sketch below)
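-
A sketch of the discovery step with feedparser, returning the `FeedMetadata` model defined below; the dictionary keys are feedparser's normalized field names:

```python
import feedparser

def discover_metadata(feed_url: str) -> "FeedMetadata":
    """Fetch a feed and extract author- and feed-level metadata."""
    parsed = feedparser.parse(feed_url)
    feed = parsed.feed
    author = feed.get("author_detail", {})  # name / email / href of the author
    image = feed.get("image", {})
    return FeedMetadata(
        title=feed.get("title"),
        author_name=author.get("name"),
        author_email=author.get("email"),
        author_uri=author.get("href"),
        link=feed.get("link"),
        logo=feed.get("logo"),
        icon=feed.get("icon"),
        image_url=image.get("href"),
    )
```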
-
### Extracted Metadata Format
-
class FeedMetadata(BaseModel):
-
title: Optional[str] = None
-
author_name: Optional[str] = None
-
author_email: Optional[EmailStr] = None
-
author_uri: Optional[HttpUrl] = None
-
link: Optional[HttpUrl] = None
-
logo: Optional[HttpUrl] = None
-
icon: Optional[HttpUrl] = None
-
image_url: Optional[HttpUrl] = None
-
def to_user_config(self, username: str, feed_url: HttpUrl) -> UserConfig:
-
"""Convert discovered metadata to UserConfig with fallbacks"""
-
display_name=self.author_name or self.title,
-
email=self.author_email,
-
homepage=self.author_uri or self.link,
-
icon=self.logo or self.icon or self.image_url
-
## Link Processing and Threading Architecture
-
The thicket system implements a sophisticated link processing and threading system to create email-style threaded views of blog entries by tracking cross-references between different blogs.
-
### Link Processing Pipeline
-
#### 1. Link Extraction (`thicket links`)
-
The `links` command systematically extracts all outbound links from blog entries and categorizes them:
-
class LinkData(BaseModel):
-
url: str # Fully resolved URL
-
entry_id: str # Source entry ID
-
username: str # Source username
-
context: str # Surrounding text context
-
category: str # "internal", "user", or "unknown"
-
target_username: Optional[str] # Target user if applicable
-
- **Internal**: Links to the same user's domain (self-references)
-
- **User**: Links to other tracked users' domains
-
- **Unknown**: Links to external sites not tracked by thicket
-
All links are properly resolved using the Atom feed's base URL to handle:
-
- Relative URLs (converted to absolute)
-
- Protocol-relative URLs
-
- Redirects and canonical URLs
-
The system builds a comprehensive domain mapping from user configuration:
-
- Feed URLs → domain extraction
-
- Homepage URLs → domain extraction
-
- Reverse mapping: domain → username (combined sketch below)
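-
A condensed sketch of resolution plus domain-based categorization. This shows the decision rule only; the real `LinkCategorizer` may differ, and redirect following is not shown:

```python
from typing import Optional
from urllib.parse import urljoin, urlparse

def categorize_url(href: str, base_url: str, source_user: str,
                   user_domains: dict[str, set[str]]) -> tuple[str, str, Optional[str]]:
    """Resolve an href against the feed's base URL, then classify by domain."""
    url = urljoin(base_url, href)  # handles relative and protocol-relative hrefs
    domain = urlparse(url).netloc.lower()
    for username, domains in user_domains.items():
        if domain in domains:
            category = "internal" if username == source_user else "user"
            return url, category, username
    return url, "unknown", None
```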
-
#### 1. Reference Index Generation (`thicket index`)
-
Creates a bidirectional reference index from the categorized links:
-
class BlogReference(BaseModel):
-
target_username: Optional[str]
-
target_entry_id: Optional[str]
-
#### 2. Thread Detection Algorithm
-
Uses graph traversal to find connected blog entries (a sketch follows this list):
-
- **Outbound references**: Links from an entry to other entries
-
- **Inbound references**: Links to an entry from other entries
-
- **Thread members**: All entries connected through references
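-
One way to realize the traversal (a sketch, not the shipped algorithm): treat entries as nodes, references in either direction as undirected edges, and report connected components as threads:

```python
from collections import defaultdict, deque

def find_threads(edges: list[tuple[str, str]]) -> list[set[str]]:
    """Group entry IDs into threads: connected components of the reference graph."""
    graph: dict[str, set[str]] = defaultdict(set)
    for src, dst in edges:
        graph[src].add(dst)  # outbound reference
        graph[dst].add(src)  # inbound reference connects the same pair
    seen: set[str] = set()
    threads: list[set[str]] = []
    for node in list(graph):
        if node in seen:
            continue
        component: set[str] = set()
        queue = deque([node])
        while queue:
            current = queue.popleft()
            if current in component:
                continue
            component.add(current)
            queue.extend(graph[current] - component)
        seen |= component
        threads.append(component)
    return threads
```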
-
#### 3. Threading Display (`thicket threads`)
-
Creates email-style threaded views:
-
- Chronological ordering within threads
-
- Reference counts (outbound/inbound)
-
- Filtering options (user, entry, minimum size)
-
#### links.json Format (Unified Structure)
-
"https://example.com/post/123": {
-
"referencing_entries": ["https://blog.user.com/entry/456"],
-
"target_username": "user2"
-
"https://external-site.com/article": {
-
"referencing_entries": ["https://blog.user.com/entry/789"]
-
"https://blog.user.com/entry/456": ["https://example.com/post/123"],
-
"https://blog.user.com/entry/789": ["https://external-site.com/article"]
-
"source_entry_id": "https://blog.user.com/entry/456",
-
"source_username": "user1",
-
"target_url": "https://example.com/post/123",
-
"target_username": "user2",
-
"target_entry_id": "https://example.com/post/123",
-
"context": "As mentioned in this post..."
-
"user1": ["blog.user.com"],
-
"user2": ["example.com"]
-
This unified structure eliminates duplication by:
-
- Storing each URL only once with minimal metadata
-
- Including all link data, reference data, and mappings in one file
-
- Using presence of `target_username` to identify tracked vs external links
-
- Providing bidirectional mappings for efficient queries
-
### Unified Structure Benefits
-
- **Eliminates Duplication**: Each URL appears only once with metadata
-
- **Single Source of Truth**: All link-related data in one file
-
- **Efficient Queries**: Fast lookups for both directions (URL→entries, entry→URLs)
-
- **Atomic Updates**: All link data changes together
-
- **Reduced I/O**: Fewer file operations
-
### Implementation Benefits
-
1. **Systematic Link Processing**: All links are extracted and categorized consistently
-
2. **Proper URL Resolution**: Handles relative URLs and base URL resolution correctly
-
3. **Domain-based Categorization**: Automatically identifies user-to-user references
-
4. **Bidirectional Indexing**: Supports both "who links to whom" and "who is linked by whom"
-
5. **Thread Discovery**: Finds conversation threads automatically
-
6. **Rich Context**: Preserves surrounding text for each link
-
7. **Performance**: Pre-computed indexes for fast threading queries
-
# Extract and categorize all links
-
thicket links --verbose
-
# Build reference index for threading
-
thicket index --verbose
-
# Show all conversation threads
-
# Show threads for specific user
-
thicket threads --username user1
-
# Show threads with minimum size
-
thicket threads --min-size 3
-
### Integration with Existing Commands
-
The link processing system integrates seamlessly with existing thicket commands:
-
- `thicket sync` updates entries, requiring `thicket links` to be run afterward
-
- `thicket index` uses the output from `thicket links` for improved accuracy
-
- `thicket threads` provides the user-facing threading interface
-
## Current Implementation Status
-
### ✅ Completed Features
-
1. **Core Infrastructure**
-
- Modern CLI with Typer and Rich
-
- Pydantic data models for type safety
-
- Git repository operations with GitPython
-
- Feed parsing and normalization with feedparser
-
2. **User and Feed Management**
-
- `thicket init` - Initialize git store
-
- `thicket add` - Add users and feeds with auto-discovery
-
- `thicket sync` - Sync feeds with progress tracking
-
- `thicket list` - List users, feeds, and entries
-
- `thicket duplicates` - Manage duplicate entries
-
3. **Link Processing and Threading**
-
- `thicket links` - Extract and categorize all outbound links
-
- `thicket index` - Build reference index from links
-
- `thicket threads` - Display threaded conversation views
-
- Proper URL resolution with base URL handling
-
- Domain-based link categorization
-
- Context preservation for links
-
### 📊 System Performance
-
- **Link Extraction**: Successfully processes thousands of blog entries
-
- **Categorization**: Identifies internal, user, and unknown links
-
- **Threading**: Creates email-style threaded views of conversations
-
- **Storage**: Efficient JSON-based data structures for links and references
-
### 🔧 Current Architecture Highlights
-
- **Modular Design**: Clear separation between CLI, core logic, and models
-
- **Type Safety**: Comprehensive Pydantic models for data validation
-
- **Rich CLI**: Beautiful progress bars, tables, and error handling
-
- **Extensible**: Easy to add new commands and features
-
- **Git Integration**: All data stored in version-controlled JSON files
-
### 🎯 Proven Functionality
-
The system has been tested with real blog data and successfully:
-
- Extracted 14,396 total links from blog entries
-
- Categorized 3,994 internal links, 363 user-to-user links, and 10,039 unknown links
-
- Built comprehensive domain mappings for 16 users across 20 domains
-
- Generated threaded views showing blog conversation patterns
-
The thicket system is now fully functional for:
-
- Maintaining Git repositories of blog feeds
-
- Tracking cross-references between blogs
-
- Creating threaded views of blog conversations
-
- Discovering blog interaction patterns
-
- Building distributed comment systems
-
<file path="src/thicket/cli/utils.py">
-
"""CLI utilities and helpers."""
-
from pathlib import Path
-
from typing import Optional
-
from rich.console import Console
-
from rich.progress import Progress, SpinnerColumn, TextColumn
-
from rich.table import Table
-
from ..models import ThicketConfig, UserMetadata
-
from ..core.git_store import GitStore
-
def get_tsv_mode() -> bool:
-
"""Get the global TSV mode setting."""
-
from .main import tsv_mode
-
return tsv_mode
-
def load_config(config_path: Optional[Path] = None) -> ThicketConfig:
-
"""Load thicket configuration from file or environment."""
-
if config_path and config_path.exists():
-
with open(config_path) as f:
-
config_data = yaml.safe_load(f)
-
# Convert to ThicketConfig
-
return ThicketConfig(**config_data)
-
# Try to load from default locations or environment
-
# First try to find thicket.yaml in current directory
-
default_config = Path("thicket.yaml")
-
if default_config.exists():
-
with open(default_config) as f:
-
config_data = yaml.safe_load(f)
-
return ThicketConfig(**config_data)
-
# Fall back to environment variables
-
console.print(f"[red]Error loading configuration: {e}[/red]")
-
console.print("[yellow]Run 'thicket init' to create a new configuration.[/yellow]")
-
raise typer.Exit(1) from e
-
def save_config(config: ThicketConfig, config_path: Path) -> None:
-
"""Save thicket configuration to file."""
-
config_data = config.model_dump(mode="json", exclude_none=True)
-
# Convert Path objects to strings for YAML serialization
-
config_data["git_store"] = str(config_data["git_store"])
-
config_data["cache_dir"] = str(config_data["cache_dir"])
-
with open(config_path, "w") as f:
-
yaml.dump(config_data, f, default_flow_style=False, sort_keys=False)
-
def create_progress() -> Progress:
-
"""Create a Rich progress display."""
-
TextColumn("[progress.description]{task.description}"),
-
def print_users_table(config: ThicketConfig) -> None:
-
"""Print a table of users and their feeds."""
-
print_users_tsv(config)
-
table = Table(title="Users and Feeds")
-
table.add_column("Username", style="cyan", no_wrap=True)
-
table.add_column("Display Name", style="magenta")
-
table.add_column("Email", style="blue")
-
table.add_column("Homepage", style="green")
-
table.add_column("Feeds", style="yellow")
-
for user in config.users:
-
feeds_str = "\n".join(str(feed) for feed in user.feeds)
-
user.display_name or "",
-
str(user.homepage) if user.homepage else "",
-
def print_feeds_table(config: ThicketConfig, username: Optional[str] = None) -> None:
-
"""Print a table of feeds, optionally filtered by username."""
-
print_feeds_tsv(config, username)
-
table = Table(title=f"Feeds{f' for {username}' if username else ''}")
-
table.add_column("Username", style="cyan", no_wrap=True)
-
table.add_column("Feed URL", style="blue")
-
table.add_column("Status", style="green")
-
users = [config.find_user(username)] if username else config.users
-
users = [u for u in users if u is not None]
-
for feed in user.feeds:
-
"Active", # TODO: Add actual status checking
-
def confirm_action(message: str, default: bool = False) -> bool:
-
"""Prompt for confirmation."""
-
return typer.confirm(message, default=default)
-
def print_success(message: str) -> None:
-
"""Print a success message."""
-
console.print(f"[green]โ[/green] {message}")
-
def print_error(message: str) -> None:
-
"""Print an error message."""
-
console.print(f"[red]โ[/red] {message}")
-
def print_warning(message: str) -> None:
-
"""Print a warning message."""
-
console.print(f"[yellow]โ [/yellow] {message}")
-
def print_info(message: str) -> None:
-
"""Print an info message."""
-
console.print(f"[blue]โน[/blue] {message}")
-
def print_users_table_from_git(users: list[UserMetadata]) -> None:
-
"""Print a table of users from git repository."""
-
print_users_tsv_from_git(users)
-
table = Table(title="Users and Feeds")
-
table.add_column("Username", style="cyan", no_wrap=True)
-
table.add_column("Display Name", style="magenta")
-
table.add_column("Email", style="blue")
-
table.add_column("Homepage", style="green")
-
table.add_column("Feeds", style="yellow")
-
feeds_str = "\n".join(user.feeds)
-
user.display_name or "",
-
def print_feeds_table_from_git(git_store: GitStore, username: Optional[str] = None) -> None:
-
"""Print a table of feeds from git repository."""
-
print_feeds_tsv_from_git(git_store, username)
-
table = Table(title=f"Feeds{f' for {username}' if username else ''}")
-
table.add_column("Username", style="cyan", no_wrap=True)
-
table.add_column("Feed URL", style="blue")
-
table.add_column("Status", style="green")
-
user = git_store.get_user(username)
-
users = [user] if user else []
-
index = git_store._load_index()
-
users = list(index.users.values())
-
for feed in user.feeds:
-
"Active", # TODO: Add actual status checking
-
def print_users_tsv(config: ThicketConfig) -> None:
-
"""Print users in TSV format."""
-
print("Username\tDisplay Name\tEmail\tHomepage\tFeeds")
-
for user in config.users:
-
feeds_str = ",".join(str(feed) for feed in user.feeds)
-
print(f"{user.username}\t{user.display_name or ''}\t{user.email or ''}\t{user.homepage or ''}\t{feeds_str}")
-
def print_users_tsv_from_git(users: list[UserMetadata]) -> None:
-
"""Print users from git repository in TSV format."""
-
print("Username\tDisplay Name\tEmail\tHomepage\tFeeds")
-
feeds_str = ",".join(user.feeds)
-
print(f"{user.username}\t{user.display_name or ''}\t{user.email or ''}\t{user.homepage or ''}\t{feeds_str}")
-
def print_feeds_tsv(config: ThicketConfig, username: Optional[str] = None) -> None:
-
"""Print feeds in TSV format."""
-
print("Username\tFeed URL\tStatus")
-
users = [config.find_user(username)] if username else config.users
-
users = [u for u in users if u is not None]
-
for user in users:
-
for feed in user.feeds:
-
print(f"{user.username}\t{feed}\tActive")
-
def print_feeds_tsv_from_git(git_store: GitStore, username: Optional[str] = None) -> None:
-
"""Print feeds from git repository in TSV format."""
-
print("Username\tFeed URL\tStatus")
-
user = git_store.get_user(username)
-
users = [user] if user else []
-
index = git_store._load_index()
-
users = list(index.users.values())
-
for user in users:
-
for feed in user.feeds:
-
print(f"{user.username}\t{feed}\tActive")
-
def print_entries_tsv(entries_by_user: list[list], usernames: list[str]) -> None:
-
"""Print entries in TSV format."""
-
print("User\tAtom ID\tTitle\tUpdated\tURL")
-
# Combine all entries with usernames
-
all_entries = []
-
for entries, username in zip(entries_by_user, usernames):
-
for entry in entries:
-
all_entries.append((username, entry))
-
# Sort by updated time (newest first)
-
all_entries.sort(key=lambda x: x[1].updated, reverse=True)
-
for username, entry in all_entries:
-
updated_str = entry.updated.strftime("%Y-%m-%d %H:%M")
-
# Escape tabs and newlines in title to preserve TSV format
-
title = entry.title.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
-
print(f"{username}\t{entry.id}\t{title}\t{updated_str}\t{entry.link}")