"""CLI command for extracting and categorizing all outbound links from blog entries."""

import json
import re

from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import typer
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from rich.table import Table

from ...core.git_store import GitStore
from ..utils import load_config, get_tsv_mode

# Shared console for all command output
console = Console()
"""Represents a link found in a blog entry."""
+
def __init__(self, url: str, entry_id: str, username: str):
+
self.entry_id = entry_id
+
self.username = username
+
def to_dict(self) -> dict:
+
"""Convert to dictionary for JSON serialization."""
+
"entry_id": self.entry_id,
+
"username": self.username
+
def from_dict(cls, data: dict) -> "LinkData":
+
"""Create from dictionary."""
+
entry_id=data["entry_id"],
+
username=data["username"]
+
"""Categorizes links as internal, user, or unknown."""
+
def __init__(self, user_domains: Dict[str, Set[str]]):
+
self.user_domains = user_domains
+
# Create reverse mapping of domain -> username
+
self.domain_to_user = {}
+
for username, domains in user_domains.items():
+
self.domain_to_user[domain] = username
+
def categorize_url(self, url: str, source_username: str) -> tuple[str, Optional[str]]:
+
Categorize a URL as 'internal', 'user', or 'unknown'.
+
Returns (category, target_username).
+
domain = parsed.netloc.lower()
+
# Check if it's a link to the same user's domain (internal)
+
if domain in self.user_domains.get(source_username, set()):
+
return "internal", source_username
+
# Check if it's a link to another user's domain
+
if domain in self.domain_to_user:
+
return "user", self.domain_to_user[domain]
+
# Everything else is unknown
+
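
# Illustrative example of the categorization rules above (hypothetical domains):
#
#   categorizer = LinkCategorizer({"alice": {"alice.example.com"},
#                                  "bob": {"bob.example.net"}})
#   categorizer.categorize_url("https://alice.example.com/post/1", "alice")
#       -> ("internal", "alice")
#   categorizer.categorize_url("https://bob.example.net/about", "alice")
#       -> ("user", "bob")
#   categorizer.categorize_url("https://news.ycombinator.com/", "alice")
#       -> ("unknown", None)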
"""Extracts and resolves links from blog entries."""
+
# Pattern for extracting links from HTML
+
self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
+
self.url_pattern = re.compile(r'https?://[^\s<>"]+')
+
def extract_links_from_html(self, html_content: str, base_url: str) -> List[tuple[str, str]]:
+
"""Extract all links from HTML content and resolve them against base URL."""
+
# Extract links from <a> tags
+
for match in self.link_pattern.finditer(html_content):
+
text = re.sub(r'<[^>]+>', '', match.group(2)).strip() # Remove HTML tags from link text
+
# Resolve relative URLs against base URL
+
resolved_url = urljoin(base_url, url)
+
links.append((resolved_url, text))
+
def extract_links_from_entry(self, entry, username: str, base_url: str) -> List[LinkData]:
+
"""Extract all links from a blog entry."""
+
# Combine all text content for analysis
+
content_to_search.append(entry.content)
+
content_to_search.append(entry.summary)
+
for content in content_to_search:
+
extracted_links = self.extract_links_from_html(content, base_url)
+
for url, link_text in extracted_links:
+
if not url or url.startswith('#'):
+
links.append(link_data)
+
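
# Relative links are resolved against the entry's base URL with urllib.parse.urljoin,
# e.g. (hypothetical values):
#
#   urljoin("https://blog.example.com/posts/1", "../about")  -> "https://blog.example.com/about"
#   urljoin("https://blog.example.com/posts/1", "https://other.example.net/x")
#       -> "https://other.example.net/x"   # absolute URLs pass through unchanged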

# Command entry point; the function name and CLI registration are assumed here,
# and the default option values are placeholders.
def links(
    config_file: Optional[Path] = typer.Option(
        None,
        help="Path to configuration file",
    ),
    output_file: Optional[Path] = typer.Option(
        None,
        help="Path to output links file (default: links.json in git store)",
    ),
    mapping_file: Optional[Path] = typer.Option(
        None,
        help="Path to output URL <-> atom ID mapping file (default: url_mapping.json in git store)",
    ),
    verbose: bool = typer.Option(
        False,
        help="Show detailed progress information",
    ),
) -> None:
    """Extract and categorize all outbound links from blog entries.

    This command analyzes all blog entries to extract outbound links,
    resolve them properly with respect to the feed's base URL, and
    categorize them as internal, user, or unknown links.
    """
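    # Overall flow of this command (summarized from the steps below):
    #   1. Load config and open the Git store.
    #   2. Build a username -> {domains} map from each user's feeds and homepage.
    #   3. Extract links from every entry, resolve them against the feed's base URL,
    #      and categorize each as internal / user / unknown.
    #   4. Write all links (URL -> atom ID) to links.json and a filtered
    #      bidirectional mapping to url_mapping.json.
    #   5. Print a summary table (or TSV output when TSV mode is enabled).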
    try:
        config = load_config(config_file)
        git_store = GitStore(config.git_store)

        # Build user domain mapping
        console.print("Building user domain mapping...")
        user_domains: Dict[str, Set[str]] = {}
        index = git_store._load_index()

        for username, user_metadata in index.users.items():
            domains: Set[str] = set()

            # Add domains from feeds
            for feed_url in user_metadata.feeds:
                domain = urlparse(feed_url).netloc.lower()
                if domain:
                    domains.add(domain)

            # Add domain from homepage
            if user_metadata.homepage:
                domain = urlparse(str(user_metadata.homepage)).netloc.lower()
                if domain:
                    domains.add(domain)

            user_domains[username] = domains

        console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")
        # Initialize components
        link_extractor = LinkExtractor()
        categorizer = LinkCategorizer(user_domains)

        users = list(index.users.keys())
        if not users:
            console.print("[yellow]No users found in Git store[/yellow]")
            return

        all_links = []
        link_categories = {"internal": [], "user": [], "unknown": []}
        link_dict = {}  # Dictionary with link URL as key, maps to atom ID
        reverse_dict = {}  # Dictionary with atom ID as key, maps to list of URLs

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            # Count total entries first
            total_entries = 0
            counting_task = progress.add_task("Counting entries...", total=len(users))
            for username in users:
                entries = git_store.list_entries(username)
                total_entries += len(entries)
                progress.advance(counting_task)
            progress.remove_task(counting_task)
            processing_task = progress.add_task(
                f"Processing {total_entries} entries...",
                total=total_entries,
            )

            for username in users:
                entries = git_store.list_entries(username)
                user_metadata = index.users[username]

                # Get base URL for this user (use first feed URL)
                base_url = str(user_metadata.feeds[0]) if user_metadata.feeds else "https://example.com"

                for entry in entries:
                    # Extract links from this entry
                    entry_links = link_extractor.extract_links_from_entry(entry, username, base_url)

                    # Track unique links per entry
                    entry_urls_seen = set()

                    for link_data in entry_links:
                        # Skip if we've already seen this URL in this entry
                        if link_data.url in entry_urls_seen:
                            continue
                        entry_urls_seen.add(link_data.url)

                        category, target_username = categorizer.categorize_url(link_data.url, username)

                        # Add to link dictionary (URL as key, maps to atom ID only)
                        if link_data.url not in link_dict:
                            link_dict[link_data.url] = link_data.entry_id

                        # Also add to reverse mapping (atom ID -> list of URLs)
                        if link_data.entry_id not in reverse_dict:
                            reverse_dict[link_data.entry_id] = []
                        reverse_dict[link_data.entry_id].append(link_data.url)

                        # Add category info to link data for categories tracking
                        link_info = link_data.to_dict()
                        link_info["category"] = category
                        link_info["target_username"] = target_username

                        all_links.append(link_info)
                        link_categories[category].append(link_info)

                    progress.advance(processing_task)

                    if verbose and entry_links:
                        console.print(f" Found {len(entry_links)} links in {username}:{entry.title[:50]}...")
        # Determine output paths
        if output_file:
            output_path = output_file
        else:
            output_path = config.git_store / "links.json"

        if mapping_file:
            mapping_path = mapping_file
        else:
            mapping_path = config.git_store / "url_mapping.json"

        # Save all extracted links (not just filtered ones)
        console.print("Preparing output data...")
        # Build a set of all URLs that correspond to posts in the git database
        registered_urls = set()

        # Get all entries from all users and build URL mappings
        for username in users:
            entries = git_store.list_entries(username)
            user_metadata = index.users[username]

            for entry in entries:
                # Try to match entry URLs with extracted links
                if hasattr(entry, 'link') and entry.link:
                    registered_urls.add(entry.link)

                # Also check entry alternate links if they exist
                if hasattr(entry, 'links') and entry.links:
                    for link in entry.links:
                        if hasattr(link, 'href') and link.href:
                            registered_urls.add(link.href)
        # Create filtered version for URL mapping (only links to registered posts)
        filtered_link_dict = {}
        filtered_reverse_dict = {}

        for url, entry_id in link_dict.items():
            if url in registered_urls:
                filtered_link_dict[url] = entry_id

                # Also update reverse mapping
                if entry_id not in filtered_reverse_dict:
                    filtered_reverse_dict[entry_id] = []
                filtered_reverse_dict[entry_id].append(url)

        # Use all links for main output, not filtered ones
        output_data = link_dict

        console.print(f"Found {len(link_dict)} total links, {len(filtered_link_dict)} links to registered posts")
        # Save links data (URL -> atom ID mapping, all links)
        with open(output_path, "w") as f:
            json.dump(output_data, f, indent=2, default=str)

        # Save bidirectional mapping file (filtered)
        mapping_data = {
            "url_to_atom": filtered_link_dict,
            "atom_to_urls": filtered_reverse_dict,
        }
        with open(mapping_path, "w") as f:
            json.dump(mapping_data, f, indent=2, default=str)

        console.print("\n[green]✓ Links extraction completed successfully[/green]")
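
        # Expected output shapes (hypothetical values, for illustration only):
        #
        #   links.json       -> {"https://bob.example.net/post/2": "tag:alice.example.com,2024:entry-1", ...}
        #   url_mapping.json -> {"url_to_atom": {... same shape, registered posts only ...},
        #                        "atom_to_urls": {"tag:alice.example.com,2024:entry-1": ["https://...", ...]}}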
        # Create summary table or TSV output
        if get_tsv_mode():  # get_tsv_mode() is assumed to return True when TSV output is requested
            print("Category\tCount\tDescription")
            print(f"Internal\t{len(link_categories['internal'])}\tLinks to same user's domain")
            print(f"User\t{len(link_categories['user'])}\tLinks to other tracked users")
            print(f"Unknown\t{len(link_categories['unknown'])}\tLinks to external sites")
            print(f"Total Extracted\t{len(all_links)}\tAll extracted links")
            print(f"Saved to Output\t{len(output_data)}\tLinks saved to output file")
            print(f"Cross-references\t{len(filtered_link_dict)}\tLinks to registered posts only")
        else:
            table = Table(title="Links Summary")
            table.add_column("Category", style="cyan")
            table.add_column("Count", style="green")
            table.add_column("Description", style="white")

            table.add_row("Internal", str(len(link_categories["internal"])), "Links to same user's domain")
            table.add_row("User", str(len(link_categories["user"])), "Links to other tracked users")
            table.add_row("Unknown", str(len(link_categories["unknown"])), "Links to external sites")
            table.add_row("Total Extracted", str(len(all_links)), "All extracted links")
            table.add_row("Saved to Output", str(len(output_data)), "Links saved to output file")
            table.add_row("Cross-references", str(len(filtered_link_dict)), "Links to registered posts only")

            console.print(table)
        # Show user links if verbose
        if verbose and link_categories["user"]:
            # Count links per (source -> target) user pair
            user_link_counts = {}
            for link in link_categories["user"]:
                key = f"{link['username']} -> {link['target_username']}"
                user_link_counts[key] = user_link_counts.get(key, 0) + 1

            if get_tsv_mode():
                print("User Link Source\tUser Link Target\tLink Count")
                for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
                    source, target = link_pair.split(" -> ")
                    print(f"{source}\t{target}\t{count}")
            else:
                console.print("\n[bold]User-to-user links:[/bold]")
                for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
                    console.print(f"  {link_pair}: {count} links")
console.print(f"\nLinks output saved to: {output_path}")
+
console.print(f"URL mapping saved to: {mapping_path}")
+
console.print(f"[red]Error extracting links: {e}[/red]")
+
console.print_exception()
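
# Hypothetical invocation (the actual CLI entry point and command name depend on
# how this module is registered; shown only for orientation):
#
#   $ <cli> links --config path/to/config.toml --verbose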