"""CLI command for extracting and categorizing all outbound links from blog entries."""

import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import typer
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from rich.table import Table

from ...core.git_store import GitStore
from ..main import app
from ..utils import load_config, get_tsv_mode

console = Console()


class LinkData:
    """Represents a link found in a blog entry."""

    def __init__(self, url: str, entry_id: str, username: str):
        self.url = url
        self.entry_id = entry_id
        self.username = username

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "url": self.url,
            "entry_id": self.entry_id,
            "username": self.username,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "LinkData":
        """Create from dictionary."""
        return cls(
            url=data["url"],
            entry_id=data["entry_id"],
            username=data["username"],
        )
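
# to_dict/from_dict are symmetric, so a LinkData instance survives a JSON
# round trip, e.g. LinkData.from_dict(json.loads(json.dumps(link.to_dict()))).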


class LinkCategorizer:
    """Categorizes links as internal, user, or unknown."""

    def __init__(self, user_domains: Dict[str, Set[str]]):
        self.user_domains = user_domains
        # Create reverse mapping of domain -> username
        self.domain_to_user = {}
        for username, domains in user_domains.items():
            for domain in domains:
                self.domain_to_user[domain] = username
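        # Note: if two users ever list the same domain, the user processed
        # last wins the reverse-mapping slot.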

    def categorize_url(self, url: str, source_username: str) -> tuple[str, Optional[str]]:
        """
        Categorize a URL as 'internal', 'user', or 'unknown'.
        Returns (category, target_username).
        """
        try:
            parsed = urlparse(url)
            domain = parsed.netloc.lower()

            # Check if it's a link to the same user's domain (internal)
            if domain in self.user_domains.get(source_username, set()):
                return "internal", source_username

            # Check if it's a link to another user's domain
            if domain in self.domain_to_user:
                return "user", self.domain_to_user[domain]

            # Everything else is unknown
            return "unknown", None
        except Exception:
            return "unknown", None


class LinkExtractor:
    """Extracts and resolves links from blog entries."""

    def __init__(self):
        # Pattern for extracting links from HTML
        self.link_pattern = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
        self.url_pattern = re.compile(r'https?://[^\s<>"]+')
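        # Best-effort extraction: link_pattern only matches double-quoted
        # href attributes, and url_pattern picks up bare http(s) URLs in
        # text; a full HTML parser would be more robust.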

    def extract_links_from_html(self, html_content: str, base_url: str) -> List[tuple[str, str]]:
        """Extract all links from HTML content and resolve them against base URL."""
        links = []

        # Extract links from <a> tags
        for match in self.link_pattern.finditer(html_content):
            url = match.group(1)
            text = re.sub(r'<[^>]+>', '', match.group(2)).strip()  # Remove HTML tags from link text

            # Resolve relative URLs against base URL
            resolved_url = urljoin(base_url, url)
            links.append((resolved_url, text))

        return links

    def extract_links_from_entry(self, entry, username: str, base_url: str) -> List[LinkData]:
        """Extract all links from a blog entry."""
        links = []

        # Combine all text content for analysis
        content_to_search = []
        if entry.content:
            content_to_search.append(entry.content)
        if entry.summary:
            content_to_search.append(entry.summary)

        for content in content_to_search:
            extracted_links = self.extract_links_from_html(content, base_url)

            for url, link_text in extracted_links:
                # Skip empty URLs and same-page fragment links
                if not url or url.startswith('#'):
                    continue

                link_data = LinkData(
                    url=url,
                    entry_id=entry.id,
                    username=username,
                )

                links.append(link_data)

        return links
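
# Illustrative usage (hypothetical entry object with content/summary/id):
#
#   extractor = LinkExtractor()
#   found = extractor.extract_links_from_entry(entry, "alice", "https://alice.example.org/atom.xml")
#   [link.url for link in found]  # absolute URLs; fragment-only links skipped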


@app.command()
def links(  # command name assumed from the module's purpose
    config_file: Optional[Path] = typer.Option(
        Path("thicket.yaml"),
        help="Path to configuration file",
    ),
    output_file: Optional[Path] = typer.Option(
        None,
        help="Path to output links file (default: links.json in git store)",
    ),
    mapping_file: Optional[Path] = typer.Option(
        None,
        help="Path to output URL <-> atom ID mapping file (default: url_mapping.json in git store)",
    ),
    verbose: bool = typer.Option(
        False,
        help="Show detailed progress information",
    ),
) -> None:
    """Extract and categorize all outbound links from blog entries.

    This command analyzes all blog entries to extract outbound links,
    resolve them properly with respect to the feed's base URL, and
    categorize them as internal, user, or unknown links.
    """
    try:
        # Load configuration
        config = load_config(config_file)

        # Initialize Git store
        git_store = GitStore(config.git_store)

        # Build user domain mapping
        console.print("Building user domain mapping...")

        index = git_store._load_index()

        user_domains = {}
        for username, user_metadata in index.users.items():
            domains = set()

            # Add domains from feeds
            for feed_url in user_metadata.feeds:
                domain = urlparse(feed_url).netloc.lower()
                if domain:
                    domains.add(domain)

            # Add domain from homepage
            if user_metadata.homepage:
                domain = urlparse(str(user_metadata.homepage)).netloc.lower()
                if domain:
                    domains.add(domain)

            user_domains[username] = domains

        console.print(f"Found {len(user_domains)} users with {sum(len(d) for d in user_domains.values())} total domains")

        # Initialize components
        link_extractor = LinkExtractor()
        categorizer = LinkCategorizer(user_domains)

        users = list(index.users.keys())
        if not users:
            console.print("[yellow]No users found in Git store[/yellow]")
            raise typer.Exit(0)

        # Process all entries
        all_links = []
        link_categories = {"internal": [], "user": [], "unknown": []}
        link_dict = {}  # Dictionary with link URL as key, maps to atom ID
        reverse_dict = {}  # Dictionary with atom ID as key, maps to list of URLs

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            # Count total entries first
            counting_task = progress.add_task("Counting entries...", total=len(users))

            total_entries = 0
            for username in users:
                entries = git_store.list_entries(username)
                total_entries += len(entries)
                progress.advance(counting_task)

            progress.remove_task(counting_task)

            processing_task = progress.add_task(
                f"Processing {total_entries} entries...",
                total=total_entries,
            )

            for username in users:
                entries = git_store.list_entries(username)
                user_metadata = index.users[username]

                # Get base URL for this user (use first feed URL)
                base_url = str(user_metadata.feeds[0]) if user_metadata.feeds else "https://example.com"
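
                # Caveat: entry HTML is resolved against the feed URL, which
                # may differ from the entry's own canonical base URL.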
                for entry in entries:
                    # Extract links from this entry
                    entry_links = link_extractor.extract_links_from_entry(entry, username, base_url)

                    # Track unique links per entry
                    entry_urls_seen = set()

                    # Categorize each link
                    for link_data in entry_links:
                        # Skip if we've already seen this URL in this entry
                        if link_data.url in entry_urls_seen:
                            continue
                        entry_urls_seen.add(link_data.url)

                        category, target_username = categorizer.categorize_url(link_data.url, username)

                        # Add to link dictionary (URL as key, maps to atom ID only)
                        if link_data.url not in link_dict:
                            link_dict[link_data.url] = link_data.entry_id

                        # Also add to reverse mapping (atom ID -> list of URLs)
                        if link_data.entry_id not in reverse_dict:
                            reverse_dict[link_data.entry_id] = []
                        reverse_dict[link_data.entry_id].append(link_data.url)
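                        # link_dict keeps only the first entry seen for each
                        # URL, while reverse_dict records the URL under every
                        # entry that links to it.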

                        # Add category info to link data for categories tracking
                        link_info = link_data.to_dict()
                        link_info["category"] = category
                        link_info["target_username"] = target_username

                        all_links.append(link_info)
                        link_categories[category].append(link_info)

                    progress.advance(processing_task)

                    if verbose and entry_links:
                        console.print(f"  Found {len(entry_links)} links in {username}:{entry.title[:50]}...")

        # Determine output paths
        output_path = output_file
        if output_path is None:
            output_path = config.git_store / "links.json"

        mapping_path = mapping_file
        if mapping_path is None:
            mapping_path = config.git_store / "url_mapping.json"

        # Save all extracted links (not just filtered ones)
        console.print("Preparing output data...")

        # Build a set of all URLs that correspond to posts in the git database
        registered_urls = set()

        # Get all entries from all users and build URL mappings
        for username in users:
            entries = git_store.list_entries(username)
            user_metadata = index.users[username]

            for entry in entries:
                # Try to match entry URLs with extracted links
                if hasattr(entry, 'link') and entry.link:
                    registered_urls.add(entry.link)

                # Also check entry alternate links if they exist
                if hasattr(entry, 'links') and entry.links:
                    for link in entry.links:
                        if hasattr(link, 'href') and link.href:
                            registered_urls.add(link.href)

        # Create filtered version for URL mapping (only links to registered posts)
        filtered_link_dict = {}
        filtered_reverse_dict = {}

        for url, entry_id in link_dict.items():
            if url in registered_urls:
                filtered_link_dict[url] = entry_id

                # Also update reverse mapping
                if entry_id not in filtered_reverse_dict:
                    filtered_reverse_dict[entry_id] = []
                filtered_reverse_dict[entry_id].append(url)

        # Use all links for main output, not filtered ones
        output_data = link_dict

        console.print(f"Found {len(link_dict)} total links, {len(filtered_link_dict)} links to registered posts")

        # Save links data (URL -> atom ID mapping, all links)
        with open(output_path, "w") as f:
            json.dump(output_data, f, indent=2, default=str)

        # Save bidirectional mapping file (filtered)
        mapping_data = {
            "url_to_atom": filtered_link_dict,
            "atom_to_urls": filtered_reverse_dict,
        }
        with open(mapping_path, "w") as f:
            json.dump(mapping_data, f, indent=2, default=str)
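
        # The mapping file ends up shaped like (illustrative values):
        #   {"url_to_atom": {"https://...": "tag:..."},
        #    "atom_to_urls": {"tag:...": ["https://..."]}}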

        if not get_tsv_mode():
            console.print("\n[green]✓ Links extraction completed successfully[/green]")

        # Create summary table or TSV output
        if get_tsv_mode():
            print("Category\tCount\tDescription")
            print(f"Internal\t{len(link_categories['internal'])}\tLinks to same user's domain")
            print(f"User\t{len(link_categories['user'])}\tLinks to other tracked users")
            print(f"Unknown\t{len(link_categories['unknown'])}\tLinks to external sites")
            print(f"Total Extracted\t{len(all_links)}\tAll extracted links")
            print(f"Saved to Output\t{len(output_data)}\tLinks saved to output file")
            print(f"Cross-references\t{len(filtered_link_dict)}\tLinks to registered posts only")
        else:
            table = Table(title="Links Summary")
            table.add_column("Category", style="cyan")
            table.add_column("Count", style="green")
            table.add_column("Description", style="white")

            table.add_row("Internal", str(len(link_categories["internal"])), "Links to same user's domain")
            table.add_row("User", str(len(link_categories["user"])), "Links to other tracked users")
            table.add_row("Unknown", str(len(link_categories["unknown"])), "Links to external sites")
            table.add_row("Total Extracted", str(len(all_links)), "All extracted links")
            table.add_row("Saved to Output", str(len(output_data)), "Links saved to output file")
            table.add_row("Cross-references", str(len(filtered_link_dict)), "Links to registered posts only")

            console.print(table)

        # Show user links if verbose
        if verbose and link_categories["user"]:
            if get_tsv_mode():
                print("User Link Source\tUser Link Target\tLink Count")
                user_link_counts = {}

                for link in link_categories["user"]:
                    key = f"{link['username']} -> {link['target_username']}"
                    user_link_counts[key] = user_link_counts.get(key, 0) + 1

                for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
                    source, target = link_pair.split(" -> ")
                    print(f"{source}\t{target}\t{count}")
            else:
                console.print("\n[bold]User-to-user links:[/bold]")
                user_link_counts = {}

                for link in link_categories["user"]:
                    key = f"{link['username']} -> {link['target_username']}"
                    user_link_counts[key] = user_link_counts.get(key, 0) + 1

                for link_pair, count in sorted(user_link_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
                    console.print(f"  {link_pair}: {count} links")

        if not get_tsv_mode():
            console.print(f"\nLinks output saved to: {output_path}")
            console.print(f"URL mapping saved to: {mapping_path}")

    except Exception as e:
        console.print(f"[red]Error extracting links: {e}[/red]")
        if verbose:
            console.print_exception()
        raise typer.Exit(1)