#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "feedparser",
#     "beautifulsoup4",
#     "urllib3",
# ]
# ///
# Do not delete the metadata block above; it's needed for `uv run`.

import json
import feedparser
import sys
import os
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin


def extract_links_from_html(html_content, base_url=None):
    """Extract and normalize links from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        # Skip empty links, anchors, javascript, and mailto
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            continue
        # Convert relative URLs to absolute if we have a base URL
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)
        links.append(href)
    return links


def normalize_url(url):
    """Normalize URLs so equivalent links match consistently."""
    if not url:
        return ""

    # Handle common URL shorteners or redirects (not implemented)

    # Parse the URL
    parsed = urlparse(url)

    # Ensure the scheme is consistent
    scheme = parsed.scheme.lower() or 'http'

    # Normalize netloc (lowercase, strip any 'www.' prefix)
    netloc = parsed.netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[4:]

    # Remove trailing slashes and index.html/index.php/index.htm
    path = parsed.path.rstrip('/')
    for index_file in ['/index.html', '/index.php', '/index.htm']:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    # Remove query parameters that don't affect content
    # (like tracking params, utm_*, etc.)
    query_parts = []
    if parsed.query:
        for param in parsed.query.split('&'):
            if '=' in param:
                key, value = param.split('=', 1)
                if not key.startswith(('utm_', 'ref', 'source')):
                    query_parts.append(f"{key}={value}")
    query = '&'.join(query_parts)

    # Drop hash fragments entirely
    fragment = ''

    # Special case for common blogging platforms: Medium, WordPress, Ghost, etc.
    # may have specific URL patterns (not handled here).

    # Reconstruct the URL
    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized += f"?{query}"
    if fragment:
        normalized += f"#{fragment}"
    return normalized

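# Example (hypothetical URL, for illustration only):
#   normalize_url("https://www.example.com/post/index.html?utm_source=rss")
#   returns "https://example.com/post": the "www." prefix, the index.* suffix,
#   and the utm_* tracking parameter are all stripped before links are compared.
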
def analyze_feed():
    # Parse the aggregated feed
    print("Parsing eeg.xml...", file=sys.stderr)
    feed_data = feedparser.parse("eeg.xml")

    # Add debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)

    if not feed_data or not hasattr(feed_data, 'entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        return

    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries = []
    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: collect all entries and their URLs
    for entry in feed_data.entries:
        # Get link
        link = entry.get('link', '')
        if not link:
            continue

        # Normalize the entry URL to help with matching
        normalized_link = normalize_url(link)

        # Get feed title (stored as category in the aggregated feed)
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Get description/content
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        # Extract all links from content, using the entry link as base URL
        # for resolving relative URLs
        content_links = extract_links_from_html(content, base_url=link)

        # Get unique ID
        entry_id = entry.get('id', link)

        entry_data = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'feed_title': feed_title,
            'id': entry_id,
            'content_links': content_links,
            'references': [],      # Will be filled in the second pass
            'referenced_by': [],   # Will be filled in the second pass
            'external_links': []   # Links to content outside the feed
        }
        all_entries.append(entry_data)
        entry_urls[normalized_link] = entry_data

    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

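    # At this point entry_urls maps each entry's normalized URL to its entry_data
    # dict; the second pass below uses it to resolve every link found in a post
    # body to either another post in the feed (an internal reference) or an
    # external link.
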
    # Second pass: analyze links between entries
    for entry in all_entries:
        # Keep track of references to avoid duplicates
        reference_ids = set()
        normalized_content_links = [normalize_url(link) for link in entry['content_links']]

        for original_link, normalized_link in zip(entry['content_links'], normalized_content_links):
            # Ignore links back to the entry itself
            if normalized_link == entry['normalized_link']:
                continue

            if normalized_link in entry_urls:
                # This entry links to another entry in the feed
                referenced_entry = entry_urls[normalized_link]

                # Avoid duplicate references
                if referenced_entry['id'] in reference_ids:
                    continue
                reference_ids.add(referenced_entry['id'])

                # Add to the references of the current entry
                entry['references'].append({
                    'id': referenced_entry['id'],
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True  # Mark as a reference to a post in the feed
                })

                # Add to the referenced_by of the referenced entry,
                # unless this entry is already listed there
                already_referenced = any(ref['id'] == entry['id']
                                         for ref in referenced_entry['referenced_by'])
                if not already_referenced:
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True  # Mark as a reference from a post in the feed
                    })
            else:
                # This is a link to something outside the feed; track it as external
                if not any(ext_link['url'] == original_link for ext_link in entry['external_links']):
                    entry['external_links'].append({
                        'url': original_link,
                        'normalized_url': normalized_link,
                        'in_feed': False  # Mark as external to the feed
                    })

    # Create the thread data structure
    thread_data = {}
    for entry in all_entries:
        thread_data[entry['id']] = {
            'id': entry['id'],
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links']
        }

    # Write the thread data to a JSON file
    with open('threads.json', 'w') as f:
        json.dump(thread_data, f, indent=2)
    print("Thread data successfully written to threads.json", file=sys.stderr)

    # Generate some statistics
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)


if __name__ == "__main__":
    analyze_feed()
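# Usage sketch (assumes this file is saved as, e.g., analyze_threads.py in the
# same directory as the aggregated feed eeg.xml; the script filename is
# illustrative, not fixed by the code):
#   uv run analyze_threads.py
# uv reads the inline metadata block at the top to provision the dependencies;
# the thread map is written to threads.json and statistics go to stderr.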