Atom feed for our EEG site
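The script below parses the aggregated Atom feed (eeg.xml), pulls every link out of each entry's HTML content, and works out which entries reference other entries in the feed and which point at external sites. The results are written to threads.json, keyed by entry ID, with some summary statistics printed to stderr. The inline metadata at the top lets it be executed directly with `uv run`.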
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "feedparser",
#     "beautifulsoup4",
#     "urllib3",
# ]
# ///
# Do not delete the above as it's needed for `uv run`

import json
import feedparser
import sys
import os
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin

def extract_links_from_html(html_content, base_url=None):
    """Extract and normalize links from HTML content"""
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()

        # Skip empty links, anchors, javascript, and mailto
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            continue

        # Convert relative URLs to absolute if we have a base URL
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)

        links.append(href)

    return links

def normalize_url(url):
    """Normalize URLs to consistently match them"""
    if not url:
        return ""

    # Handle common URL shorteners or redirects (not implemented)

    # Parse the URL
    parsed = urlparse(url)

    # Ensure scheme is consistent
    scheme = parsed.scheme.lower() or 'http'

    # Normalize netloc (lowercase, remove 'www.' prefix optionally)
    netloc = parsed.netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[4:]

    # Remove trailing slashes and index.html/index.php
    path = parsed.path.rstrip('/')
    for index_file in ['/index.html', '/index.php', '/index.htm']:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    # Remove common fragments and query parameters that don't affect content
    # (like tracking params, utm_*, etc.)
    query_parts = []
    if parsed.query:
        for param in parsed.query.split('&'):
            if '=' in param:
                key, value = param.split('=', 1)
                if not key.startswith(('utm_', 'ref', 'source')):
                    query_parts.append(f"{key}={value}")

    query = '&'.join(query_parts)

    # Remove common hash fragments
    fragment = ''

    # Special case for common blogging platforms
    # Medium, WordPress, Ghost, etc. may have specific URL patterns

    # Reconstruct the URL
    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized += f"?{query}"
    if fragment:
        normalized += f"#{fragment}"

    return normalized

def get_domain(url):
    """Extract domain from a URL"""
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Remove 'www.' prefix if present
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain

def analyze_feed():
    # Parse the aggregated feed
    print(f"Parsing eeg.xml...", file=sys.stderr)
    feed_data = feedparser.parse("eeg.xml")

    # Add debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)

    if not feed_data or not hasattr(feed_data, 'entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        return

    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries = []
    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: collect all entries and their URLs
    for entry in feed_data.entries:
        # Get link
        link = entry.get('link', '')
        if not link:
            continue

        # Normalize the entry URL to help with matching
        normalized_link = normalize_url(link)

        # Get the domain of the entry
        entry_domain = get_domain(link)

        # Get feed title (stored as category in the aggregated feed)
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Get description/content
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        # Extract all links from content, using the entry link as base URL for resolving relative URLs
        content_links = extract_links_from_html(content, base_url=link)

        # Get unique ID
        entry_id = entry.get('id', link)

        entry_data = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'domain': entry_domain,
            'feed_title': feed_title,
            'id': entry_id,
            'content_links': content_links,
            'references': [],      # Will be filled in the second pass
            'referenced_by': [],   # Will be filled in the second pass
            'external_links': []   # Links to content outside the feed
        }

        all_entries.append(entry_data)
        entry_urls[normalized_link] = entry_data

    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

    # Second pass: analyze links between entries
    for entry in all_entries:
        # Keep track of references to avoid duplicates
        reference_ids = set()
        normalized_content_links = [normalize_url(link) for link in entry['content_links']]

        for i, normalized_link in enumerate(normalized_content_links):
            original_link = entry['content_links'][i] if i < len(entry['content_links']) else normalized_link

            # Check if this is a link to another entry in the feed
            if normalized_link in entry_urls and normalized_link != entry['normalized_link']:
                referenced_entry = entry_urls[normalized_link]

                # Avoid duplicate references
                if referenced_entry['id'] in reference_ids:
                    continue

                reference_ids.add(referenced_entry['id'])

                # Add to the references of the current entry
                entry['references'].append({
                    'id': referenced_entry['id'],
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True  # Mark as a reference to a post in the feed
                })

                # Add to the referenced_by of the referenced entry
                # Check if this entry is already in referenced_by
                already_referenced = any(ref['id'] == entry['id'] for ref in referenced_entry['referenced_by'])
                if not already_referenced:
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True  # Mark as a reference from a post in the feed
                    })
            elif normalized_link != entry['normalized_link']:
                # This is a link to something outside the feed
                # Check if it's from the same domain as the entry
                link_domain = get_domain(original_link)

                # Only include external links from different domains
                if link_domain != entry['domain']:
                    # Track as an external link if not already in the list
                    if not any(ext_link['url'] == original_link for ext_link in entry['external_links']):
                        external_link = {
                            'url': original_link,
                            'normalized_url': normalized_link,
                            'in_feed': False  # Mark as external to the feed
                        }
                        entry['external_links'].append(external_link)

    # Create the thread data structure
    thread_data = {}
    for entry in all_entries:
        thread_data[entry['id']] = {
            'id': entry['id'],
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links']
        }

    # Write the thread data to a JSON file
    with open('threads.json', 'w') as f:
        json.dump(thread_data, f, indent=2)

    print(f"Thread data successfully written to threads.json", file=sys.stderr)

    # Generate some statistics
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print(f"\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)

if __name__ == "__main__":
    analyze_feed()
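To run it, save the script as a standalone file next to eeg.xml and invoke it with uv, which reads the inline metadata block and installs the dependencies for you (the filename here is only an example, not something fixed by the script):

    uv run analyze_feed.py

Each value in the resulting threads.json follows the shape assembled in analyze_feed(); the entry below is an illustrative sketch with placeholder IDs and URLs, not real feed data:

    {
      "id": "https://example.org/posts/alpha",
      "title": "Alpha waves revisited",
      "link": "https://example.org/posts/alpha",
      "feed_title": "Example EEG blog",
      "references": [],
      "referenced_by": [],
      "external_links": []
    }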