Atom feed for our EEG site
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "feedparser",
#     "beautifulsoup4",
#     "urllib3",
# ]
# ///
# Do not delete the block above, as it's needed for `uv run`.

import json
import sys
from urllib.parse import urlparse, urljoin

import feedparser
from bs4 import BeautifulSoup

def extract_links_from_html(html_content, base_url=None):
    """Extract and normalize links from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()

        # Skip empty links, anchors, javascript, and mailto
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            continue

        # Convert relative URLs to absolute if we have a base URL
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)

        links.append(href)

    return links

def normalize_url(url):
    """Normalize a URL so that equivalent links compare equal."""
    if not url:
        return ""

    # Handle common URL shorteners or redirects (not implemented)

    # Parse the URL
    parsed = urlparse(url)

    # Ensure the scheme is consistent
    scheme = parsed.scheme.lower() or 'http'

    # Normalize netloc (lowercase, strip any 'www.' prefix)
    netloc = parsed.netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[4:]

    # Remove trailing slashes and index.html/index.php/index.htm
    path = parsed.path.rstrip('/')
    for index_file in ['/index.html', '/index.php', '/index.htm']:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    # Drop query parameters that don't affect content
    # (tracking params such as utm_*, ref, source)
    query_parts = []
    if parsed.query:
        for param in parsed.query.split('&'):
            if '=' in param:
                key, value = param.split('=', 1)
                if not key.startswith(('utm_', 'ref', 'source')):
                    query_parts.append(f"{key}={value}")

    query = '&'.join(query_parts)

    # Drop hash fragments entirely
    fragment = ''

    # Special case for common blogging platforms:
    # Medium, WordPress, Ghost, etc. may have specific URL patterns

    # Reconstruct the URL
    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized += f"?{query}"
    if fragment:
        normalized += f"#{fragment}"

    return normalized

def get_domain(url):
    """Extract the domain from a URL."""
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Remove 'www.' prefix if present
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain

def analyze_feed():
    # Parse the aggregated feed
    print("Parsing eeg.xml...", file=sys.stderr)
    feed_data = feedparser.parse("eeg.xml")

    # Add debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)

    if not feed_data or not hasattr(feed_data, 'entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        return

    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries = []
    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: collect all entries and their URLs
    for entry in feed_data.entries:
        # Get link
        link = entry.get('link', '')
        if not link:
            continue

        # Normalize the entry URL to help with matching
        normalized_link = normalize_url(link)

        # Get the domain of the entry
        entry_domain = get_domain(link)

        # Get feed title (stored as category in the aggregated feed)
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Get description/content
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        # Extract all links from the content, using the entry link as the
        # base URL for resolving relative URLs
        content_links = extract_links_from_html(content, base_url=link)

        # Get unique ID
        entry_id = entry.get('id', link)

        entry_data = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'domain': entry_domain,
            'feed_title': feed_title,
            'id': entry_id,
            'content_links': content_links,
            'references': [],       # Will be filled in the second pass
            'referenced_by': [],    # Will be filled in the second pass
            'external_links': []    # Links to content outside the feed
        }

        all_entries.append(entry_data)
        entry_urls[normalized_link] = entry_data

    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

    # Second pass: analyze links between entries
    for entry in all_entries:
        # Keep track of references to avoid duplicates
        reference_ids = set()
        normalized_content_links = [normalize_url(link) for link in entry['content_links']]

        for i, normalized_link in enumerate(normalized_content_links):
            original_link = entry['content_links'][i] if i < len(entry['content_links']) else normalized_link

            # Check if this is a link to another entry in the feed
            if normalized_link in entry_urls and normalized_link != entry['normalized_link']:
                referenced_entry = entry_urls[normalized_link]

                # Avoid duplicate references
                if referenced_entry['id'] in reference_ids:
                    continue

                reference_ids.add(referenced_entry['id'])

                # Add to the references of the current entry
                entry['references'].append({
                    'id': referenced_entry['id'],
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True  # Mark as a reference to a post in the feed
                })

                # Add to the referenced_by of the referenced entry,
                # unless this entry is already in referenced_by
                already_referenced = any(ref['id'] == entry['id'] for ref in referenced_entry['referenced_by'])
                if not already_referenced:
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True  # Mark as a reference from a post in the feed
                    })
            elif normalized_link != entry['normalized_link']:
                # This is a link to something outside the feed.
                # Check if it's from the same domain as the entry.
                link_domain = get_domain(original_link)

                # Only include external links from different domains
                if link_domain != entry['domain']:
                    # Track as an external link if not already in the list
                    if not any(ext_link['url'] == original_link for ext_link in entry['external_links']):
                        external_link = {
                            'url': original_link,
                            'normalized_url': normalized_link,
                            'in_feed': False  # Mark as external to the feed
                        }
                        entry['external_links'].append(external_link)

    # Create the thread data structure
    thread_data = {}
    for entry in all_entries:
        thread_data[entry['id']] = {
            'id': entry['id'],
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links']
        }

    # Write the thread data to a JSON file
    with open('threads.json', 'w') as f:
        json.dump(thread_data, f, indent=2)

    print("Thread data successfully written to threads.json", file=sys.stderr)

    # Generate some statistics
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)

if __name__ == "__main__":
    analyze_feed()
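
Once the script has been run (for example with `uv run`, against an eeg.xml saved in the working directory), a quick way to sanity-check the result is to load the generated threads.json and list the entries that other feed entries link to most often. A minimal sketch, assuming the output file exists; the key names (`referenced_by`, `title`, `link`) match the thread_data structure written above, and the snippet is illustrative rather than part of the script:

import json

with open("threads.json") as f:
    threads = json.load(f)

# Sort thread entries by how many other entries in the feed reference them
most_referenced = sorted(threads.values(),
                         key=lambda t: len(t["referenced_by"]),
                         reverse=True)

for t in most_referenced[:5]:
    print(len(t["referenced_by"]), t["title"], t["link"])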