···
# requires-python = ">=3.11"
# dependencies = ["feedparser", "beautifulsoup4"]
# ///
# Do not delete the above as it's needed for `uv run`
#!/usr/bin/env python3

import feedparser
import json
import sys

from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


def extract_links_from_html(html_content, base_url=None):
    """Extract and normalize links from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()

        # Skip empty links, anchors, javascript, and mailto
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            continue

        # Convert relative URLs to absolute if we have a base URL
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)

        links.append(href)

    return links
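
# Illustrative example of the helper above (the snippet and example.com URL are made up):
# extract_links_from_html('<a href="/post">x</a> <a href="#top">y</a>',
#                         base_url="https://example.com/blog/")
# returns ["https://example.com/post"]; the fragment-only link is skipped.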


def normalize_url(url):
    """Normalize URLs to consistently match them."""
    # Handle common URL shorteners or redirects (not implemented)
    parsed = urlparse(url)

    # Ensure scheme is consistent
    scheme = parsed.scheme.lower() or 'http'

    # Normalize netloc (lowercase, remove 'www.' prefix optionally)
    netloc = parsed.netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[4:]

    # Remove trailing slashes and index.html/index.php
    path = parsed.path.rstrip('/')
    for index_file in ['/index.html', '/index.php', '/index.htm']:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    # Remove common fragments and query parameters that don't affect content
    # (like tracking params, utm_*, etc.)
    query_parts = []
    for param in parsed.query.split('&'):
        if '=' in param:
            key, value = param.split('=', 1)
            if not key.startswith(('utm_', 'ref', 'source')):
                query_parts.append(f"{key}={value}")

    query = '&'.join(query_parts)

    # Remove common hash fragments
    fragment = parsed.fragment

    # Special case for common blogging platforms:
    # Medium, WordPress, Ghost, etc. may have specific URL patterns

    # Reconstruct the URL
    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized += f"?{query}"
    if fragment:
        normalized += f"#{fragment}"

    return normalized


def get_domain(url):
    """Extract domain from a URL."""
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Remove 'www.' prefix if present
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain
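
# For example (illustrative): get_domain("https://www.example.com/post/1") returns "example.com".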


def analyze_feed():
    # Parse the aggregated feed
    print("Parsing eeg.xml...", file=sys.stderr)
    feed_data = feedparser.parse("eeg.xml")

    # Add debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)

    if not feed_data or not hasattr(feed_data, 'entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries = []
    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: collect all entries and their URLs
    for entry in feed_data.entries:
        link = entry.get('link', '')
        if not link:
            continue

        # Normalize the entry URL to help with matching
        normalized_link = normalize_url(link)

        # Get the domain of the entry
        entry_domain = get_domain(link)

        # Get feed title (stored as category in the aggregated feed)
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Get description/content
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        # Extract all links from content, using the entry link as the base URL
        # for resolving relative URLs
        content_links = extract_links_from_html(content, base_url=link)

        entry_id = entry.get('id', link)
        entry_data = {
            'id': entry_id,
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'domain': entry_domain,
            'feed_title': feed_title,
            'content_links': content_links,
            'references': [],  # Will be filled in the second pass
            'referenced_by': [],  # Will be filled in the second pass
            'external_links': []  # Links to content outside the feed
        }

        all_entries.append(entry_data)
        entry_urls[normalized_link] = entry_data

    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

    # Second pass: analyze links between entries
    for entry in all_entries:
        # Keep track of references to avoid duplicates
        reference_ids = set()
        normalized_content_links = [normalize_url(link) for link in entry['content_links']]

        for i, normalized_link in enumerate(normalized_content_links):
            original_link = entry['content_links'][i] if i < len(entry['content_links']) else normalized_link

            # Check if this is a link to another entry in the feed
            if normalized_link in entry_urls and normalized_link != entry['normalized_link']:
                referenced_entry = entry_urls[normalized_link]

                # Avoid duplicate references
                if referenced_entry['id'] in reference_ids:
                    continue
                reference_ids.add(referenced_entry['id'])

                # Add to the references of the current entry
                entry['references'].append({
                    'id': referenced_entry['id'],
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True  # Mark as a reference to a post in the feed
                })

                # Add to the referenced_by of the referenced entry
                # Check if this entry is already in referenced_by
                already_referenced = any(ref['id'] == entry['id'] for ref in referenced_entry['referenced_by'])
                if not already_referenced:
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True  # Mark as a reference from a post in the feed
                    })
            elif normalized_link != entry['normalized_link']:
                # This is a link to something outside the feed.
                # Check if it's from the same domain as the entry
                link_domain = get_domain(original_link)

                # Only include external links from different domains
                if link_domain != entry['domain']:
                    # Track as an external link if not already in the list
                    if not any(ext_link['url'] == original_link for ext_link in entry['external_links']):
                        external_link = {
                            'url': original_link,
                            'normalized_url': normalized_link,
                            'in_feed': False  # Mark as external to the feed
                        }
                        entry['external_links'].append(external_link)

    # Create the thread data structure
    thread_data = {}
    for entry in all_entries:
        thread_data[entry['id']] = {
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links']
        }

    # Write the thread data to a JSON file
    with open('threads.json', 'w') as f:
        json.dump(thread_data, f, indent=2)
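
    # Resulting threads.json shape (illustrative sketch; keys mirror thread_data above):
    #   { "<entry id>": { "title": ..., "link": ..., "feed_title": ...,
    #                     "references": [...], "referenced_by": [...], "external_links": [...] } }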

    print("Thread data successfully written to threads.json", file=sys.stderr)

    # Generate some statistics
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)


if __name__ == "__main__":
    analyze_feed()
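
# Usage sketch (illustrative; the script filename is assumed, not given above):
#   uv run analyze_feed.py
# This reads the aggregated feed from eeg.xml in the working directory and
# writes the cross-reference map to threads.json alongside it.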