Atom feed for our EEG site

+223 -3
aggregate_feeds.py
···
# "feedgenerator",
# "requests",
# "beautifulsoup4",
# ]
# ///
# Do not delete the above as it's needed for `uv run`
···
import re
from html import unescape
from bs4 import BeautifulSoup
def load_feed_urls(file_path):
with open(file_path, 'r') as f:
···
# Get link
link = entry.get('link', '')
-
# Get description/content
if hasattr(entry, 'content') and entry.content:
content = entry.content[0].value
else:
content = entry.get('summary', '')
-
# Create HTML preview that will be used as the content
preview = create_html_preview(content)
···
all_entries.append({
'title': title,
'link': link,
-
'content': content,
'preview': preview,
'author': author_name,
'pub_date': pub_date,
···
return feed
def main():
# Load feed URLs
feed_urls = load_feed_urls('feed.json')
···
feed.write(f, 'utf-8')
print(f"Feed successfully written to eeg.xml", file=sys.stderr)
if __name__ == "__main__":
main()
···
# "feedgenerator",
# "requests",
# "beautifulsoup4",
+
# "urllib3",
# ]
# ///
# Do not delete the above as it's needed for `uv run`
···
import re
from html import unescape
from bs4 import BeautifulSoup
+
from urllib.parse import urlparse, urljoin
def load_feed_urls(file_path):
with open(file_path, 'r') as f:
···
# Get link
link = entry.get('link', '')
+
# Get full content from the feed entry
if hasattr(entry, 'content') and entry.content:
content = entry.content[0].value
else:
content = entry.get('summary', '')
+
# Create HTML preview that will be used as the content
preview = create_html_preview(content)
···
all_entries.append({
'title': title,
'link': link,
+
'content': content, # Use the feed content directly
'preview': preview,
'author': author_name,
'pub_date': pub_date,
···
return feed
+
# Functions from make_threads.py
+
+
def extract_links_from_html(html_content, base_url=None):
+
"""Extract and normalize links from HTML content"""
+
soup = BeautifulSoup(html_content, 'html.parser')
+
links = []
+
+
for a_tag in soup.find_all('a', href=True):
+
href = a_tag['href'].strip()
+
+
# Skip empty links, anchors, javascript, and mailto
+
if not href or href.startswith(('#', 'javascript:', 'mailto:')):
+
continue
+
+
# Convert relative URLs to absolute if we have a base URL
+
if base_url and not href.startswith(('http://', 'https://')):
+
href = urljoin(base_url, href)
+
+
links.append(href)
+
+
return links
+
+
def normalize_url(url):
+
"""Normalize URLs to consistently match them"""
+
if not url:
+
return ""
+
+
# Handle common URL shorteners or redirects (not implemented)
+
+
# Parse the URL
+
parsed = urlparse(url)
+
+
# Ensure scheme is consistent
+
scheme = parsed.scheme.lower() or 'http'
+
+
# Normalize netloc (lowercase, remove 'www.' prefix optionally)
+
netloc = parsed.netloc.lower()
+
if netloc.startswith('www.'):
+
netloc = netloc[4:]
+
+
# Remove trailing slashes and index.html/index.php
+
path = parsed.path.rstrip('/')
+
for index_file in ['/index.html', '/index.php', '/index.htm']:
+
if path.endswith(index_file):
+
path = path[:-len(index_file)]
+
+
# Remove common fragments and query parameters that don't affect content
+
# (like tracking params, utm_*, etc.)
+
query_parts = []
+
if parsed.query:
+
for param in parsed.query.split('&'):
+
if '=' in param:
+
key, value = param.split('=', 1)
+
if not key.startswith(('utm_', 'ref', 'source')):
+
query_parts.append(f"{key}={value}")
+
+
query = '&'.join(query_parts)
+
+
# Remove common hash fragments
+
fragment = ''
+
+
# Special case for common blogging platforms
+
# Medium, WordPress, Ghost, etc. may have specific URL patterns
+
+
# Reconstruct the URL
+
normalized = f"{scheme}://{netloc}{path}"
+
if query:
+
normalized += f"?{query}"
+
if fragment:
+
normalized += f"#{fragment}"
+
+
return normalized
+
+
def get_domain(url):
+
"""Extract domain from a URL"""
+
parsed = urlparse(url)
+
domain = parsed.netloc.lower()
+
# Remove 'www.' prefix if present
+
if domain.startswith('www.'):
+
domain = domain[4:]
+
return domain
+
+
def generate_threads(entries):
+
"""Generate thread data from the entries"""
+
print(f"Generating thread data from {len(entries)} entries...", file=sys.stderr)
+
+
entry_urls = {} # Maps normalized URLs to entry data
+
+
# First pass: collect all entries and their URLs
+
for entry in entries:
+
# Get link
+
link = entry['link']
+
if not link:
+
continue
+
+
# Normalize the entry URL to help with matching
+
normalized_link = normalize_url(link)
+
+
# Get the domain of the entry
+
entry_domain = get_domain(link)
+
+
# Use the feed content to extract links
+
content_to_extract = entry['content']
+
+
# Extract all links from content, using the entry link as base URL for resolving relative URLs
+
content_links = extract_links_from_html(content_to_extract, base_url=link)
+
+
entry_data = {
+
'title': entry['title'],
+
'link': link,
+
'normalized_link': normalized_link,
+
'domain': entry_domain,
+
'feed_title': entry['feed_title'],
+
'id': entry['id'],
+
'content_links': content_links,
+
'references': [], # Will be filled in the second pass
+
'referenced_by': [], # Will be filled in the second pass
+
'external_links': [] # Links to content outside the feed
+
}
+
+
entry_urls[normalized_link] = entry_data
+
+
print(f"Extracted links from all entries", file=sys.stderr)
+
+
# Second pass: analyze links between entries
+
for _, entry_data in entry_urls.items():
+
# Keep track of references to avoid duplicates
+
reference_ids = set()
+
normalized_content_links = [normalize_url(link) for link in entry_data['content_links']]
+
+
for i, normalized_link in enumerate(normalized_content_links):
+
original_link = entry_data['content_links'][i] if i < len(entry_data['content_links']) else normalized_link
+
+
# Check if this is a link to another entry in the feed
+
if normalized_link in entry_urls and normalized_link != entry_data['normalized_link']:
+
referenced_entry = entry_urls[normalized_link]
+
+
# Avoid duplicate references
+
if referenced_entry['id'] in reference_ids:
+
continue
+
+
reference_ids.add(referenced_entry['id'])
+
+
# Add to the references of the current entry
+
entry_data['references'].append({
+
'id': referenced_entry['id'],
+
'link': referenced_entry['link'],
+
'title': referenced_entry['title'],
+
'feed_title': referenced_entry['feed_title'],
+
'in_feed': True # Mark as a reference to a post in the feed
+
})
+
+
# Add to the referenced_by of the referenced entry
+
# Check if this entry is already in referenced_by
+
already_referenced = any(ref['id'] == entry_data['id'] for ref in referenced_entry['referenced_by'])
+
if not already_referenced:
+
referenced_entry['referenced_by'].append({
+
'id': entry_data['id'],
+
'link': entry_data['link'],
+
'title': entry_data['title'],
+
'feed_title': entry_data['feed_title'],
+
'in_feed': True # Mark as a reference from a post in the feed
+
})
+
elif normalized_link != entry_data['normalized_link']:
+
# This is a link to something outside the feed
+
# Check if it's from the same domain as the entry
+
link_domain = get_domain(original_link)
+
+
# Only include external links from different domains
+
if link_domain != entry_data['domain']:
+
# Track as an external link if not already in the list
+
if not any(ext_link['url'] == original_link for ext_link in entry_data['external_links']):
+
external_link = {
+
'url': original_link,
+
'normalized_url': normalized_link,
+
'in_feed': False # Mark as external to the feed
+
}
+
entry_data['external_links'].append(external_link)
+
+
# Create the thread data structure
+
thread_data = {}
+
for _, entry_data in entry_urls.items():
+
thread_data[entry_data['id']] = {
+
'id': entry_data['id'],
+
'title': entry_data['title'],
+
'link': entry_data['link'],
+
'feed_title': entry_data['feed_title'],
+
'references': entry_data['references'],
+
'referenced_by': entry_data['referenced_by'],
+
'external_links': entry_data['external_links']
+
}
+
+
# Generate some statistics
+
entries_with_references = sum(1 for entry_data in entry_urls.values() if entry_data['references'])
+
entries_with_referenced_by = sum(1 for entry_data in entry_urls.values() if entry_data['referenced_by'])
+
entries_with_external_links = sum(1 for entry_data in entry_urls.values() if entry_data['external_links'])
+
total_internal_references = sum(len(entry_data['references']) for entry_data in entry_urls.values())
+
total_external_links = sum(len(entry_data['external_links']) for entry_data in entry_urls.values())
+
+
print(f"\nThread Analysis:", file=sys.stderr)
+
print(f"Total entries: {len(entry_urls)}", file=sys.stderr)
+
print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
+
print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
+
print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
+
print(f"Total internal references: {total_internal_references}", file=sys.stderr)
+
print(f"Total external links: {total_external_links}", file=sys.stderr)
+
+
return thread_data
+
def main():
# Load feed URLs
feed_urls = load_feed_urls('feed.json')
···
feed.write(f, 'utf-8')
print(f"Feed successfully written to eeg.xml", file=sys.stderr)
+
+
# Generate thread data
+
thread_data = generate_threads(entries)
+
+
# Write the thread data to a JSON file
+
with open('threads.json', 'w') as f:
+
json.dump(thread_data, f, indent=2)
+
+
print(f"Thread data successfully written to threads.json", file=sys.stderr)
if __name__ == "__main__":
main()
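
Usage note (a minimal sketch, not part of the change): after the script is run, presumably with `uv run aggregate_feeds.py`, it writes eeg.xml and threads.json. The consumer below is hypothetical, but the field names match the structure written in the diff above.

import json

# Load the thread data written by aggregate_feeds.py
with open('threads.json') as f:
    threads = json.load(f)

# Print every entry that takes part in a thread, i.e. one that links to
# or is linked from another post in the aggregated feed.
for entry in threads.values():
    if entry['references'] or entry['referenced_by']:
        print(entry['title'], '-', entry['link'])
        for ref in entry['references']:
            print('  references:', ref['title'], '(' + ref['feed_title'] + ')')
        for ref in entry['referenced_by']:
            print('  referenced by:', ref['title'], '(' + ref['feed_title'] + ')')
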
-256
make_threads.py
···
-
# /// script
-
# requires-python = ">=3.11"
-
# dependencies = [
-
# "feedparser",
-
# "beautifulsoup4",
-
# "urllib3",
-
# ]
-
# ///
-
# Do not delete the above as it's needed for `uv run`
-
#!/usr/bin/env python3
-
-
import json
-
import feedparser
-
import sys
-
import os
-
from bs4 import BeautifulSoup
-
import re
-
from urllib.parse import urlparse, urljoin
-
-
def extract_links_from_html(html_content, base_url=None):
-
"""Extract and normalize links from HTML content"""
-
soup = BeautifulSoup(html_content, 'html.parser')
-
links = []
-
-
for a_tag in soup.find_all('a', href=True):
-
href = a_tag['href'].strip()
-
-
# Skip empty links, anchors, javascript, and mailto
-
if not href or href.startswith(('#', 'javascript:', 'mailto:')):
-
continue
-
-
# Convert relative URLs to absolute if we have a base URL
-
if base_url and not href.startswith(('http://', 'https://')):
-
href = urljoin(base_url, href)
-
-
links.append(href)
-
-
return links
-
-
def normalize_url(url):
-
"""Normalize URLs to consistently match them"""
-
if not url:
-
return ""
-
-
# Handle common URL shorteners or redirects (not implemented)
-
-
# Parse the URL
-
parsed = urlparse(url)
-
-
# Ensure scheme is consistent
-
scheme = parsed.scheme.lower() or 'http'
-
-
# Normalize netloc (lowercase, remove 'www.' prefix optionally)
-
netloc = parsed.netloc.lower()
-
if netloc.startswith('www.'):
-
netloc = netloc[4:]
-
-
# Remove trailing slashes and index.html/index.php
-
path = parsed.path.rstrip('/')
-
for index_file in ['/index.html', '/index.php', '/index.htm']:
-
if path.endswith(index_file):
-
path = path[:-len(index_file)]
-
-
# Remove common fragments and query parameters that don't affect content
-
# (like tracking params, utm_*, etc.)
-
query_parts = []
-
if parsed.query:
-
for param in parsed.query.split('&'):
-
if '=' in param:
-
key, value = param.split('=', 1)
-
if not key.startswith(('utm_', 'ref', 'source')):
-
query_parts.append(f"{key}={value}")
-
-
query = '&'.join(query_parts)
-
-
# Remove common hash fragments
-
fragment = ''
-
-
# Special case for common blogging platforms
-
# Medium, WordPress, Ghost, etc. may have specific URL patterns
-
-
# Reconstruct the URL
-
normalized = f"{scheme}://{netloc}{path}"
-
if query:
-
normalized += f"?{query}"
-
if fragment:
-
normalized += f"#{fragment}"
-
-
return normalized
-
-
def get_domain(url):
-
"""Extract domain from a URL"""
-
parsed = urlparse(url)
-
domain = parsed.netloc.lower()
-
# Remove 'www.' prefix if present
-
if domain.startswith('www.'):
-
domain = domain[4:]
-
return domain
-
-
def analyze_feed():
-
# Parse the aggregated feed
-
print(f"Parsing eeg.xml...", file=sys.stderr)
-
feed_data = feedparser.parse("eeg.xml")
-
-
# Add debug info about the feed
-
print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
-
print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)
-
-
if not feed_data or not hasattr(feed_data, 'entries'):
-
print("Error: Could not parse feed or no entries found", file=sys.stderr)
-
return
-
-
print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)
-
-
all_entries = []
-
entry_urls = {} # Maps normalized URLs to entry data
-
-
# First pass: collect all entries and their URLs
-
for entry in feed_data.entries:
-
# Get link
-
link = entry.get('link', '')
-
if not link:
-
continue
-
-
# Normalize the entry URL to help with matching
-
normalized_link = normalize_url(link)
-
-
# Get the domain of the entry
-
entry_domain = get_domain(link)
-
-
# Get feed title (stored as category in the aggregated feed)
-
feed_title = "Unknown"
-
if hasattr(entry, 'tags') and entry.tags:
-
feed_title = entry.tags[0].term
-
-
# Get description/content
-
if hasattr(entry, 'content') and entry.content:
-
content = entry.content[0].value
-
else:
-
content = entry.get('summary', '')
-
-
# Extract all links from content, using the entry link as base URL for resolving relative URLs
-
content_links = extract_links_from_html(content, base_url=link)
-
-
# Get unique ID
-
entry_id = entry.get('id', link)
-
-
entry_data = {
-
'title': entry.get('title', 'No title'),
-
'link': link,
-
'normalized_link': normalized_link,
-
'domain': entry_domain,
-
'feed_title': feed_title,
-
'id': entry_id,
-
'content_links': content_links,
-
'references': [], # Will be filled in the second pass
-
'referenced_by': [], # Will be filled in the second pass
-
'external_links': [] # Links to content outside the feed
-
}
-
-
all_entries.append(entry_data)
-
entry_urls[normalized_link] = entry_data
-
-
print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)
-
-
# Second pass: analyze links between entries
-
for entry in all_entries:
-
# Keep track of references to avoid duplicates
-
reference_ids = set()
-
normalized_content_links = [normalize_url(link) for link in entry['content_links']]
-
-
for i, normalized_link in enumerate(normalized_content_links):
-
original_link = entry['content_links'][i] if i < len(entry['content_links']) else normalized_link
-
-
# Check if this is a link to another entry in the feed
-
if normalized_link in entry_urls and normalized_link != entry['normalized_link']:
-
referenced_entry = entry_urls[normalized_link]
-
-
# Avoid duplicate references
-
if referenced_entry['id'] in reference_ids:
-
continue
-
-
reference_ids.add(referenced_entry['id'])
-
-
# Add to the references of the current entry
-
entry['references'].append({
-
'id': referenced_entry['id'],
-
'link': referenced_entry['link'],
-
'title': referenced_entry['title'],
-
'feed_title': referenced_entry['feed_title'],
-
'in_feed': True # Mark as a reference to a post in the feed
-
})
-
-
# Add to the referenced_by of the referenced entry
-
# Check if this entry is already in referenced_by
-
already_referenced = any(ref['id'] == entry['id'] for ref in referenced_entry['referenced_by'])
-
if not already_referenced:
-
referenced_entry['referenced_by'].append({
-
'id': entry['id'],
-
'link': entry['link'],
-
'title': entry['title'],
-
'feed_title': entry['feed_title'],
-
'in_feed': True # Mark as a reference from a post in the feed
-
})
-
elif normalized_link != entry['normalized_link']:
-
# This is a link to something outside the feed
-
# Check if it's from the same domain as the entry
-
link_domain = get_domain(original_link)
-
-
# Only include external links from different domains
-
if link_domain != entry['domain']:
-
# Track as an external link if not already in the list
-
if not any(ext_link['url'] == original_link for ext_link in entry['external_links']):
-
external_link = {
-
'url': original_link,
-
'normalized_url': normalized_link,
-
'in_feed': False # Mark as external to the feed
-
}
-
entry['external_links'].append(external_link)
-
-
# Create the thread data structure
-
thread_data = {}
-
for entry in all_entries:
-
thread_data[entry['id']] = {
-
'id': entry['id'],
-
'title': entry['title'],
-
'link': entry['link'],
-
'feed_title': entry['feed_title'],
-
'references': entry['references'],
-
'referenced_by': entry['referenced_by'],
-
'external_links': entry['external_links']
-
}
-
-
# Write the thread data to a JSON file
-
with open('threads.json', 'w') as f:
-
json.dump(thread_data, f, indent=2)
-
-
print(f"Thread data successfully written to threads.json", file=sys.stderr)
-
-
# Generate some statistics
-
entries_with_references = sum(1 for entry in all_entries if entry['references'])
-
entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
-
entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
-
total_internal_references = sum(len(entry['references']) for entry in all_entries)
-
total_external_links = sum(len(entry['external_links']) for entry in all_entries)
-
-
print(f"\nThread Analysis:", file=sys.stderr)
-
print(f"Total entries: {len(all_entries)}", file=sys.stderr)
-
print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
-
print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
-
print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
-
print(f"Total internal references: {total_internal_references}", file=sys.stderr)
-
print(f"Total external links: {total_external_links}", file=sys.stderr)
-
-
if __name__ == "__main__":
-
analyze_feed()
···
+93 -2
threads.json
···
}
]
},
"https://mort.io/blog/coping-and-capping/": {
"id": "https://mort.io/blog/coping-and-capping/",
"title": "Coping and Capping",
···
"link": "https://patrick.sirref.org/weekly-2025-03-31/",
"feed_title": "Weeklies",
"references": [],
-
"referenced_by": [],
"external_links": [
{
"url": "https://github.com/quantifyearth/shark",
···
"in_feed": true
}
],
-
"referenced_by": [],
"external_links": [
{
"url": "https://web.archive.org/",
···
}
]
},
+
"https://www.jonmsterling.com/2025-W15/": {
+
"id": "https://www.jonmsterling.com/2025-W15/",
+
"title": "Weeknotes 2025-W15",
+
"link": "https://www.jonmsterling.com/2025-W15/",
+
"feed_title": "Jon Sterling \u203a Weeknotes",
+
"references": [
+
{
+
"id": "https://www.forester-notes.org/JVIT/",
+
"link": "https://www.forester-notes.org/JVIT/",
+
"title": "Towards Forester 5.0 II: a design for canonical URLs",
+
"feed_title": "Forester Blog",
+
"in_feed": true
+
},
+
{
+
"id": "https://patrick.sirref.org/weekly-2025-03-31/",
+
"link": "https://patrick.sirref.org/weekly-2025-03-31/",
+
"title": "Shelter, Hazel and More!",
+
"feed_title": "Weeklies",
+
"in_feed": true
+
}
+
],
+
"referenced_by": [],
+
"external_links": [
+
{
+
"url": "https://www.forester-notes.org/jms-011P/",
+
"normalized_url": "https://forester-notes.org/jms-011P",
+
"in_feed": false
+
},
+
{
+
"url": "https://git.sr.ht/~jonsterling/forester-base-theme/commit/a251f9cf19b0ff42f4553d315df5181b985c79cb",
+
"normalized_url": "https://git.sr.ht/~jonsterling/forester-base-theme/commit/a251f9cf19b0ff42f4553d315df5181b985c79cb",
+
"in_feed": false
+
},
+
{
+
"url": "https://topiary.tweag.io/",
+
"normalized_url": "https://topiary.tweag.io",
+
"in_feed": false
+
},
+
{
+
"url": "https://github.com/RedPRL/cooltt",
+
"normalized_url": "https://github.com/RedPRL/cooltt",
+
"in_feed": false
+
},
+
{
+
"url": "https://github.com/RedPRL/redtt",
+
"normalized_url": "https://github.com/RedPRL/redtt",
+
"in_feed": false
+
},
+
{
+
"url": "https://github.com/RedPRL/sml-redprl",
+
"normalized_url": "https://github.com/RedPRL/sml-redprl",
+
"in_feed": false
+
},
+
{
+
"url": "https://lawrencecpaulson.github.io/tag/locales",
+
"normalized_url": "https://lawrencecpaulson.github.io/tag/locales",
+
"in_feed": false
+
},
+
{
+
"url": "https://www21.in.tum.de/~ballarin/publications/jar2019.pdf",
+
"normalized_url": "https://www21.in.tum.de/~ballarin/publications/jar2019.pdf",
+
"in_feed": false
+
},
+
{
+
"url": "https://github.com/agda/agda/issues/5837",
+
"normalized_url": "https://github.com/agda/agda/issues/5837",
+
"in_feed": false
+
},
+
{
+
"url": "https://www.abebooks.co.uk/9789812701428/Domain-theoretic-Foundations-Functional-Programming-Streicher-9812701427/plp",
+
"normalized_url": "https://abebooks.co.uk/9789812701428/Domain-theoretic-Foundations-Functional-Programming-Streicher-9812701427/plp",
+
"in_feed": false
+
}
+
]
+
},
"https://mort.io/blog/coping-and-capping/": {
"id": "https://mort.io/blog/coping-and-capping/",
"title": "Coping and Capping",
···
"link": "https://patrick.sirref.org/weekly-2025-03-31/",
"feed_title": "Weeklies",
"references": [],
+
"referenced_by": [
+
{
+
"id": "https://www.jonmsterling.com/2025-W15/",
+
"link": "https://www.jonmsterling.com/2025-W15/",
+
"title": "Weeknotes 2025-W15",
+
"feed_title": "Jon Sterling \u203a Weeknotes",
+
"in_feed": true
+
}
+
],
"external_links": [
{
"url": "https://github.com/quantifyearth/shark",
···
"in_feed": true
}
],
+
"referenced_by": [
+
{
+
"id": "https://www.jonmsterling.com/2025-W15/",
+
"link": "https://www.jonmsterling.com/2025-W15/",
+
"title": "Weeknotes 2025-W15",
+
"feed_title": "Jon Sterling \u203a Weeknotes",
+
"in_feed": true
+
}
+
],
"external_links": [
{
"url": "https://web.archive.org/",