Atom feed for our EEG site
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "feedparser",
#     "beautifulsoup4",
#     "urllib3",
# ]
# ///
# Do not delete the block above, as it's needed for `uv run`.

import json
import sys
from urllib.parse import urlparse, urljoin

import feedparser
from bs4 import BeautifulSoup

def extract_links_from_html(html_content, base_url=None):
    """Extract and normalize links from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()

        # Skip empty links, anchors, javascript, and mailto
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            continue

        # Convert relative URLs to absolute if we have a base URL
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)

        links.append(href)

    return links

def normalize_url(url):
    """Normalize a URL so that equivalent links compare equal."""
    if not url:
        return ""

    # Handle common URL shorteners or redirects (not implemented)

    # Parse the URL
    parsed = urlparse(url)

    # Ensure the scheme is consistent
    scheme = parsed.scheme.lower() or 'http'

    # Normalize netloc (lowercase, strip any 'www.' prefix)
    netloc = parsed.netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[4:]

    # Remove trailing slashes and index.html/index.php/index.htm
    path = parsed.path.rstrip('/')
    for index_file in ['/index.html', '/index.php', '/index.htm']:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    # Drop query parameters that don't affect content
    # (tracking params such as utm_*, ref, source)
    query_parts = []
    if parsed.query:
        for param in parsed.query.split('&'):
            if '=' in param:
                key, value = param.split('=', 1)
                if not key.startswith(('utm_', 'ref', 'source')):
                    query_parts.append(f"{key}={value}")

    query = '&'.join(query_parts)

    # Drop hash fragments entirely
    fragment = ''

    # Special case for common blogging platforms:
    # Medium, WordPress, Ghost, etc. may have specific URL patterns

    # Reconstruct the URL
    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized += f"?{query}"
    if fragment:
        normalized += f"#{fragment}"

    return normalized

def get_domain(url):
    """Extract the domain from a URL."""
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Remove 'www.' prefix if present
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain

def analyze_feed():
    # Parse the aggregated feed
    print("Parsing eeg.xml...", file=sys.stderr)
    feed_data = feedparser.parse("eeg.xml")

    # Add debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)

    if not feed_data or not hasattr(feed_data, 'entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        return

    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries = []
    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: collect all entries and their URLs
    for entry in feed_data.entries:
        # Get link
        link = entry.get('link', '')
        if not link:
            continue

        # Normalize the entry URL to help with matching
        normalized_link = normalize_url(link)

        # Get the domain of the entry
        entry_domain = get_domain(link)

        # Get feed title (stored as category in the aggregated feed)
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Get description/content
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        # Extract all links from the content, using the entry link as the
        # base URL for resolving relative URLs
        content_links = extract_links_from_html(content, base_url=link)

        # Get unique ID
        entry_id = entry.get('id', link)

        entry_data = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'domain': entry_domain,
            'feed_title': feed_title,
            'id': entry_id,
            'content_links': content_links,
            'references': [],       # Will be filled in the second pass
            'referenced_by': [],    # Will be filled in the second pass
            'external_links': []    # Links to content outside the feed
        }

        all_entries.append(entry_data)
        entry_urls[normalized_link] = entry_data

    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

    # Second pass: analyze links between entries
    for entry in all_entries:
        # Keep track of references to avoid duplicates
        reference_ids = set()
        normalized_content_links = [normalize_url(link) for link in entry['content_links']]

        for i, normalized_link in enumerate(normalized_content_links):
            original_link = entry['content_links'][i] if i < len(entry['content_links']) else normalized_link

            # Check if this is a link to another entry in the feed
            if normalized_link in entry_urls and normalized_link != entry['normalized_link']:
                referenced_entry = entry_urls[normalized_link]

                # Avoid duplicate references
                if referenced_entry['id'] in reference_ids:
                    continue

                reference_ids.add(referenced_entry['id'])

                # Add to the references of the current entry
                entry['references'].append({
                    'id': referenced_entry['id'],
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True  # Mark as a reference to a post in the feed
                })

                # Add to the referenced_by of the referenced entry,
                # unless this entry is already in referenced_by
                already_referenced = any(ref['id'] == entry['id'] for ref in referenced_entry['referenced_by'])
                if not already_referenced:
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True  # Mark as a reference from a post in the feed
                    })
            elif normalized_link != entry['normalized_link']:
                # This is a link to something outside the feed.
                # Check if it's from the same domain as the entry.
                link_domain = get_domain(original_link)

                # Only include external links from different domains
                if link_domain != entry['domain']:
                    # Track as an external link if not already in the list
                    if not any(ext_link['url'] == original_link for ext_link in entry['external_links']):
                        external_link = {
                            'url': original_link,
                            'normalized_url': normalized_link,
                            'in_feed': False  # Mark as external to the feed
                        }
                        entry['external_links'].append(external_link)

    # Create the thread data structure
    thread_data = {}
    for entry in all_entries:
        thread_data[entry['id']] = {
            'id': entry['id'],
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links']
        }

    # Write the thread data to a JSON file
    with open('threads.json', 'w') as f:
        json.dump(thread_data, f, indent=2)

    print("Thread data successfully written to threads.json", file=sys.stderr)

    # Generate some statistics
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)

if __name__ == "__main__":
    analyze_feed()
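
Once the script has been run (for example with `uv run`, against an eeg.xml saved in the working directory), a quick way to sanity-check the result is to load the generated threads.json and list the entries that other feed entries link to most often. A minimal sketch, assuming the output file exists; the key names (`referenced_by`, `title`, `link`) match the thread_data structure written above, and the snippet is illustrative rather than part of the script:

import json

with open("threads.json") as f:
    threads = json.load(f)

# Sort thread entries by how many other entries in the feed reference them
most_referenced = sorted(threads.values(),
                         key=lambda t: len(t["referenced_by"]),
                         reverse=True)

for t in most_referenced[:5]:
    print(len(t["referenced_by"]), t["title"], t["link"])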