commit daa0a92072b45205a837e6440261bb215f9f1ed1 · anil.recoil.org/atomic-eeg

+223 -3

aggregate_feeds.py

···

       5
        
       #   "feedgenerator",

     

       6
        
       #   "requests",

     

       7
        
       #   "beautifulsoup4",

     

       0
        
       
     

       8
        
       # ]

     

       9
        
       # ///

     

       10
        
       # Do not delete the above as it's needed for `uv run`

     
···

       21
        
       import re

     

       22
        
       from html import unescape

     

       23
        
       from bs4 import BeautifulSoup

     

       0
        
       
     

       24
        
       

     

       25
        
       def load_feed_urls(file_path):

     

       26
        
           with open(file_path, 'r') as f:

     
···

       147
        
                   # Get link

     

       148
        
                   link = entry.get('link', '')

     

       149
        
                   

     

       150
       -
                   # Get description/content

     

       151
        
                   if hasattr(entry, 'content') and entry.content:

     

       152
        
                       content = entry.content[0].value

     

       153
        
                   else:

     

       154
        
                       content = entry.get('summary', '')

     

       155
       -
                   

     

       156
        
                   # Create HTML preview that will be used as the content

     

       157
        
                   preview = create_html_preview(content)

     

       158
        
                   

     
···

       162
        
                   all_entries.append({

     

       163
        
                       'title': title,

     

       164
        
                       'link': link,

     

       165
       -
                       'content': content,

     

       166
        
                       'preview': preview,

     

       167
        
                       'author': author_name,

     

       168
        
                       'pub_date': pub_date,

     
···

       207
        
           

     

       208
        
           return feed

     

       209
        
       

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       210
        
       def main():

     

       211
        
           # Load feed URLs

     

       212
        
           feed_urls = load_feed_urls('feed.json')

     
···

       235
        
               feed.write(f, 'utf-8')

     

       236
        
           

     

       237
        
           print(f"Feed successfully written to eeg.xml", file=sys.stderr)

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       238
        
       

     

       239
        
       if __name__ == "__main__":

     

       240
        
           main()

···

       5
        
       #   "feedgenerator",

     

       6
        
       #   "requests",

     

       7
        
       #   "beautifulsoup4",

     

       8
       +
       #   "urllib3",

     

       9
        
       # ]

     

       10
        
       # ///

     

       11
        
       # Do not delete the above as it's needed for `uv run`

     
···

       22
        
       import re

     

       23
        
       from html import unescape

     

       24
        
       from bs4 import BeautifulSoup

     

       25
       +
       from urllib.parse import urlparse, urljoin

     

       26
        
       

     

       27
        
       def load_feed_urls(file_path):

     

       28
        
           with open(file_path, 'r') as f:

     
···

       149
        
                   # Get link

     

       150
        
                   link = entry.get('link', '')

     

       151
        
                   

     

       152
       +
                   # Get full content from the feed entry

     

       153
        
                   if hasattr(entry, 'content') and entry.content:

     

       154
        
                       content = entry.content[0].value

     

       155
        
                   else:

     

       156
        
                       content = entry.get('summary', '')

     

       157
       +
                       

     

       158
        
                   # Create HTML preview that will be used as the content

     

       159
        
                   preview = create_html_preview(content)

     

       160
        
                   

     
···

       164
        
                   all_entries.append({

     

       165
        
                       'title': title,

     

       166
        
                       'link': link,

     

       167
       +
                       'content': content,  # Use the feed content directly

     

       168
        
                       'preview': preview,

     

       169
        
                       'author': author_name,

     

       170
        
                       'pub_date': pub_date,

     
···

       209
        
           

     

       210
        
           return feed

     

       211
        
       

     

       212
       +
       # Functions from make_threads.py

     

       213
       +
       

     

       214
       +
       def extract_links_from_html(html_content, base_url=None):
           """Collect the targets of all ``<a href=...>`` tags in an HTML fragment.

           Args:
               html_content: HTML markup to scan. Empty or ``None`` content
                   yields an empty list without invoking the parser.
               base_url: Optional URL used to resolve hrefs that do not start
                   with ``http://`` or ``https://``. When omitted, such hrefs
                   are returned as written.

           Returns:
               A list of href strings in document order. Duplicates are kept.
               Empty hrefs, in-page anchors (``#...``), and ``javascript:`` /
               ``mailto:`` links (matched case-insensitively) are omitted, as
               are hrefs that cannot be resolved against ``base_url``.
           """
           if not html_content:
               return []

           # URL schemes are case-insensitive, so compare against a lowercased href.
           skipped_prefixes = ('#', 'javascript:', 'mailto:')

           soup = BeautifulSoup(html_content, 'html.parser')
           links = []

           for a_tag in soup.find_all('a', href=True):
               href = a_tag['href'].strip()

               if not href or href.lower().startswith(skipped_prefixes):
                   continue

               # Convert relative URLs to absolute if we have a base URL
               if base_url and not href.startswith(('http://', 'https://')):
                   try:
                       href = urljoin(base_url, href)
                   except ValueError:
                       # urljoin parses the href and rejects malformed ones
                       # (e.g. unbalanced IPv6 brackets); one bad link in a
                       # feed should not abort the whole extraction.
                       continue

               links.append(href)

           return links

     

       233
       +
       

     

       234
       +
       def normalize_url(url):
           """Reduce a URL to a canonical form so equivalent links compare equal.

           The normalization:
             * lowercases the scheme (defaulting to ``http`` when absent) and
               the network location, and drops a leading ``www.``;
             * strips trailing slashes and a trailing ``index.html`` /
               ``index.php`` / ``index.htm`` from the path;
             * drops tracking query parameters (``utm_*``, ``ref``, ``ref_src``,
               ``ref_url``, ``source``) and any parameter without ``=``;
             * always drops the fragment.

           Args:
               url: URL string to normalize.

           Returns:
               The normalized URL, ``""`` when *url* is empty or ``None``, or
               *url* unchanged when it cannot be parsed.
           """
           if not url:
               return ""

           try:
               parsed = urlparse(url)
           except ValueError:
               # urlparse rejects malformed network locations (e.g. unbalanced
               # IPv6 brackets). Links come from feed content, so return the
               # input as-is instead of aborting the caller.
               return url

           scheme = parsed.scheme.lower() or 'http'

           netloc = parsed.netloc.lower()
           if netloc.startswith('www.'):
               netloc = netloc[4:]

           path = parsed.path.rstrip('/')
           for index_file in ('/index.html', '/index.php', '/index.htm'):
               if path.endswith(index_file):
                   path = path[:-len(index_file)]

           # Match tracking keys exactly (apart from the utm_ family): a bare
           # prefix test on 'ref'/'source' would also discard content-bearing
           # parameters such as 'refresh' or 'sourceid' and merge distinct URLs.
           tracking_keys = ('ref', 'ref_src', 'ref_url', 'source')
           query_parts = []
           for param in parsed.query.split('&'):
               key, sep, value = param.partition('=')
               if not sep:
                   # Parameters without '=' (including the empty query) are dropped.
                   continue
               if key.startswith('utm_') or key in tracking_keys:
                   continue
               query_parts.append(f"{key}={value}")

           normalized = f"{scheme}://{netloc}{path}"
           if query_parts:
               normalized += "?" + '&'.join(query_parts)

           return normalized

     

       284
       +
       

     

       285
       +
       def get_domain(url):
           """Return the lowercased host name of *url* without a leading ``www.``.

           Only the host is returned: credentials (``user:pass@``) and a port
           (``:8080``) are not part of the domain and are dropped, so the same
           site compares equal however it was addressed.

           Args:
               url: URL string to inspect.

           Returns:
               The host name, or ``""`` when *url* is empty, has no network
               location (e.g. a relative path), or cannot be parsed.
           """
           if not url:
               return ""

           try:
               host = urlparse(url).hostname
           except ValueError:
               # Malformed network location (e.g. unbalanced IPv6 brackets).
               return ""

           # hostname is None when the URL has no network location.
           domain = (host or "").lower()
           if domain.startswith('www.'):
               domain = domain[4:]
           return domain

     

       293
       +
       

     

       294
       +
       def generate_threads(entries):
           """Build cross-reference ("thread") data for a list of feed entries.

           Works in two passes. The first indexes every entry that has a link by
           its normalized URL and extracts the links found in its content. The
           second classifies each extracted link as a reference to another
           indexed entry or as an external link on a different domain. Links
           back to the entry itself, and links to non-indexed pages on the
           entry's own domain, are ignored. Summary statistics are printed to
           stderr.

           Args:
               entries: Iterable of dicts providing the keys ``'link'``,
                   ``'title'``, ``'content'``, ``'feed_title'`` and ``'id'``.
                   Entries with a falsy ``'link'`` are skipped. When several
                   entries normalize to the same URL, the last one wins.

           Returns:
               A dict keyed by entry id. Each value holds ``'id'``, ``'title'``,
               ``'link'``, ``'feed_title'``, ``'references'``,
               ``'referenced_by'`` and ``'external_links'``.

           Raises:
               KeyError: If an entry lacks one of the required keys.
           """
           print(f"Generating thread data from {len(entries)} entries...", file=sys.stderr)

           entry_urls = {}  # Maps normalized URLs to entry data

           # First pass: index entries by normalized URL and collect their links.
           for entry in entries:
               link = entry['link']
               if not link:
                   continue

               normalized_link = normalize_url(link)
               entry_domain = get_domain(link)

               # The entry link doubles as the base URL for resolving relative hrefs.
               content_links = extract_links_from_html(entry['content'], base_url=link)

               entry_urls[normalized_link] = {
                   'title': entry['title'],
                   'link': link,
                   'normalized_link': normalized_link,
                   'domain': entry_domain,
                   'feed_title': entry['feed_title'],
                   'id': entry['id'],
                   'content_links': content_links,
                   'references': [],  # Filled in the second pass
                   'referenced_by': [],  # Filled in the second pass
                   'external_links': [],  # Links to content outside the feed
               }

           print("Extracted links from all entries", file=sys.stderr)

           # Second pass: classify every extracted link.
           for entry_data in entry_urls.values():
               reference_ids = set()  # ids already recorded in 'references'
               external_seen = set()  # original URLs already in 'external_links'

               for original_link in entry_data['content_links']:
                   normalized_link = normalize_url(original_link)

                   # A link back to the entry itself is neither a reference nor external.
                   if normalized_link == entry_data['normalized_link']:
                       continue

                   referenced_entry = entry_urls.get(normalized_link)
                   if referenced_entry is not None:
                       # Link to another entry in the feed.
                       if referenced_entry['id'] in reference_ids:
                           continue
                       reference_ids.add(referenced_entry['id'])

                       entry_data['references'].append({
                           'id': referenced_entry['id'],
                           'link': referenced_entry['link'],
                           'title': referenced_entry['title'],
                           'feed_title': referenced_entry['feed_title'],
                           'in_feed': True  # Mark as a reference to a post in the feed
                       })

                       # Record the back-reference once per referencing entry id.
                       already_referenced = any(
                           ref['id'] == entry_data['id']
                           for ref in referenced_entry['referenced_by']
                       )
                       if not already_referenced:
                           referenced_entry['referenced_by'].append({
                               'id': entry_data['id'],
                               'link': entry_data['link'],
                               'title': entry_data['title'],
                               'feed_title': entry_data['feed_title'],
                               'in_feed': True  # Mark as a reference from a post in the feed
                           })
                       continue

                   # Link to something outside the feed: keep it only when it
                   # points at a different domain than the entry itself.
                   if get_domain(original_link) == entry_data['domain']:
                       continue
                   if original_link in external_seen:
                       continue
                   external_seen.add(original_link)
                   entry_data['external_links'].append({
                       'url': original_link,
                       'normalized_url': normalized_link,
                       'in_feed': False  # Mark as external to the feed
                   })

           # Create the thread data structure, keyed by entry id.
           thread_data = {}
           for entry_data in entry_urls.values():
               thread_data[entry_data['id']] = {
                   'id': entry_data['id'],
                   'title': entry_data['title'],
                   'link': entry_data['link'],
                   'feed_title': entry_data['feed_title'],
                   'references': entry_data['references'],
                   'referenced_by': entry_data['referenced_by'],
                   'external_links': entry_data['external_links']
               }

           # Generate some statistics
           records = list(entry_urls.values())
           entries_with_references = sum(1 for rec in records if rec['references'])
           entries_with_referenced_by = sum(1 for rec in records if rec['referenced_by'])
           entries_with_external_links = sum(1 for rec in records if rec['external_links'])
           total_internal_references = sum(len(rec['references']) for rec in records)
           total_external_links = sum(len(rec['external_links']) for rec in records)

           print("\nThread Analysis:", file=sys.stderr)
           print(f"Total entries: {len(entry_urls)}", file=sys.stderr)
           print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
           print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
           print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
           print(f"Total internal references: {total_internal_references}", file=sys.stderr)
           print(f"Total external links: {total_external_links}", file=sys.stderr)

           return thread_data

     

       420
       +
       

     

       421
        
       def main():

     

       422
        
           # Load feed URLs

     

       423
        
           feed_urls = load_feed_urls('feed.json')

     
···

       446
        
               feed.write(f, 'utf-8')

     

       447
        
           

     

       448
        
           print(f"Feed successfully written to eeg.xml", file=sys.stderr)

     

       449
       +
           

     

       450
       +
           # Generate thread data

     

       451
       +
           thread_data = generate_threads(entries)

     

       452
       +
           

     

       453
       +
           # Write the thread data to a JSON file

     

       454
       +
           with open('threads.json', 'w') as f:

     

       455
       +
               json.dump(thread_data, f, indent=2)

     

       456
       +
           

     

       457
       +
           print(f"Thread data successfully written to threads.json", file=sys.stderr)

     

       458
        
       

     

       459
        
       if __name__ == "__main__":

     

       460
        
           main()

-256

make_threads.py

···

       1
       -
       # /// script

     

       2
       -
       # requires-python = ">=3.11"

     

       3
       -
       # dependencies = [

     

       4
       -
       #   "feedparser",

     

       5
       -
       #   "beautifulsoup4",

     

       6
       -
       #   "urllib3",

     

       7
       -
       # ]

     

       8
       -
       # ///

     

       9
       -
       # Do not delete the above as it's needed for `uv run`

     

       10
       -
       #!/usr/bin/env python3

     

       11
       -
       

     

       12
       -
       import json

     

       13
       -
       import feedparser

     

       14
       -
       import sys

     

       15
       -
       import os

     

       16
       -
       from bs4 import BeautifulSoup

     

       17
       -
       import re

     

       18
       -
       from urllib.parse import urlparse, urljoin

     

       19
       -
       

     

       20
       -
       def extract_links_from_html(html_content, base_url=None):
           """Collect the targets of all ``<a href=...>`` tags in an HTML fragment.

           Args:
               html_content: HTML markup to scan. Empty or ``None`` content
                   yields an empty list without invoking the parser.
               base_url: Optional URL used to resolve hrefs that do not start
                   with ``http://`` or ``https://``. When omitted, such hrefs
                   are returned as written.

           Returns:
               A list of href strings in document order. Duplicates are kept.
               Empty hrefs, in-page anchors (``#...``), and ``javascript:`` /
               ``mailto:`` links (matched case-insensitively) are omitted, as
               are hrefs that cannot be resolved against ``base_url``.
           """
           if not html_content:
               return []

           # URL schemes are case-insensitive, so compare against a lowercased href.
           skipped_prefixes = ('#', 'javascript:', 'mailto:')

           soup = BeautifulSoup(html_content, 'html.parser')
           links = []

           for a_tag in soup.find_all('a', href=True):
               href = a_tag['href'].strip()

               if not href or href.lower().startswith(skipped_prefixes):
                   continue

               # Convert relative URLs to absolute if we have a base URL
               if base_url and not href.startswith(('http://', 'https://')):
                   try:
                       href = urljoin(base_url, href)
                   except ValueError:
                       # urljoin parses the href and rejects malformed ones
                       # (e.g. unbalanced IPv6 brackets); one bad link in a
                       # feed should not abort the whole extraction.
                       continue

               links.append(href)

           return links

     

       39
       -
       

     

       40
       -
       def normalize_url(url):
           """Normalize a URL so that equivalent links compare equal.

           The normalized form:
             * lowercases the scheme (defaulting to ``http`` when absent) and
               the network location, dropping a leading ``www.``;
             * drops trailing slashes and a trailing ``index.html`` /
               ``index.php`` / ``index.htm`` path segment;
             * drops query parameters whose key starts with ``utm_``, ``ref``
               or ``source`` and keeps every other parameter in its original
               order, including flag-style parameters that have no ``=``;
             * always drops the fragment.

           Args:
               url: The URL string to normalize. Falsy values yield ``""``.

           Returns:
               The normalized URL string, or ``""`` when *url* is empty.
           """
           if not url:
               return ""

           tracking_prefixes = ('utm_', 'ref', 'source')
           index_files = ('/index.html', '/index.php', '/index.htm')

           parsed = urlparse(url)

           # Ensure scheme is consistent
           scheme = parsed.scheme.lower() or 'http'

           # Normalize netloc (lowercase, remove 'www.' prefix)
           netloc = parsed.netloc.lower()
           if netloc.startswith('www.'):
               netloc = netloc[4:]

           # Remove trailing slashes and index files. Slashes are stripped again
           # after an index file is removed so "/blog//index.html" and "/blog/"
           # normalize to the same "/blog".
           path = parsed.path.rstrip('/')
           for index_file in index_files:
               if path.endswith(index_file):
                   path = path[:-len(index_file)].rstrip('/')

           # Keep every non-tracking parameter verbatim. Parameters without a
           # value ("?print") are kept too: dropping them would merge URLs that
           # may point at different content.
           query_parts = []
           for param in parsed.query.split('&'):
               if not param:
                   continue
               key = param.partition('=')[0]
               if not key.startswith(tracking_prefixes):
                   query_parts.append(param)

           # Reconstruct the URL; the fragment is intentionally left out.
           normalized = f"{scheme}://{netloc}{path}"
           if query_parts:
               normalized += "?" + '&'.join(query_parts)

           return normalized

     

       90
       -
       

     

       91
       -
       def get_domain(url):
           """Return the lowercased network location of *url* without a leading ``www.``."""
           host = urlparse(url).netloc.lower()
           return host[4:] if host.startswith('www.') else host

     

       99
       -
       

     

       100
       -
       def analyze_feed():
           """Build a cross-reference map of the entries in ``eeg.xml``.

           Parses ``eeg.xml`` with feedparser, records for every entry which
           other entries it links to (``references``), which entries link to it
           (``referenced_by``) and which links point outside the feed on a
           different domain (``external_links``), then writes the result to
           ``threads.json`` keyed by entry id. Progress and summary statistics
           are printed to stderr. Returns ``None``.
           """
           def log(message):
               print(message, file=sys.stderr)

           log("Parsing eeg.xml...")
           parsed_feed = feedparser.parse("eeg.xml")

           # Debug info about the feed
           log(f"Feed title: {parsed_feed.feed.get('title', 'Unknown')}")
           log(f"Feed version: {parsed_feed.get('version', 'Unknown')}")

           if not parsed_feed or not hasattr(parsed_feed, 'entries'):
               log("Error: Could not parse feed or no entries found")
               return

           log(f"Found {len(parsed_feed.entries)} entries in the aggregated feed")

           posts = []
           posts_by_url = {}  # normalized entry URL -> entry record

           # First pass: build one record per entry that has a link
           for item in parsed_feed.entries:
               url = item.get('link', '')
               if not url:
                   continue

               # The aggregated feed stores the source feed title as the first tag
               has_tags = hasattr(item, 'tags') and item.tags
               source_title = item.tags[0].term if has_tags else "Unknown"

               # Prefer full content, fall back to the summary
               has_content = hasattr(item, 'content') and item.content
               body = item.content[0].value if has_content else item.get('summary', '')

               url_key = normalize_url(url)
               record = {
                   'title': item.get('title', 'No title'),
                   'link': url,
                   'normalized_link': url_key,
                   'domain': get_domain(url),
                   'feed_title': source_title,
                   'id': item.get('id', url),
                   # Relative hrefs are resolved against the entry's own link
                   'content_links': extract_links_from_html(body, base_url=url),
                   'references': [],  # filled in the second pass
                   'referenced_by': [],  # filled in the second pass
                   'external_links': [],  # links to content outside the feed
               }

               posts.append(record)
               posts_by_url[url_key] = record

           log(f"Total entries processed: {len(posts)}")

           # Second pass: classify every link found in each entry's content
           for post in posts:
               linked_ids = set()  # ids already recorded as references of this post
               own_key = post['normalized_link']

               for raw_link in post['content_links']:
                   link_key = normalize_url(raw_link)

                   # Links back to the entry itself are ignored
                   if link_key == own_key:
                       continue

                   target = posts_by_url.get(link_key)
                   if target is not None:
                       # Link to another entry in the feed; record it once
                       if target['id'] in linked_ids:
                           continue
                       linked_ids.add(target['id'])

                       post['references'].append({
                           'id': target['id'],
                           'link': target['link'],
                           'title': target['title'],
                           'feed_title': target['feed_title'],
                           'in_feed': True,
                       })

                       # Mirror the relationship on the referenced entry
                       if all(ref['id'] != post['id'] for ref in target['referenced_by']):
                           target['referenced_by'].append({
                               'id': post['id'],
                               'link': post['link'],
                               'title': post['title'],
                               'feed_title': post['feed_title'],
                               'in_feed': True,
                           })
                       continue

                   # Link to something outside the feed: keep it only when it
                   # leaves the entry's own domain and was not listed already
                   if get_domain(raw_link) == post['domain']:
                       continue
                   if any(ext['url'] == raw_link for ext in post['external_links']):
                       continue
                   post['external_links'].append({
                       'url': raw_link,
                       'normalized_url': link_key,
                       'in_feed': False,
                   })

           # Create the thread data structure, keyed by entry id
           exported_keys = (
               'id',
               'title',
               'link',
               'feed_title',
               'references',
               'referenced_by',
               'external_links',
           )
           thread_data = {
               post['id']: {key: post[key] for key in exported_keys}
               for post in posts
           }

           with open('threads.json', 'w') as f:
               json.dump(thread_data, f, indent=2)

           log("Thread data successfully written to threads.json")

           # Summary statistics
           referencing = sum(bool(post['references']) for post in posts)
           referenced = sum(bool(post['referenced_by']) for post in posts)
           with_external = sum(bool(post['external_links']) for post in posts)
           internal_total = sum(len(post['references']) for post in posts)
           external_total = sum(len(post['external_links']) for post in posts)

           log("\nThread Analysis:")
           log(f"Total entries: {len(posts)}")
           log(f"Entries that reference other entries in the feed: {referencing}")
           log(f"Entries referenced by other entries in the feed: {referenced}")
           log(f"Entries with external links: {with_external}")
           log(f"Total internal references: {internal_total}")
           log(f"Total external links: {external_total}")

     

       254
       -
       

     

       255
       -
       # Run the analysis only when executed as a script, not on import.
       if __name__ == '__main__':
           analyze_feed()

+93 -2

threads.json

···

       29
        
             }

     

       30
        
           ]

     

       31
        
         },

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       32
        
         "https://mort.io/blog/coping-and-capping/": {

     

       33
        
           "id": "https://mort.io/blog/coping-and-capping/",

     

       34
        
           "title": "Coping and Capping",

     
···

       908
        
           "link": "https://patrick.sirref.org/weekly-2025-03-31/",

     

       909
        
           "feed_title": "Weeklies",

     

       910
        
           "references": [],

     

       911
       -
           "referenced_by": [],

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       912
        
           "external_links": [

     

       913
        
             {

     

       914
        
               "url": "https://github.com/quantifyearth/shark",

     
···

       1325
        
               "in_feed": true

     

       1326
        
             }

     

       1327
        
           ],

     

       1328
       -
           "referenced_by": [],

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       1329
        
           "external_links": [

     

       1330
        
             {

     

       1331
        
               "url": "https://web.archive.org/",

···

       29
        
             }

     

       30
        
           ]

     

       31
        
         },

     

       32
       +
         "https://www.jonmsterling.com/2025-W15/": {

     

       33
       +
           "id": "https://www.jonmsterling.com/2025-W15/",

     

       34
       +
           "title": "Weeknotes 2025-W15",

     

       35
       +
           "link": "https://www.jonmsterling.com/2025-W15/",

     

       36
       +
           "feed_title": "Jon Sterling \u203a Weeknotes",

     

       37
       +
           "references": [

     

       38
       +
             {

     

       39
       +
               "id": "https://www.forester-notes.org/JVIT/",

     

       40
       +
               "link": "https://www.forester-notes.org/JVIT/",

     

       41
       +
               "title": "Towards Forester 5.0 II: a design for canonical URLs",

     

       42
       +
               "feed_title": "Forester Blog",

     

       43
       +
               "in_feed": true

     

       44
       +
             },

     

       45
       +
             {

     

       46
       +
               "id": "https://patrick.sirref.org/weekly-2025-03-31/",

     

       47
       +
               "link": "https://patrick.sirref.org/weekly-2025-03-31/",

     

       48
       +
               "title": "Shelter, Hazel and More!",

     

       49
       +
               "feed_title": "Weeklies",

     

       50
       +
               "in_feed": true

     

       51
       +
             }

     

       52
       +
           ],

     

       53
       +
           "referenced_by": [],

     

       54
       +
           "external_links": [

     

       55
       +
             {

     

       56
       +
               "url": "https://www.forester-notes.org/jms-011P/",

     

       57
       +
               "normalized_url": "https://forester-notes.org/jms-011P",

     

       58
       +
               "in_feed": false

     

       59
       +
             },

     

       60
       +
             {

     

       61
       +
               "url": "https://git.sr.ht/~jonsterling/forester-base-theme/commit/a251f9cf19b0ff42f4553d315df5181b985c79cb",

     

       62
       +
               "normalized_url": "https://git.sr.ht/~jonsterling/forester-base-theme/commit/a251f9cf19b0ff42f4553d315df5181b985c79cb",

     

       63
       +
               "in_feed": false

     

       64
       +
             },

     

       65
       +
             {

     

       66
       +
               "url": "https://topiary.tweag.io/",

     

       67
       +
               "normalized_url": "https://topiary.tweag.io",

     

       68
       +
               "in_feed": false

     

       69
       +
             },

     

       70
       +
             {

     

       71
       +
               "url": "https://github.com/RedPRL/cooltt",

     

       72
       +
               "normalized_url": "https://github.com/RedPRL/cooltt",

     

       73
       +
               "in_feed": false

     

       74
       +
             },

     

       75
       +
             {

     

       76
       +
               "url": "https://github.com/RedPRL/redtt",

     

       77
       +
               "normalized_url": "https://github.com/RedPRL/redtt",

     

       78
       +
               "in_feed": false

     

       79
       +
             },

     

       80
       +
             {

     

       81
       +
               "url": "https://github.com/RedPRL/sml-redprl",

     

       82
       +
               "normalized_url": "https://github.com/RedPRL/sml-redprl",

     

       83
       +
               "in_feed": false

     

       84
       +
             },

     

       85
       +
             {

     

       86
       +
               "url": "https://lawrencecpaulson.github.io/tag/locales",

     

       87
       +
               "normalized_url": "https://lawrencecpaulson.github.io/tag/locales",

     

       88
       +
               "in_feed": false

     

       89
       +
             },

     

       90
       +
             {

     

       91
       +
               "url": "https://www21.in.tum.de/~ballarin/publications/jar2019.pdf",

     

       92
       +
               "normalized_url": "https://www21.in.tum.de/~ballarin/publications/jar2019.pdf",

     

       93
       +
               "in_feed": false

     

       94
       +
             },

     

       95
       +
             {

     

       96
       +
               "url": "https://github.com/agda/agda/issues/5837",

     

       97
       +
               "normalized_url": "https://github.com/agda/agda/issues/5837",

     

       98
       +
               "in_feed": false

     

       99
       +
             },

     

       100
       +
             {

     

       101
       +
               "url": "https://www.abebooks.co.uk/9789812701428/Domain-theoretic-Foundations-Functional-Programming-Streicher-9812701427/plp",

     

       102
       +
               "normalized_url": "https://abebooks.co.uk/9789812701428/Domain-theoretic-Foundations-Functional-Programming-Streicher-9812701427/plp",

     

       103
       +
               "in_feed": false

     

       104
       +
             }

     

       105
       +
           ]

     

       106
       +
         },

     

       107
        
         "https://mort.io/blog/coping-and-capping/": {

     

       108
        
           "id": "https://mort.io/blog/coping-and-capping/",

     

       109
        
           "title": "Coping and Capping",

     
···

       983
        
           "link": "https://patrick.sirref.org/weekly-2025-03-31/",

     

       984
        
           "feed_title": "Weeklies",

     

       985
        
           "references": [],

     

       986
       +
           "referenced_by": [

     

       987
       +
             {

     

       988
       +
               "id": "https://www.jonmsterling.com/2025-W15/",

     

       989
       +
               "link": "https://www.jonmsterling.com/2025-W15/",

     

       990
       +
               "title": "Weeknotes 2025-W15",

     

       991
       +
               "feed_title": "Jon Sterling \u203a Weeknotes",

     

       992
       +
               "in_feed": true

     

       993
       +
             }

     

       994
       +
           ],

     

       995
        
           "external_links": [

     

       996
        
             {

     

       997
        
               "url": "https://github.com/quantifyearth/shark",

     
···

       1408
        
               "in_feed": true

     

       1409
        
             }

     

       1410
        
           ],

     

       1411
       +
           "referenced_by": [

     

       1412
       +
             {

     

       1413
       +
               "id": "https://www.jonmsterling.com/2025-W15/",

     

       1414
       +
               "link": "https://www.jonmsterling.com/2025-W15/",

     

       1415
       +
               "title": "Weeknotes 2025-W15",

     

       1416
       +
               "feed_title": "Jon Sterling \u203a Weeknotes",

     

       1417
       +
               "in_feed": true

     

       1418
       +
             }

     

       1419
       +
           ],

     

       1420
        
           "external_links": [

     

       1421
        
             {

     

       1422
        
               "url": "https://web.archive.org/",