#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "feedparser",
#     "beautifulsoup4",
#     "urllib3",
# ]
# ///
# Do not delete the metadata block above; it's needed for `uv run`.

import json
import feedparser
import sys
import os
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin


def extract_links_from_html(html_content, base_url=None):
    """Extract and normalize links from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        # Skip empty links, anchors, javascript, and mailto
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            continue
        # Convert relative URLs to absolute if we have a base URL
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)
        links.append(href)
    return links


def normalize_url(url):
    """Normalize URLs so equivalent links match consistently."""
    if not url:
        return ""

    # Handle common URL shorteners or redirects (not implemented)

    # Parse the URL
    parsed = urlparse(url)

    # Ensure the scheme is consistent
    scheme = parsed.scheme.lower() or 'http'

    # Normalize netloc (lowercase, strip any 'www.' prefix)
    netloc = parsed.netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[4:]

    # Remove trailing slashes and index.html/index.php/index.htm
    path = parsed.path.rstrip('/')
    for index_file in ['/index.html', '/index.php', '/index.htm']:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    # Remove query parameters that don't affect content
    # (like tracking params, utm_*, etc.)
    query_parts = []
    if parsed.query:
        for param in parsed.query.split('&'):
            if '=' in param:
                key, value = param.split('=', 1)
                if not key.startswith(('utm_', 'ref', 'source')):
                    query_parts.append(f"{key}={value}")
    query = '&'.join(query_parts)

    # Drop hash fragments entirely
    fragment = ''

    # Special case for common blogging platforms: Medium, WordPress, Ghost, etc.
    # may have specific URL patterns (not handled here).

    # Reconstruct the URL
    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized += f"?{query}"
    if fragment:
        normalized += f"#{fragment}"
    return normalized

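# Example (hypothetical URL, for illustration only):
#   normalize_url("https://www.example.com/post/index.html?utm_source=rss")
#   returns "https://example.com/post": the "www." prefix, the index.* suffix,
#   and the utm_* tracking parameter are all stripped before links are compared.
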
def analyze_feed():
    # Parse the aggregated feed
    print("Parsing eeg.xml...", file=sys.stderr)
    feed_data = feedparser.parse("eeg.xml")

    # Add debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)

    if not feed_data or not hasattr(feed_data, 'entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        return

    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries = []
    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: collect all entries and their URLs
    for entry in feed_data.entries:
        # Get link
        link = entry.get('link', '')
        if not link:
            continue

        # Normalize the entry URL to help with matching
        normalized_link = normalize_url(link)

        # Get feed title (stored as category in the aggregated feed)
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Get description/content
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        # Extract all links from content, using the entry link as base URL
        # for resolving relative URLs
        content_links = extract_links_from_html(content, base_url=link)

        # Get unique ID
        entry_id = entry.get('id', link)

        entry_data = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'feed_title': feed_title,
            'id': entry_id,
            'content_links': content_links,
            'references': [],      # Will be filled in the second pass
            'referenced_by': [],   # Will be filled in the second pass
            'external_links': []   # Links to content outside the feed
        }
        all_entries.append(entry_data)
        entry_urls[normalized_link] = entry_data

    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

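    # At this point entry_urls maps each entry's normalized URL to its entry_data
    # dict; the second pass below uses it to resolve every link found in a post
    # body to either another post in the feed (an internal reference) or an
    # external link.
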
    # Second pass: analyze links between entries
    for entry in all_entries:
        # Keep track of references to avoid duplicates
        reference_ids = set()
        normalized_content_links = [normalize_url(link) for link in entry['content_links']]

        for original_link, normalized_link in zip(entry['content_links'], normalized_content_links):
            # Ignore links back to the entry itself
            if normalized_link == entry['normalized_link']:
                continue

            if normalized_link in entry_urls:
                # This entry links to another entry in the feed
                referenced_entry = entry_urls[normalized_link]

                # Avoid duplicate references
                if referenced_entry['id'] in reference_ids:
                    continue
                reference_ids.add(referenced_entry['id'])

                # Add to the references of the current entry
                entry['references'].append({
                    'id': referenced_entry['id'],
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True  # Mark as a reference to a post in the feed
                })

                # Add to the referenced_by of the referenced entry,
                # unless this entry is already listed there
                already_referenced = any(ref['id'] == entry['id']
                                         for ref in referenced_entry['referenced_by'])
                if not already_referenced:
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True  # Mark as a reference from a post in the feed
                    })
            else:
                # This is a link to something outside the feed; track it as external
                if not any(ext_link['url'] == original_link for ext_link in entry['external_links']):
                    entry['external_links'].append({
                        'url': original_link,
                        'normalized_url': normalized_link,
                        'in_feed': False  # Mark as external to the feed
                    })

    # Create the thread data structure
    thread_data = {}
    for entry in all_entries:
        thread_data[entry['id']] = {
            'id': entry['id'],
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links']
        }

    # Write the thread data to a JSON file
    with open('threads.json', 'w') as f:
        json.dump(thread_data, f, indent=2)
    print("Thread data successfully written to threads.json", file=sys.stderr)

    # Generate some statistics
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)


if __name__ == "__main__":
    analyze_feed()
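# Usage sketch (assumes this file is saved as, e.g., analyze_threads.py in the
# same directory as the aggregated feed eeg.xml; the script filename is
# illustrative, not fixed by the code):
#   uv run analyze_threads.py
# uv reads the inline metadata block at the top to provision the dependencies;
# the thread map is written to threads.json and statistics go to stderr.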