#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "bs4",
# ]
# ///
"""
odoc2json.py - Convert odoc JSON output to structured JSON records
This script parses the JSON output files from odoc-driver (an OCaml documentation
generator) and converts them into structured JSON records that include package name,
version, and each function signature with associated documentation.
The output is intended for further processing, analysis, and search over OCaml type
signatures, especially for loading into columnar formats like Parquet.
"""
import os
import json
import re
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
import argparse
from pathlib import Path
def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Derive the package name and version from an odoc output directory path.

    The odoc-driver layout places output under ".../package_name/version/",
    so the last two path components are taken as (name, version).

    Args:
        path: Path to the odoc output directory.

    Returns:
        Tuple of (package_name, package_version); "unknown" fills in any
        component that cannot be determined.
    """
    # Resolve to an absolute, normalized path before splitting into parts.
    components = Path(path).resolve().parts
    if len(components) >= 2:
        # name/version are the last two components of the directory path
        return components[-2], components[-1]
    if len(components) == 1:
        # A single component can only be the package name
        return components[0], "unknown"
    return "unknown", "unknown"
def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries, one per odoc spec block, with keys:
        'kind' (always present), and optionally 'name', 'signature',
        and 'documentation' (all whitespace-normalized strings).
    """
    soup = BeautifulSoup(content, 'html.parser')
    result = []
    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        item = {}
        # Get the spec element (contains the signature)
        spec_elem = spec.find(class_="spec")
        if not spec_elem:
            continue
        # Determine the kind of element from the spec's CSS classes
        kind = None
        for cls in spec_elem.get('class', []):
            if cls in ['type', 'value', 'module', 'class', 'exception', 'constructor']:
                kind = cls
                break
        if not kind:
            continue
        item['kind'] = kind
        # Extract the signature first to use for name extraction if needed
        code_elem = spec_elem.find('code')
        signature = code_elem.get_text() if code_elem else ""
        # Extract the name: first try the anchor ID, which odoc emits as
        # e.g. "val-map" or "Module.type-t"
        name = None
        anchor = spec_elem.find('a', class_="anchor")
        if anchor and anchor.get('id'):
            item_id = anchor.get('id')
            # Keep only the last dotted component, then strip the kind prefix
            name = item_id.split('.')[-1] if '.' in item_id else item_id
            name = re.sub(r'^(type|val|module|class|exception)-', '', name)
        # For values (functions), extract the name from the signature as a
        # fallback when the anchor doesn't contain the function name
        if kind == 'value' and not name and signature:
            # Look for "val name :" pattern in the signature
            val_match = re.search(r'val\s+(\w+)\s*:', signature)
            if val_match:
                name = val_match.group(1)
        if name:
            item['name'] = name
        # Add the processed signature with all whitespace collapsed
        if signature:
            item['signature'] = re.sub(r'\s+', ' ', signature).strip()
        # Extract documentation
        doc_elem = spec.find(class_="spec-doc")
        if doc_elem:
            html_content = str(doc_elem)
            # Convert <br> tags to spaces so explicit line breaks survive as
            # word separators after tag stripping.
            # BUGFIX: in the original source this comment and regex had the
            # literal "<br/>" text stripped out, leaving broken code
            # ("# First, convert / tags to spaces" and a dangling re.sub).
            html_content = re.sub(r'<br\s*/?>', ' ', html_content)
            # Re-parse the modified HTML and take the plain text
            soup_doc = BeautifulSoup(html_content, 'html.parser')
            doc = soup_doc.get_text()
            # Replace all newlines and multiple spaces with a single space
            item['documentation'] = re.sub(r'\s+', ' ', doc).strip()
        # Add the item to our results
        result.append(item)
    return result
def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file.
        package_name: Name of the package.
        package_version: Version of the package.

    Returns:
        List of item dictionaries, each annotated with package/module info;
        empty list on JSON decode errors or when the file has no 'content'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}")
            return []
    if 'content' not in data:
        return []

    # Build the dotted module path from the module breadcrumbs; fall back to
    # the containing directory name when no module breadcrumbs are present.
    crumbs = [c.get('name') for c in data.get('breadcrumbs', []) if c.get('kind') == 'module']
    module_name = ".".join(crumbs) if crumbs else os.path.basename(os.path.dirname(file_path))

    items = parse_html_content(data['content'])

    # Annotate every extracted item with package and module provenance.
    # - module_name: just the module hierarchy (e.g. "Math.Operations")
    # - full_path: complete path including the item name (e.g. "Math.Operations.add")
    for entry in items:
        entry['package_name'] = package_name
        entry['package_version'] = package_version
        entry['module_name'] = module_name
        entry['full_path'] = f"{module_name}.{entry['name']}" if 'name' in entry else module_name
    return items
def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Process all odoc JSON files ("*.html.json") in a directory recursively.

    Args:
        directory: Path to the directory containing odoc JSON files.

    Returns:
        List of all extracted items from all files, in os.walk order.
    """
    # Package identity is derived once from the root directory path
    # and shared by every file beneath it.
    package_name, package_version = extract_package_info(directory)

    collected: List[Dict[str, Any]] = []
    for root, _dirs, filenames in os.walk(directory):
        for fname in filenames:
            if not fname.endswith('.html.json'):
                continue
            full_path = os.path.join(root, fname)
            collected.extend(process_json_file(full_path, package_name, package_version))
    return collected
def main():
    """CLI entry point: parse arguments, process the input directory, and
    write the extracted items to the output JSON file."""
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = parser.parse_args()

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # indent=None produces the compact single-line form, so one dump call
    # covers both the --pretty and default modes.
    indent = 2 if args.pretty else None
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=indent, ensure_ascii=False)
    print(f"Processed {len(items)} items and saved to {args.output_file}")


if __name__ == "__main__":
    main()