#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (OCaml documentation generator)
and produces a single Markdown file with the essential module structure and signatures
formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
"""

import os
import sys
import json
import re
from bs4 import BeautifulSoup
from collections import defaultdict
import argparse
from pathlib import Path
import html


def extract_module_info(json_content):
    """Extract module information from odoc JSON content."""
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        # Return a minimal structure that won't cause errors downstream
        return {
            "name": "Unknown",
            "type": "Module",
            "breadcrumbs": [],
            "content": BeautifulSoup("", "html.parser"),
            "preamble": ""
        }
    
    # Extract module name and type from header
    header = data.get("header", "")
    soup = BeautifulSoup(header, "html.parser")
    header_text = soup.get_text().strip()
    
    # Determine module type and name
    module_type = "Module"
    if "Module type" in header_text:
        module_type = "Module type"
    elif "Class" in header_text:
        module_type = "Class"
    
    # Extract the actual module name
    module_name = ""
    code_tag = soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to header text with type prefix removed
        module_name = re.sub(r'^(Module|Module type|Class)\s+', '', header_text)
    
    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if name:
            soup = BeautifulSoup(name, "html.parser")
            clean_name = soup.get_text().strip()
            # Clean up the breadcrumb text
            clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
            breadcrumbs.append(clean_name)
    
    # Extract module content
    content = data.get("content", "")
    soup = BeautifulSoup(content, "html.parser")
    
    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": soup,
        "preamble": data.get("preamble", "")
    }


def clean_signature_text(text):
    """Clean up signature text for better readability."""
    # Replace special arrow characters with ->
    text = text.replace('⁠', '').replace('−', '-').replace('‑', '-').replace('→', '->')
    
    # Replace multiple spaces with a single space, except in code blocks
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)
    
    return text


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.)."""
    # For val signatures: extract function name before the first :
    match = re.match(r'val\s+([a-zA-Z0-9_\']+)\s*:', sig_content)
    if match:
        return match.group(1)
    
    # For type signatures: extract type name
    match = re.match(r'type\s+([a-zA-Z0-9_\']+)(?:\s|\[|$)', sig_content)
    if match:
        return match.group(1)
    
    # For module signatures: extract module name
    match = re.match(r'module\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)
    
    # For class signatures: extract class name
    match = re.match(r'class\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)
    
    # For exception signatures: extract exception name
    match = re.match(r'exception\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)
    
    return None


def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content."""
    signatures = []
    
    # Get all the odoc-spec divs
    spec_divs = content_soup.find_all("div", class_="odoc-spec")
    
    for spec in spec_divs:
        sig_id = None
        sig_type = None
        sig_content = None
        doc_content = None
        
        # Find the actual signature
        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # Get the ID for cross-referencing
            sig_id = sig_div.get("id", "")
            
            # Determine the type of signature (type, val, module, etc.)
            sig_type_span = sig_div.find("span", class_="keyword")
            if sig_type_span:
                sig_type = sig_type_span.get_text().strip()
            
            # Get the full code content
            code_tag = sig_div.find("code")
            if code_tag:
                # Extract the full OCaml signature text properly
                # We'll convert all spans to plain text while preserving structure
                for span in code_tag.find_all("span"):
                    span.replace_with(span.get_text())
                
                sig_content = clean_signature_text(code_tag.get_text())
        
        # Find documentation for this signature
        doc_div = spec.find("div", class_="spec-doc")
        if doc_div:
            # Process paragraphs and lists for documentation
            doc_parts = []
            
            # Process regular paragraphs
            for p in doc_div.find_all("p"):
                # Clean up code references in paragraph
                for code in p.find_all("code"):
                    # Convert links within code tags to plain text
                    for a in code.find_all("a"):
                        a.replace_with(a.get_text())
                    # Keep the code tag formatting
                    code_text = code.get_text()
                    code.string = code_text
                
                # Clean up the paragraph text
                p_text = clean_signature_text(p.get_text()).strip()
                if p_text:
                    doc_parts.append(p_text)
            
            # Process bulleted lists
            for ul in doc_div.find_all("ul"):
                for li in ul.find_all("li"):
                    # Check if it's a special tag like @raises, @returns, etc.
                    tag_span = li.find("span", class_="at-tag")
                    if tag_span:
                        tag_name = tag_span.get_text().strip()
                        # Remove the tag span from consideration
                        tag_span.extract()
                        # Get the rest of the content
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"@{tag_name} {li_text}")
                    else:
                        # Regular list item
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"- {li_text}")
            
            # Process code examples
            for pre in doc_div.find_all("pre"):
                code = pre.find("code")
                if code:
                    # Get the language class if available
                    lang = "ocaml"  # Default to OCaml
                    if "language-" in code.get("class", [""]):
                        for cls in code.get("class", []):
                            if cls.startswith("language-"):
                                lang = cls.replace("language-", "")
                    
                    # Preserve indentation and line breaks in code blocks
                    code_text = code.get_text()
                    doc_parts.append(f"```{lang}\n{code_text}\n```")
            
            if doc_parts:
                doc_content = "\n".join(doc_parts)
        
        # Only add signatures that have content
        if sig_type and sig_content:
            # Extract the name of the element (function name, type name, etc.)
            name = extract_signature_name(sig_content)
            
            # Build the full signature
            signature = {
                "id": sig_id,
                "type": sig_type,
                "name": name,
                "content": sig_content,
                "doc": doc_content
            }
            signatures.append(signature)
    
    return signatures


def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information."""
    md_lines = []
    
    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")
    
    # Add module preamble documentation if available
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")
    
    # Organize signatures by type
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)
    
    # Process types first
    if "type" in sig_by_type:
        md_lines.append("## Types")
        for sig in sig_by_type["type"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")
            
            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")
    
    # Process exceptions
    if "exception" in sig_by_type:
        md_lines.append("## Exceptions")
        for sig in sig_by_type["exception"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")
            
            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")
    
    # Process values (functions)
    if "val" in sig_by_type:
        md_lines.append("## Values")
        for sig in sig_by_type["val"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")
            
            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")
    
    # Process modules
    if "module" in sig_by_type:
        md_lines.append("## Modules")
        for sig in sig_by_type["module"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")
            
            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")
    
    # Process classes
    if "class" in sig_by_type:
        md_lines.append("## Classes")
        for sig in sig_by_type["class"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")
            
            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")
    
    # Process remaining signature types
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in ["type", "val", "module", "class", "exception"]:
            md_lines.append(f"## {sig_type.capitalize()}s")
            for sig in sigs:
                md_lines.append("")
                md_lines.append(f"### `{sig['content']}`")
                
                # Add documentation if available
                if sig["doc"]:
                    md_lines.append("")
                    md_lines.append(sig["doc"])
            md_lines.append("")
    
    return "\n".join(md_lines)


def read_json_file(file_path):
    """
    Read a JSON file with robust error handling for encoding issues.
    
    Args:
        file_path: Path to the JSON file
        
    Returns:
        Content of the JSON file as a string, or None if there was an error
    """
    # Try UTF-8 first (most common encoding)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Try other encodings if UTF-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
            return None


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files."""
    hierarchy = defaultdict(list)
    
    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)
        
        # Skip irrelevant JSON files
        if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
            # For index.html.json, check if it's a module documentation
            if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
                json_content = read_json_file(json_file)
                if json_content:
                    try:
                        # Try to parse the module info
                        module_info = extract_module_info(json_content)
                        signatures = parse_module_signature(module_info["content"])
                        
                        # Determine package name and version from path
                        package_name, package_version = determine_package_info(json_file, package_parts, module_info)
                        
                        # Use package name and version for the hierarchy key
                        package_key = f"{package_name}"
                        if package_version != "unknown":
                            # Add version information to module_info for display in markdown
                            module_info["package_version"] = package_version
                        
                        hierarchy[package_key].append({
                            "file": json_file,
                            "module_info": module_info,
                            "signatures": signatures,
                            "path_parts": package_parts
                        })
                    except Exception as e:
                        print(f"Error processing {json_file}: {e}", file=sys.stderr)
            
            continue
        
        # Try to parse other JSON files (non-index.html.json)
        json_content = read_json_file(json_file)
        if json_content:
            try:
                module_info = extract_module_info(json_content)
                signatures = parse_module_signature(module_info["content"])
                
                # Determine package name from path
                package_name = determine_package_name(package_parts, module_info)
                
                hierarchy[package_name].append({
                    "file": json_file,
                    "module_info": module_info,
                    "signatures": signatures,
                    "path_parts": package_parts
                })
            except Exception as e:
                print(f"Error processing {json_file}: {e}", file=sys.stderr)
    
    return hierarchy


def determine_package_info(file_path, path_parts, module_info):
    """
    Determine package name and version from file path and module info.
    
    Args:
        file_path: The full file path
        path_parts: Parts of the path
        module_info: Extracted module information
        
    Returns:
        Tuple of (package_name, package_version)
    """
    package_name = "unknown"
    package_version = "unknown"
    
    # Try to extract from breadcrumbs if available
    if module_info["breadcrumbs"] and any("Library" in crumb for crumb in module_info["breadcrumbs"]):
        for crumb in module_info["breadcrumbs"]:
            if "Library" in crumb:
                # Extract library name from the breadcrumb
                match = re.search(r'Library\s+(.+)', crumb)
                if match:
                    package_name = match.group(1).strip()
    
    # Look for test/package-name/version pattern in the path
    file_path_parts = Path(file_path).resolve().parts
    for i, part in enumerate(file_path_parts):
        if part == "test" and i + 2 < len(file_path_parts):
            # We found a test directory, extract package name and version
            package_name = file_path_parts[i + 1]
            package_version = file_path_parts[i + 2]
            break
    
    # If still unknown, fall back to using the first part of the path
    if package_name == "unknown" and len(path_parts) > 0:
        package_name = path_parts[0]
    
    # Last resort - use module name or "unknown"
    if package_name == "unknown":
        package_name = module_info["name"] if module_info["name"] else "unknown"
    
    return package_name, package_version


def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation."""
    # First sort by breadcrumb length (shorter = higher in hierarchy)
    # Then sort alphabetically within the same level
    return sorted(modules, key=lambda x: (
        len(x["module_info"]["breadcrumbs"]), 
        x["module_info"]["breadcrumbs"][-1] if x["module_info"]["breadcrumbs"] else ""
    ))


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library."""
    md_lines = []
    
    md_lines.append(f"# Library: {lib_name}")
    md_lines.append("")
    
    # Sort modules hierarchically
    sorted_modules = sort_modules_hierarchically(modules)
    
    for module in sorted_modules:
        module_md = generate_markdown(module["module_info"], module["signatures"])
        md_lines.append(module_md)
        md_lines.append("\n---\n")
    
    return "\n".join(md_lines)


def main():
    """
    Main entry point for the script.
    
    Usage examples:
    
    # Process all packages in a directory
    python odoc2llm.py /path/to/odoc/output
    
    # Process all packages and specify output file
    python odoc2llm.py /path/to/odoc/output --output documentation.md
    
    # Process a specific package only
    python odoc2llm.py /path/to/odoc/output --package package-name
    
    # Enable verbose output
    python odoc2llm.py /path/to/odoc/output --verbose
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()
    
    html_dir = Path(args.html_dir)
    
    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)
    
    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))
    
    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)
    
    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)
    
    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)
    
    # Generate markdown for all or specific package
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")
        
        markdown = "\n".join(markdown_parts)
    
    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)
    
    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()