Convert OCaml odoc output into structured JSON records for use with MCP.
1#!/usr/bin/env python3
2# /// script
3# requires-python = ">=3.11"
4# dependencies = [
5# "bs4",
6# ]
7# ///
8"""
9odoc2json.py - Convert odoc JSON output to structured JSON records
10
11This script parses the JSON output files from odoc-driver (an OCaml documentation
12generator) and converts them into structured JSON records that include package name,
13version, and each function signature with associated documentation.
14
15The output is intended for further processing, analysis, and search over OCaml type
16signatures, especially for loading into columnar formats like Parquet.
17"""
18
19import os
20import json
21import re
22from bs4 import BeautifulSoup
23from typing import Dict, List, Any, Optional, Tuple
24import argparse
25from pathlib import Path
26
27
def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    Expects odoc-driver output laid out as ".../package_name/version", so the
    version is the final path component and the package name is the one
    before it.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version); "unknown" fills in any
        component that cannot be determined from the path.
    """
    # Use Path for more reliable path parsing
    p = Path(path).resolve()
    # Drop the filesystem anchor ("/" or a drive root) so it is never
    # mistaken for a package name (e.g. "/pkg" previously yielded
    # ("/", "pkg") because the root counted as a path component).
    parts = [part for part in p.parts if part != p.anchor]

    # If the path is in the format ".../package_name/version"
    if len(parts) >= 2:
        # The package name is the second-to-last component,
        # the version is the last component.
        return parts[-2], parts[-1]
    if len(parts) == 1:
        # If only one component, assume it's the package name
        return parts[0], "unknown"
    return "unknown", "unknown"
52
53
def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Each "odoc-spec" block in the rendered HTML describes one item (value,
    type, module, ...). For every block this extracts its kind, name,
    whitespace-normalized signature, and plain-text documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information; keys present
        per item: 'kind', and optionally 'name', 'signature', 'documentation'.
    """
    soup = BeautifulSoup(content, 'html.parser')
    result = []

    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        item = {}

        # Get the spec element (contains the signature)
        spec_elem = spec.find(class_="spec")
        if not spec_elem:
            continue

        # Determine the kind of element from the spec's CSS classes
        kind = None
        for cls in spec_elem.get('class', []):
            if cls in ['type', 'value', 'module', 'class', 'exception', 'constructor']:
                kind = cls
                break

        if not kind:
            continue

        item['kind'] = kind

        # Extract the signature first to use for name extraction if needed
        code_elem = spec_elem.find('code')
        signature = ""
        if code_elem:
            # Get the full signature text; whitespace is normalized below
            signature = code_elem.get_text()

        # Extract the name
        name = None

        # First try to get name from anchor ID
        anchor = spec_elem.find('a', class_="anchor")
        if anchor and anchor.get('id'):
            item_id = anchor.get('id')
            # The ID may be dotted (e.g. "Module.val-foo"); keep the last segment
            name = item_id.split('.')[-1] if '.' in item_id else item_id
            # Remove kind prefixes like 'type-', 'val-', etc.
            # Includes 'constructor-' so it matches the kinds recognized
            # above (previously constructor anchors kept their prefix).
            name = re.sub(r'^(type|val|module|class|exception|constructor)-', '', name)

        # For values (functions), extract the name from signature as a fallback
        # This handles cases where the anchor doesn't contain the function name
        if kind == 'value' and not name and signature:
            # Look for "val name :" pattern in the signature
            val_match = re.search(r'val\s+(\w+)\s*:', signature)
            if val_match:
                name = val_match.group(1)

        if name:
            item['name'] = name

        # Add the processed signature
        if signature:
            # Replace newlines and multiple whitespace with a single space
            signature = re.sub(r'\s+', ' ', signature)
            item['signature'] = signature.strip()

        # Extract documentation
        doc_elem = spec.find(class_="spec-doc")
        if doc_elem:
            # Get the raw HTML content and remove all HTML tags
            html_content = str(doc_elem)
            # First, convert <br> tags to spaces so line breaks survive
            # as word separators after tag stripping
            html_content = re.sub(r'<br\s*/?\s*>', ' ', html_content)
            # Parse the modified HTML
            soup_doc = BeautifulSoup(html_content, 'html.parser')
            # Get text with all whitespace normalized
            doc = soup_doc.get_text()
            # Replace all newlines and multiple spaces with a single space
            doc = re.sub(r'\s+', ' ', doc)
            item['documentation'] = doc.strip()

        # Add the item to our results
        result.append(item)

    return result
143
144
def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information; empty on
        malformed JSON or when the file has no 'content' field.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}")
            return []

    if 'content' not in data:
        return []

    # Extract module path from breadcrumbs, skipping crumbs without a
    # name so that ".".join below cannot fail on a None entry.
    module_path = []
    if 'breadcrumbs' in data:
        for crumb in data['breadcrumbs']:
            if crumb.get('kind') == 'module' and crumb.get('name'):
                module_path.append(crumb['name'])

    # Fall back to the containing directory name when no module breadcrumbs exist
    module_name = ".".join(module_path) if module_path else os.path.basename(os.path.dirname(file_path))

    # Extract items from the content
    items = parse_html_content(data['content'])

    # Add package and module information to each item
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name

        # Create a full path for the item that includes the item name
        # - module_name: just the module hierarchy (e.g., "Math.Operations")
        # - full_path: complete path including item name (e.g., "Math.Operations.add")
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items
194
195
def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Recursively process every odoc JSON file found under a directory.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    pkg_name, pkg_version = extract_package_info(directory)

    collected: List[Dict[str, Any]] = []
    for dirpath, _dirnames, filenames in os.walk(directory):
        # Only odoc's per-page JSON files are of interest
        json_names = (name for name in filenames if name.endswith('.html.json'))
        for name in json_names:
            full_path = os.path.join(dirpath, name)
            collected.extend(process_json_file(full_path, pkg_name, pkg_version))

    return collected
217
218
def main():
    """Command-line entry point: parse arguments, convert a directory, write JSON."""
    arg_parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    arg_parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    arg_parser.add_argument('output_file', help='Output JSON file path')
    arg_parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = arg_parser.parse_args()

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # indent=None is json.dump's default and produces compact output,
    # so this single call covers both pretty and compact modes.
    indent = 2 if args.pretty else None
    with open(args.output_file, 'w', encoding='utf-8') as out:
        json.dump(items, out, indent=indent, ensure_ascii=False)

    print(f"Processed {len(items)} items and saved to {args.output_file}")
237
238
239if __name__ == "__main__":
240 main()