#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2json.py - Convert odoc JSON output to structured JSON records

This script parses the JSON output files from odoc-driver (an OCaml
documentation generator) and converts them into structured JSON records
that include package name, version, and each function signature with
associated documentation.

The output is intended for further processing, analysis, and search over
OCaml type signatures, especially for loading into columnar formats like
Parquet.
"""

import argparse
import json
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Tuple

from bs4 import BeautifulSoup

# Compiled once at module load; these run for every spec of every file.
# Matches <br>, <br/>, <br /> so line breaks inside doc comments become
# spaces instead of gluing adjacent words together.
_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
# Collapses any run of whitespace (including newlines) to a single space.
_WS_RE = re.compile(r'\s+')
# Strips odoc anchor-id prefixes such as "type-" or "val-".
_KIND_PREFIX_RE = re.compile(r'^(type|val|module|class|exception)-')


def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    Assumes the odoc output layout ".../package_name/version", so the
    last two components of the resolved path are taken as (name, version).

    Args:
        path: Path to the odoc output directory.

    Returns:
        Tuple of (package_name, package_version); "unknown" is substituted
        for any component that cannot be determined.
    """
    # Path.resolve() gives a normalized absolute path, making the
    # component split independent of trailing slashes or "..".
    parts = list(Path(path).resolve().parts)
    if len(parts) >= 2:
        return parts[-2], parts[-1]
    if len(parts) == 1:
        return parts[0], "unknown"
    return "unknown", "unknown"


def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse HTML content from an odoc JSON file into structured records.

    Args:
        content: HTML fragment stored under the "content" key of an odoc
            JSON file.

    Returns:
        List of dicts, each with a 'kind' key and, when available,
        'name', 'signature', and 'documentation' keys.
    """
    soup = BeautifulSoup(content, 'html.parser')
    result: List[Dict[str, Any]] = []

    # Each ".odoc-spec" block describes one item (function, type, module...).
    for spec in soup.find_all(class_="odoc-spec"):
        item: Dict[str, Any] = {}

        # The spec element holds the signature markup; skip decorative blocks.
        spec_elem = spec.find(class_="spec")
        if not spec_elem:
            continue

        # The element kind is encoded as a CSS class on the spec element.
        kind = next(
            (cls for cls in spec_elem.get('class', [])
             if cls in ('type', 'value', 'module', 'class',
                        'exception', 'constructor')),
            None,
        )
        if not kind:
            continue
        item['kind'] = kind

        # Raw signature text; extracted first because it doubles as a
        # fallback source for the item name below.
        code_elem = spec_elem.find('code')
        signature = code_elem.get_text() if code_elem else ""

        # Prefer the anchor id for the name, e.g. "val-map" -> "map".
        name = None
        anchor = spec_elem.find('a', class_="anchor")
        if anchor and anchor.get('id'):
            item_id = anchor.get('id')
            # split('.')[-1] returns the whole string when no dot is present.
            name = _KIND_PREFIX_RE.sub('', item_id.split('.')[-1])

        # Fallback for values: parse "val name :" out of the signature.
        # Handles anchors that do not carry the function name.
        if kind == 'value' and not name and signature:
            val_match = re.search(r'val\s+(\w+)\s*:', signature)
            if val_match:
                name = val_match.group(1)

        if name:
            item['name'] = name

        if signature:
            # Normalize newlines and runs of whitespace to single spaces.
            item['signature'] = _WS_RE.sub(' ', signature).strip()

        # Extract the documentation attached to this spec, if any.
        doc_elem = spec.find(class_="spec-doc")
        if doc_elem:
            html_content = str(doc_elem)
            # BUG FIX: this substitution previously used an EMPTY pattern
            # (re.sub(r'', ' ', ...)), which inserts a space between every
            # character of the doc text. The intent (per the original
            # comment, whose "<br/>" literal was lost to HTML stripping)
            # is to turn <br> tags into spaces before removing markup, so
            # words separated only by a line break do not run together.
            html_content = _BR_TAG_RE.sub(' ', html_content)
            # Re-parse and take the text to drop all remaining HTML tags.
            doc_text = BeautifulSoup(html_content, 'html.parser').get_text()
            item['documentation'] = _WS_RE.sub(' ', doc_text).strip()

        result.append(item)

    return result


def process_json_file(file_path: str, package_name: str,
                      package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the ".html.json" file produced by odoc.
        package_name: Name of the package.
        package_version: Version of the package.

    Returns:
        List of item dicts annotated with package and module metadata.
        Empty on JSON decode errors or when the file has no "content" key.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Best-effort: report and skip malformed files rather than abort
            # the whole directory walk.
            print(f"Error decoding JSON from {file_path}")
            return []

    if 'content' not in data:
        return []

    # Reconstruct the module path ("A.B.C") from module breadcrumbs;
    # fall back to the containing directory name when none are present.
    module_path = [crumb.get('name')
                   for crumb in data.get('breadcrumbs', [])
                   if crumb.get('kind') == 'module']
    module_name = (".".join(module_path) if module_path
                   else os.path.basename(os.path.dirname(file_path)))

    items = parse_html_content(data['content'])

    # Annotate every extracted item with its provenance:
    # - module_name: just the module hierarchy (e.g., "Math.Operations")
    # - full_path:   complete path including the item name
    #                (e.g., "Math.Operations.add")
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items


def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively.

    Args:
        directory: Path to the directory containing odoc JSON files.

    Returns:
        List of all extracted items from all ".html.json" files found.
    """
    all_items: List[Dict[str, Any]] = []
    package_name, package_version = extract_package_info(directory)

    for root, _, files in os.walk(directory):
        for file in files:
            # odoc-driver names its JSON renderings "<page>.html.json".
            if file.endswith('.html.json'):
                file_path = os.path.join(root, file)
                all_items.extend(
                    process_json_file(file_path, package_name,
                                      package_version))

    return all_items


def main():
    """Parse CLI arguments, convert the input directory, write JSON output."""
    parser = argparse.ArgumentParser(
        description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true',
                        help='Pretty-print the JSON output')
    args = parser.parse_args()

    # Process all files in the directory.
    items = process_directory(args.input_dir)

    # Write the output; ensure_ascii=False keeps non-ASCII doc text readable.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        if args.pretty:
            json.dump(items, f, indent=2, ensure_ascii=False)
        else:
            json.dump(items, f, ensure_ascii=False)

    print(f"Processed {len(items)} items and saved to {args.output_file}")


if __name__ == "__main__":
    main()