#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "bs4",
# ]
# ///
"""
odoc2json.py - Convert odoc JSON output to structured JSON records
This script parses the JSON output files from odoc-driver (an OCaml documentation
generator) and converts them into structured JSON records that include package name,
version, and each function signature with associated documentation.
The output is intended for further processing, analysis, and search over OCaml type
signatures, especially for loading into columnar formats like Parquet.
"""
import os
import json
import re
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
import argparse
from pathlib import Path
def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Derive the package name and version from an odoc output directory path.

    The odoc-driver layout places output under ".../package_name/version/",
    so the last two path components are taken as (name, version).

    Args:
        path: Path to the odoc output directory.

    Returns:
        Tuple of (package_name, package_version); "unknown" fills in any
        component that cannot be determined.
    """
    # Resolve to an absolute, normalized path before splitting into parts.
    components = Path(path).resolve().parts
    if len(components) >= 2:
        # name/version are the last two components of the directory path
        return components[-2], components[-1]
    if len(components) == 1:
        # A single component can only be the package name
        return components[0], "unknown"
    return "unknown", "unknown"
def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries, one per odoc spec block, with keys:
        'kind' (always present), and optionally 'name', 'signature',
        and 'documentation' (all whitespace-normalized strings).
    """
    soup = BeautifulSoup(content, 'html.parser')
    result = []
    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        item = {}
        # Get the spec element (contains the signature)
        spec_elem = spec.find(class_="spec")
        if not spec_elem:
            continue
        # Determine the kind of element from the spec's CSS classes
        kind = None
        for cls in spec_elem.get('class', []):
            if cls in ['type', 'value', 'module', 'class', 'exception', 'constructor']:
                kind = cls
                break
        if not kind:
            continue
        item['kind'] = kind
        # Extract the signature first to use for name extraction if needed
        code_elem = spec_elem.find('code')
        signature = code_elem.get_text() if code_elem else ""
        # Extract the name: first try the anchor ID, which odoc emits as
        # e.g. "val-map" or "Module.type-t"
        name = None
        anchor = spec_elem.find('a', class_="anchor")
        if anchor and anchor.get('id'):
            item_id = anchor.get('id')
            # Keep only the last dotted component, then strip the kind prefix
            name = item_id.split('.')[-1] if '.' in item_id else item_id
            name = re.sub(r'^(type|val|module|class|exception)-', '', name)
        # For values (functions), extract the name from the signature as a
        # fallback when the anchor doesn't contain the function name
        if kind == 'value' and not name and signature:
            # Look for "val name :" pattern in the signature
            val_match = re.search(r'val\s+(\w+)\s*:', signature)
            if val_match:
                name = val_match.group(1)
        if name:
            item['name'] = name
        # Add the processed signature with all whitespace collapsed
        if signature:
            item['signature'] = re.sub(r'\s+', ' ', signature).strip()
        # Extract documentation
        doc_elem = spec.find(class_="spec-doc")
        if doc_elem:
            html_content = str(doc_elem)
            # Convert <br> tags to spaces so explicit line breaks survive as
            # word separators after tag stripping.
            # BUGFIX: in the original source this comment and regex had the
            # literal "<br/>" text stripped out, leaving broken code
            # ("# First, convert / tags to spaces" and a dangling re.sub).
            html_content = re.sub(r'<br\s*/?>', ' ', html_content)
            # Re-parse the modified HTML and take the plain text
            soup_doc = BeautifulSoup(html_content, 'html.parser')
            doc = soup_doc.get_text()
            # Replace all newlines and multiple spaces with a single space
            item['documentation'] = re.sub(r'\s+', ' ', doc).strip()
        # Add the item to our results
        result.append(item)
    return result
def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file.
        package_name: Name of the package.
        package_version: Version of the package.

    Returns:
        List of item dictionaries, each annotated with package/module info;
        empty list on JSON decode errors or when the file has no 'content'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}")
            return []
    if 'content' not in data:
        return []

    # Build the dotted module path from the module breadcrumbs; fall back to
    # the containing directory name when no module breadcrumbs are present.
    crumbs = [c.get('name') for c in data.get('breadcrumbs', []) if c.get('kind') == 'module']
    module_name = ".".join(crumbs) if crumbs else os.path.basename(os.path.dirname(file_path))

    items = parse_html_content(data['content'])

    # Annotate every extracted item with package and module provenance.
    # - module_name: just the module hierarchy (e.g. "Math.Operations")
    # - full_path: complete path including the item name (e.g. "Math.Operations.add")
    for entry in items:
        entry['package_name'] = package_name
        entry['package_version'] = package_version
        entry['module_name'] = module_name
        entry['full_path'] = f"{module_name}.{entry['name']}" if 'name' in entry else module_name
    return items
def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Process all odoc JSON files ("*.html.json") in a directory recursively.

    Args:
        directory: Path to the directory containing odoc JSON files.

    Returns:
        List of all extracted items from all files, in os.walk order.
    """
    # Package identity is derived once from the root directory path
    # and shared by every file beneath it.
    package_name, package_version = extract_package_info(directory)

    collected: List[Dict[str, Any]] = []
    for root, _dirs, filenames in os.walk(directory):
        for fname in filenames:
            if not fname.endswith('.html.json'):
                continue
            full_path = os.path.join(root, fname)
            collected.extend(process_json_file(full_path, package_name, package_version))
    return collected
def main():
    """CLI entry point: parse arguments, process the input directory, and
    write the extracted items to the output JSON file."""
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = parser.parse_args()

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # indent=None produces the compact single-line form, so one dump call
    # covers both the --pretty and default modes.
    indent = 2 if args.pretty else None
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=indent, ensure_ascii=False)
    print(f"Processed {len(items)} items and saved to {args.output_file}")


if __name__ == "__main__":
    main()