Take OCaml odoc output into MCP

Add a version that outputs structured JSON records suitable for loading into Parquet.

Changed files
+240
+240
odoc2json.py
···
+
#!/usr/bin/env python3
+
# /// script
+
# requires-python = ">=3.11"
+
# dependencies = [
+
# "bs4",
+
# ]
+
# ///
+
"""
+
odoc2json.py - Convert odoc JSON output to structured JSON records
+
+
This script parses the JSON output files from odoc-driver (an OCaml documentation
+
generator) and converts them into structured JSON records that include package name,
+
version, and each function signature with associated documentation.
+
+
The output is intended for further processing, analysis, and search over OCaml type
+
signatures, especially for loading into columnar formats like Parquet.
+
"""
+
+
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup
+
+
+
def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Derive (package_name, package_version) from an odoc output directory path.

    The odoc-driver layout ends in ".../package_name/version", so the last two
    path components are taken as the name and the version respectively.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version); "unknown" fills any
        component that cannot be determined.
    """
    # Resolve first so relative paths and "." / ".." segments are normalized.
    components = Path(path).resolve().parts

    if len(components) >= 2:
        # Layout ".../package_name/version": version last, name second-to-last.
        return components[-2], components[-1]
    if len(components) == 1:
        # A single component is taken to be the package name alone.
        return components[0], "unknown"
    return "unknown", "unknown"
+
+
+
def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Extract signatures and documentation from odoc-generated HTML markup.

    Walks every "odoc-spec" block in the markup and pulls out the item's
    kind (type, value, module, ...), its name, its full signature text,
    and any attached prose documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information
    """
    recognized_kinds = ('type', 'value', 'module', 'class', 'exception', 'constructor')

    def squash_ws(text: str) -> str:
        # Collapse every run of whitespace (including newlines) to one space.
        return re.sub(r'\s+', ' ', text).strip()

    markup = BeautifulSoup(content, 'html.parser')
    records: List[Dict[str, Any]] = []

    # Each "odoc-spec" block describes one declaration (function, type, ...).
    for block in markup.find_all(class_="odoc-spec"):
        decl = block.find(class_="spec")
        if not decl:
            continue

        # The spec's CSS classes identify what kind of declaration this is.
        kind = next((c for c in decl.get('class', []) if c in recognized_kinds), None)
        if not kind:
            continue

        entry: Dict[str, Any] = {'kind': kind}

        # Raw signature text; also used as a fallback source for the name.
        code_node = decl.find('code')
        raw_signature = code_node.get_text() if code_node else ""

        # Preferred name source: the anchor element's id attribute.
        name = None
        anchor = decl.find('a', class_="anchor")
        if anchor and anchor.get('id'):
            anchor_id = anchor.get('id')
            # Keep only the last dotted segment, then drop kind prefixes
            # such as "type-" or "val-".
            name = anchor_id.split('.')[-1] if '.' in anchor_id else anchor_id
            name = re.sub(r'^(type|val|module|class|exception)-', '', name)

        # Fallback for values whose anchor lacks the function name:
        # pull the identifier out of the "val <name> : ..." signature.
        if kind == 'value' and not name and raw_signature:
            val_match = re.search(r'val\s+(\w+)\s*:', raw_signature)
            if val_match:
                name = val_match.group(1)

        if name:
            entry['name'] = name

        if raw_signature:
            entry['signature'] = squash_ws(raw_signature)

        # Documentation: turn <br> tags into spaces first so line breaks
        # survive as word separators, then strip all remaining markup.
        doc_node = block.find(class_="spec-doc")
        if doc_node:
            flattened = re.sub(r'<br\s*/?\s*>', ' ', str(doc_node))
            plain_text = BeautifulSoup(flattened, 'html.parser').get_text()
            entry['documentation'] = squash_ws(plain_text)

        records.append(entry)

    return records
+
+
+
def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information; empty if the
        file is not valid JSON or has no "content" field.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Diagnostics go to stderr so they never pollute piped/captured
            # output (fix: this previously printed to stdout).
            print(f"Error decoding JSON from {file_path}", file=sys.stderr)
            return []

    if 'content' not in data:
        return []

    # Reconstruct the dotted module path (e.g. "Math.Operations") from the
    # breadcrumb trail. Skip crumbs without a name — appending None here
    # previously crashed the ".".join below with a TypeError.
    module_path = []
    if 'breadcrumbs' in data:
        for crumb in data['breadcrumbs']:
            if crumb.get('kind') == 'module' and crumb.get('name'):
                module_path.append(crumb['name'])

    # Fall back to the containing directory name when no module breadcrumbs exist.
    module_name = ".".join(module_path) if module_path else os.path.basename(os.path.dirname(file_path))

    # Extract items from the rendered HTML content.
    items = parse_html_content(data['content'])

    # Attach package and module metadata to each extracted item.
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name

        # Create a full path for the item that includes the item name:
        # - module_name: just the module hierarchy (e.g., "Math.Operations")
        # - full_path: complete path including item name (e.g., "Math.Operations.add")
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items
+
+
+
def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Recursively collect items from every odoc JSON file under a directory.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    pkg, version = extract_package_info(directory)
    collected: List[Dict[str, Any]] = []

    # Only files produced by odoc-driver (".html.json") are of interest.
    for root, _dirs, filenames in os.walk(directory):
        collected.extend(
            item
            for fname in filenames
            if fname.endswith('.html.json')
            for item in process_json_file(os.path.join(root, fname), pkg, version)
        )

    return collected
+
+
+
def main():
    """CLI entry point: parse arguments, convert a directory, write the JSON."""
    arg_parser = argparse.ArgumentParser(
        description='Convert odoc JSON to structured JSON records')
    arg_parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    arg_parser.add_argument('output_file', help='Output JSON file path')
    arg_parser.add_argument('--pretty', action='store_true',
                            help='Pretty-print the JSON output')
    opts = arg_parser.parse_args()

    # Walk the input tree and collect every extracted record.
    records = process_directory(opts.input_dir)

    # Serialize; indent only when pretty-printing was requested
    # (indent=None is json.dump's compact default).
    with open(opts.output_file, 'w', encoding='utf-8') as out:
        json.dump(records, out, ensure_ascii=False,
                  indent=2 if opts.pretty else None)

    print(f"Processed {len(records)} items and saved to {opts.output_file}")
+
+
+
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()