Convert OCaml odoc JSON output into structured records for loading into MCP.
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2json.py - Convert odoc JSON output to structured JSON records

This script parses the JSON output files from odoc-driver (an OCaml documentation
generator) and converts them into structured JSON records that include package name,
version, and each function signature with associated documentation.

The output is intended for further processing, analysis, and search over OCaml type
signatures, especially for loading into columnar formats like Parquet.
"""

import argparse
import json
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Tuple

from bs4 import BeautifulSoup

# Compiled once at module level so the per-item loops below don't pay a
# regex-cache lookup on every documented element.
_KIND_PREFIX_RE = re.compile(r'^(type|val|module|class|exception)-')
_VAL_NAME_RE = re.compile(r'val\s+(\w+)\s*:')
_WHITESPACE_RE = re.compile(r'\s+')
_BR_TAG_RE = re.compile(r'<br\s*/?\s*>')

# Spec CSS classes that map to a documentable kind of OCaml element.
_KNOWN_KINDS = frozenset(
    ['type', 'value', 'module', 'class', 'exception', 'constructor']
)


def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    odoc-driver lays its output out as ".../package_name/version/...", so the
    last two components of the (resolved) directory path carry the metadata.

    NOTE(review): resolve() makes the path absolute, so for any real path there
    are at least two components; the fallback branches are defensive only.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version)
    """
    parts = Path(path).resolve().parts
    if len(parts) >= 2:
        # Second-to-last component is the package name, last is the version.
        return parts[-2], parts[-1]
    if len(parts) == 1:
        return parts[0], "unknown"
    return "unknown", "unknown"


def _spec_kind(spec_elem) -> str | None:
    """Return the element kind ('type', 'value', ...) from the spec's CSS classes, or None."""
    for cls in spec_elem.get('class', []):
        if cls in _KNOWN_KINDS:
            return cls
    return None


def _spec_name(spec_elem, kind: str, signature: str) -> str | None:
    """
    Extract the item name from the spec's anchor id, with a signature fallback.

    Anchor ids look like "val-map" or "Module.type-t": keep the last dotted
    component and strip the kind prefix. For values whose anchor doesn't yield
    a name, fall back to the "val name :" pattern in the signature text.
    """
    name = None
    anchor = spec_elem.find('a', class_="anchor")
    if anchor and anchor.get('id'):
        name = anchor['id'].rsplit('.', 1)[-1]
        name = _KIND_PREFIX_RE.sub('', name)
    if kind == 'value' and not name and signature:
        val_match = _VAL_NAME_RE.search(signature)
        if val_match:
            name = val_match.group(1)
    return name


def _clean_doc(doc_elem) -> str:
    """
    Flatten a spec-doc element into a single whitespace-normalized line of text.

    <br> tags carry no text of their own, so a plain get_text() would glue the
    surrounding words together; replace them with spaces before re-parsing.
    """
    html_content = _BR_TAG_RE.sub(' ', str(doc_elem))
    text = BeautifulSoup(html_content, 'html.parser').get_text()
    return _WHITESPACE_RE.sub(' ', text).strip()


def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information
    """
    soup = BeautifulSoup(content, 'html.parser')
    result: List[Dict[str, Any]] = []

    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        # The spec element carries the signature and the kind classes.
        spec_elem = spec.find(class_="spec")
        if not spec_elem:
            continue

        kind = _spec_kind(spec_elem)
        if not kind:
            continue

        item: Dict[str, Any] = {'kind': kind}

        # Extract the raw signature first: it doubles as a name fallback.
        code_elem = spec_elem.find('code')
        signature = code_elem.get_text() if code_elem else ""

        name = _spec_name(spec_elem, kind, signature)
        if name:
            item['name'] = name

        if signature:
            # Collapse newlines and runs of whitespace to single spaces.
            item['signature'] = _WHITESPACE_RE.sub(' ', signature).strip()

        doc_elem = spec.find(class_="spec-doc")
        if doc_elem:
            item['documentation'] = _clean_doc(doc_elem)

        result.append(item)

    return result


def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        print(f"Error decoding JSON from {file_path}")
        return []

    if 'content' not in data:
        return []

    # Build the dotted module path from the breadcrumb trail. Crumbs with a
    # missing name are skipped so ".".join() can't fail on a None entry.
    module_path = [
        crumb['name']
        for crumb in data.get('breadcrumbs', [])
        if crumb.get('kind') == 'module' and crumb.get('name')
    ]
    module_name = ".".join(module_path) if module_path else os.path.basename(os.path.dirname(file_path))

    # Extract items from the content
    items = parse_html_content(data['content'])

    # Add package and module information to each item:
    # - module_name: just the module hierarchy (e.g., "Math.Operations")
    # - full_path: complete path including item name (e.g., "Math.Operations.add")
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name
        item['full_path'] = f"{module_name}.{item['name']}" if 'name' in item else module_name

    return items


def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    all_items: List[Dict[str, Any]] = []
    package_name, package_version = extract_package_info(directory)

    for root, _, files in os.walk(directory):
        for file in files:
            # odoc-driver emits one "<page>.html.json" per documentation page.
            if file.endswith('.html.json'):
                file_path = os.path.join(root, file)
                all_items.extend(process_json_file(file_path, package_name, package_version))

    return all_items


def main():
    """CLI entry point: convert a directory of odoc JSON into one output file."""
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = parser.parse_args()

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # Write the output
    with open(args.output_file, 'w', encoding='utf-8') as f:
        if args.pretty:
            json.dump(items, f, indent=2, ensure_ascii=False)
        else:
            json.dump(items, f, ensure_ascii=False)

    print(f"Processed {len(items)} items and saved to {args.output_file}")


if __name__ == "__main__":
    main()