Convert OCaml odoc output into structured JSON records for use with MCP.
1#!/usr/bin/env python3
2# /// script
3# requires-python = ">=3.11"
4# dependencies = [
5# "bs4",
6# ]
7# ///
8"""
9odoc2json.py - Convert odoc JSON output to structured JSON records
10
11This script parses the JSON output files from odoc-driver (an OCaml documentation
12generator) and converts them into structured JSON records that include package name,
13version, and each function signature with associated documentation.
14
15The output is intended for further processing, analysis, and search over OCaml type
16signatures, especially for loading into columnar formats like Parquet.
17"""
18
19import os
20import json
21import re
22from bs4 import BeautifulSoup
23from typing import Dict, List, Any, Optional, Tuple
24import argparse
25from pathlib import Path
26
27
def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    Expects odoc-driver output laid out as ".../package_name/version", so the
    version is the final path component and the package name is the one
    before it.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version); "unknown" fills in any
        component that cannot be determined from the path.
    """
    # Use Path for more reliable path parsing
    p = Path(path).resolve()
    # Drop the filesystem anchor ("/" or a drive root) so it is never
    # mistaken for a package name (e.g. "/pkg" previously yielded
    # ("/", "pkg") because the root counted as a path component).
    parts = [part for part in p.parts if part != p.anchor]

    # If the path is in the format ".../package_name/version"
    if len(parts) >= 2:
        # The package name is the second-to-last component,
        # the version is the last component.
        return parts[-2], parts[-1]
    if len(parts) == 1:
        # If only one component, assume it's the package name
        return parts[0], "unknown"
    return "unknown", "unknown"
52
53
def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Each "odoc-spec" block in the rendered HTML describes one item (value,
    type, module, ...). For every block this extracts its kind, name,
    whitespace-normalized signature, and plain-text documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information; keys present
        per item: 'kind', and optionally 'name', 'signature', 'documentation'.
    """
    soup = BeautifulSoup(content, 'html.parser')
    result = []

    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        item = {}

        # Get the spec element (contains the signature)
        spec_elem = spec.find(class_="spec")
        if not spec_elem:
            continue

        # Determine the kind of element from the spec's CSS classes
        kind = None
        for cls in spec_elem.get('class', []):
            if cls in ['type', 'value', 'module', 'class', 'exception', 'constructor']:
                kind = cls
                break

        if not kind:
            continue

        item['kind'] = kind

        # Extract the signature first to use for name extraction if needed
        code_elem = spec_elem.find('code')
        signature = ""
        if code_elem:
            # Get the full signature text; whitespace is normalized below
            signature = code_elem.get_text()

        # Extract the name
        name = None

        # First try to get name from anchor ID
        anchor = spec_elem.find('a', class_="anchor")
        if anchor and anchor.get('id'):
            item_id = anchor.get('id')
            # The ID may be dotted (e.g. "Module.val-foo"); keep the last segment
            name = item_id.split('.')[-1] if '.' in item_id else item_id
            # Remove kind prefixes like 'type-', 'val-', etc.
            # Includes 'constructor-' so it matches the kinds recognized
            # above (previously constructor anchors kept their prefix).
            name = re.sub(r'^(type|val|module|class|exception|constructor)-', '', name)

        # For values (functions), extract the name from signature as a fallback
        # This handles cases where the anchor doesn't contain the function name
        if kind == 'value' and not name and signature:
            # Look for "val name :" pattern in the signature
            val_match = re.search(r'val\s+(\w+)\s*:', signature)
            if val_match:
                name = val_match.group(1)

        if name:
            item['name'] = name

        # Add the processed signature
        if signature:
            # Replace newlines and multiple whitespace with a single space
            signature = re.sub(r'\s+', ' ', signature)
            item['signature'] = signature.strip()

        # Extract documentation
        doc_elem = spec.find(class_="spec-doc")
        if doc_elem:
            # Get the raw HTML content and remove all HTML tags
            html_content = str(doc_elem)
            # First, convert <br> tags to spaces so line breaks survive
            # as word separators after tag stripping
            html_content = re.sub(r'<br\s*/?\s*>', ' ', html_content)
            # Parse the modified HTML
            soup_doc = BeautifulSoup(html_content, 'html.parser')
            # Get text with all whitespace normalized
            doc = soup_doc.get_text()
            # Replace all newlines and multiple spaces with a single space
            doc = re.sub(r'\s+', ' ', doc)
            item['documentation'] = doc.strip()

        # Add the item to our results
        result.append(item)

    return result
143
144
def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information; empty on
        malformed JSON or when the file has no 'content' field.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}")
            return []

    if 'content' not in data:
        return []

    # Extract module path from breadcrumbs, skipping crumbs without a
    # name so that ".".join below cannot fail on a None entry.
    module_path = []
    if 'breadcrumbs' in data:
        for crumb in data['breadcrumbs']:
            if crumb.get('kind') == 'module' and crumb.get('name'):
                module_path.append(crumb['name'])

    # Fall back to the containing directory name when no module breadcrumbs exist
    module_name = ".".join(module_path) if module_path else os.path.basename(os.path.dirname(file_path))

    # Extract items from the content
    items = parse_html_content(data['content'])

    # Add package and module information to each item
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name

        # Create a full path for the item that includes the item name
        # - module_name: just the module hierarchy (e.g., "Math.Operations")
        # - full_path: complete path including item name (e.g., "Math.Operations.add")
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items
194
195
def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Recursively process every odoc JSON file found under a directory.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    pkg_name, pkg_version = extract_package_info(directory)

    collected: List[Dict[str, Any]] = []
    for dirpath, _dirnames, filenames in os.walk(directory):
        # Only odoc's per-page JSON files are of interest
        json_names = (name for name in filenames if name.endswith('.html.json'))
        for name in json_names:
            full_path = os.path.join(dirpath, name)
            collected.extend(process_json_file(full_path, pkg_name, pkg_version))

    return collected
217
218
def main():
    """Command-line entry point: parse arguments, convert a directory, write JSON."""
    arg_parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    arg_parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    arg_parser.add_argument('output_file', help='Output JSON file path')
    arg_parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = arg_parser.parse_args()

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # indent=None is json.dump's default and produces compact output,
    # so this single call covers both pretty and compact modes.
    indent = 2 if args.pretty else None
    with open(args.output_file, 'w', encoding='utf-8') as out:
        json.dump(items, out, indent=indent, ensure_ascii=False)

    print(f"Processed {len(items)} items and saved to {args.output_file}")
237
238
239if __name__ == "__main__":
240 main()