Take OCaml odoc output into MCP
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (the OCaml documentation
generator) and produces a single Markdown file with the essential module structure
and signatures, formatted so that LLMs can reason about OCaml codebases.
"""

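# Typical invocation (a sketch: the `uv run` form assumes uv is available to
# resolve the inline script metadata above; the `_html/` path is illustrative):
#
#   uv run odoc2llm.py _html/
#   uv run odoc2llm.py _html/ --package mylib -o mylib.md --verbose
#
# The flags map directly to the argparse options defined in main() below.
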
import argparse
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path

from bs4 import BeautifulSoup


def extract_module_info(json_content):
    """Extract module information from odoc JSON content."""
    data = json.loads(json_content)

    # Extract module name and type from header
    header = data.get("header", "")
    soup = BeautifulSoup(header, "html.parser")
    header_text = soup.get_text().strip()

    # Determine module type and name
    module_type = "Module"
    if "Module type" in header_text:
        module_type = "Module type"
    elif "Class" in header_text:
        module_type = "Class"

    # Extract the actual module name
    module_name = ""
    code_tag = soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to header text with type prefix removed
        module_name = re.sub(r'^(Module|Module type|Class)\s+', '', header_text)

    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if name:
            soup = BeautifulSoup(name, "html.parser")
            clean_name = soup.get_text().strip()
            # Clean up the breadcrumb text
            clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
            breadcrumbs.append(clean_name)

    # Extract module content
    content = data.get("content", "")
    soup = BeautifulSoup(content, "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": soup,
        "preamble": data.get("preamble", "")
    }


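# For orientation, the fields extract_module_info reads from each *.html.json
# file look roughly like this (an abbreviated, illustrative shape rather than
# the full odoc-driver schema):
#
#   {
#       "header":      "<h1>Module <code>Foo.Bar</code> ...</h1>",
#       "breadcrumbs": [{"name": "foo"}, {"name": "Foo"}, {"name": "Bar"}],
#       "preamble":    "<p>Short module synopsis.</p>",
#       "content":     "<div class=\"odoc-spec\"> ... </div>"
#   }
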
def clean_signature_text(text):
    """Clean up signature text for better readability."""
    # Normalize unicode minus/hyphen variants and arrows so signatures read as plain "->"
    text = text.replace('−', '-').replace('‑', '-').replace('→', '->')

    # Collapse runs of whitespace into a single space, except immediately around newlines
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)

    return text


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.)."""
    # For val signatures: extract function name before the first :
    match = re.match(r'val\s+([a-zA-Z0-9_\']+)\s*:', sig_content)
    if match:
        return match.group(1)

    # For type signatures: extract type name
    match = re.match(r'type\s+([a-zA-Z0-9_\']+)(?:\s|\[|$)', sig_content)
    if match:
        return match.group(1)

    # For module signatures: extract module name
    match = re.match(r'module\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For class signatures: extract class name
    match = re.match(r'class\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For exception signatures: extract exception name
    match = re.match(r'exception\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    return None


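# Illustrative behaviour of extract_signature_name on hand-written signature
# strings (the examples are made up, not taken from any particular library):
#
#   extract_signature_name("val map : ('a -> 'b) -> 'a list -> 'b list")  # "map"
#   extract_signature_name("type t = Leaf | Node of t * t")               # "t"
#   extract_signature_name("exception Timeout")                           # "Timeout"
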
def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content."""
    signatures = []

    # Get all the odoc-spec divs
    spec_divs = content_soup.find_all("div", class_="odoc-spec")

    for spec in spec_divs:
        sig_id = None
        sig_type = None
        sig_content = None
        doc_content = None

        # Find the actual signature
        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # Get the ID for cross-referencing
            sig_id = sig_div.get("id", "")

            # Determine the kind of signature (type, val, module, etc.)
            sig_type_span = sig_div.find("span", class_="keyword")
            if sig_type_span:
                sig_type = sig_type_span.get_text().strip()

            # Get the full code content
            code_tag = sig_div.find("code")
            if code_tag:
                # Extract the full OCaml signature text by flattening all spans
                # to plain text while preserving their order
                for span in code_tag.find_all("span"):
                    span.replace_with(span.get_text())

                sig_content = clean_signature_text(code_tag.get_text())

        # Find documentation for this signature
        doc_div = spec.find("div", class_="spec-doc")
        if doc_div:
            # Collect paragraphs, lists and code examples for the documentation
            doc_parts = []

            # Process regular paragraphs
            for p in doc_div.find_all("p"):
                # Clean up code references in the paragraph
                for code in p.find_all("code"):
                    # Convert links within code tags to plain text
                    for a in code.find_all("a"):
                        a.replace_with(a.get_text())
                    # Keep the code tag formatting
                    code_text = code.get_text()
                    code.string = code_text

                # Clean up the paragraph text
                p_text = clean_signature_text(p.get_text()).strip()
                if p_text:
                    doc_parts.append(p_text)

            # Process bulleted lists
            for ul in doc_div.find_all("ul"):
                for li in ul.find_all("li"):
                    # Check whether it is a special tag like @raises, @returns, etc.
                    tag_span = li.find("span", class_="at-tag")
                    if tag_span:
                        tag_name = tag_span.get_text().strip()
                        # Remove the tag span from consideration
                        tag_span.extract()
                        # Get the rest of the content
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"@{tag_name} {li_text}")
                    else:
                        # Regular list item
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"- {li_text}")

            # Process code examples
            for pre in doc_div.find_all("pre"):
                code = pre.find("code")
                if code:
                    # Pick up the language from a "language-*" class if present,
                    # defaulting to OCaml
                    lang = "ocaml"
                    for cls in code.get("class", []):
                        if cls.startswith("language-"):
                            lang = cls.replace("language-", "")

                    # Preserve indentation and line breaks in code blocks
                    code_text = code.get_text()
                    doc_parts.append(f"```{lang}\n{code_text}\n```")

            if doc_parts:
                doc_content = "\n".join(doc_parts)

        # Only add signatures that have content
        if sig_type and sig_content:
            # Extract the name of the element (function name, type name, etc.)
            name = extract_signature_name(sig_content)

            # Build the full signature record
            signature = {
                "id": sig_id,
                "type": sig_type,
                "name": name,
                "content": sig_content,
                "doc": doc_content
            }
            signatures.append(signature)

    return signatures


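# Each record appended to `signatures` is a plain dict; for a value binding it
# would look roughly like this (field values are illustrative):
#
#   {"id": "val-map", "type": "val", "name": "map",
#    "content": "val map : ('a -> 'b) -> 'a t -> 'b t",
#    "doc": "map f t applies f to every element of t."}
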
def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information."""
    md_lines = []

    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    # Add module preamble documentation if available
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Organize signatures by type
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    # Process types first
    if "type" in sig_by_type:
        md_lines.append("## Types")
        for sig in sig_by_type["type"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process exceptions
    if "exception" in sig_by_type:
        md_lines.append("## Exceptions")
        for sig in sig_by_type["exception"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process values (functions)
    if "val" in sig_by_type:
        md_lines.append("## Values")
        for sig in sig_by_type["val"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process modules
    if "module" in sig_by_type:
        md_lines.append("## Modules")
        for sig in sig_by_type["module"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process classes
    if "class" in sig_by_type:
        md_lines.append("## Classes")
        for sig in sig_by_type["class"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process remaining signature types
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in ["type", "val", "module", "class", "exception"]:
            md_lines.append(f"## {sig_type.capitalize()}s")
            for sig in sigs:
                md_lines.append("")
                md_lines.append(f"### `{sig['content']}`")

                # Add documentation if available
                if sig["doc"]:
                    md_lines.append("")
                    md_lines.append(sig["doc"])
            md_lines.append("")

    return "\n".join(md_lines)


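# The Markdown emitted for one module follows this rough outline (a sketch of
# the shape only; the exact headings come from generate_markdown above):
#
#   # Module `Foo.Bar`
#   **Path:** foo > Foo > Bar
#
#   ## Types
#
#   ### `type t`
#   Documentation paragraph for t, if any.
#
#   ## Values
#
#   ### `val create : unit -> t`
#   @returns a fresh t.
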
def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files."""
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Skip irrelevant JSON files
        if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
            # For index.html.json, check whether it is module documentation
            if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
                try:
                    with open(json_file, 'r', encoding='utf-8') as f:
                        json_content = f.read()

                    # Try to parse the module info
                    module_info = extract_module_info(json_content)
                    signatures = parse_module_signature(module_info["content"])

                    # Group by package/library
                    if len(package_parts) > 1:
                        package_name = package_parts[0]
                        hierarchy[package_name].append({
                            "file": json_file,
                            "module_info": module_info,
                            "signatures": signatures,
                            "path_parts": package_parts
                        })
                except Exception as e:
                    print(f"Error processing {json_file}: {e}", file=sys.stderr)

            continue

        # Try to parse other JSON files (non-index.html.json)
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = f.read()

            module_info = extract_module_info(json_content)
            signatures = parse_module_signature(module_info["content"])

            # Group by package/library
            if len(package_parts) > 1:
                package_name = package_parts[0]
                hierarchy[package_name].append({
                    "file": json_file,
                    "module_info": module_info,
                    "signatures": signatures,
                    "path_parts": package_parts
                })
        except Exception as e:
            print(f"Error processing {json_file}: {e}", file=sys.stderr)

    return hierarchy


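# The resulting hierarchy maps each top-level package directory name to the
# modules found beneath it, e.g. (shape only, names illustrative):
#
#   {"foo": [{"file": "<html_dir>/foo/Foo/index.html.json",
#             "module_info": {...},
#             "signatures": [...],
#             "path_parts": ["foo", "Foo", "index.html.json"]}]}
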
def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation."""
    # First sort by breadcrumb length (shorter = higher in hierarchy),
    # then sort alphabetically within the same level
    return sorted(modules, key=lambda x: (
        len(x["module_info"]["breadcrumbs"]),
        x["module_info"]["breadcrumbs"][-1] if x["module_info"]["breadcrumbs"] else ""
    ))


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library."""
    md_lines = []

    md_lines.append(f"# Library: {lib_name}")
    md_lines.append("")

    # Sort modules hierarchically
    sorted_modules = sort_modules_hierarchically(modules)

    for module in sorted_modules:
        module_md = generate_markdown(module["module_info"], module["signatures"])
        md_lines.append(module_md)
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f" - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for all or a specific package
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()