Take OCaml odoc output into MCP
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
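# Note: the inline metadata above follows PEP 723, so a metadata-aware runner
# (for example `uv run odoc2llm.py ...`) can resolve the bs4 dependency
# automatically; plain `python3 odoc2llm.py ...` also works if beautifulsoup4
# is already installed.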
8"""
9odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs
10
11This script processes JSON files generated by odoc-driver (OCaml documentation generator)
12and produces a single Markdown file with the essential module structure and signatures
13formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
14"""

import argparse
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path

from bs4 import BeautifulSoup


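# Illustrative only: a rough sketch of the JSON page shape this script consumes,
# inferred from the fields read in extract_module_info below ("header",
# "breadcrumbs", "preamble", "content"). The values are HTML fragments produced
# by odoc-driver, and the exact markup varies between odoc versions.
#
#   {
#     "header": "<h1>Module <code>Foo.Bar</code> ...</h1>",
#     "breadcrumbs": [{"name": "Library <code>foo</code>"}, {"name": "Foo"}],
#     "preamble": "<p>One-paragraph module synopsis.</p>",
#     "content": "<div class=\"odoc-spec\"> ... </div>"
#   }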
def extract_module_info(json_content):
    """Extract module information from odoc JSON content."""
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}", file=sys.stderr)
        # Return a minimal structure that won't cause errors downstream
        return {
            "name": "Unknown",
            "type": "Module",
            "breadcrumbs": [],
            "content": BeautifulSoup("", "html.parser"),
            "preamble": ""
        }

    # Extract module name and type from the header
    header = data.get("header", "")
    soup = BeautifulSoup(header, "html.parser")
    header_text = soup.get_text().strip()

    # Determine the module type
    module_type = "Module"
    if "Module type" in header_text:
        module_type = "Module type"
    elif "Class" in header_text:
        module_type = "Class"

    # Extract the actual module name
    module_name = ""
    code_tag = soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to the header text with the type prefix removed
        # ("Module type" must come before "Module" in the alternation)
        module_name = re.sub(r'^(Module type|Module|Class)\s+', '', header_text)

    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if name:
            soup = BeautifulSoup(name, "html.parser")
            clean_name = soup.get_text().strip()
            # Clean up the breadcrumb text
            clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
            breadcrumbs.append(clean_name)

    # Extract the module content
    content = data.get("content", "")
    soup = BeautifulSoup(content, "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": soup,
        "preamble": data.get("preamble", "")
    }


def clean_signature_text(text):
    """Clean up signature text for better readability."""
    # Normalize special dash and arrow characters
    text = text.replace('−', '-').replace('‑', '-').replace('→', '->')

    # Collapse runs of whitespace into a single space, except around newlines
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)

    return text

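# Illustrative example (not taken from real odoc output):
#   clean_signature_text("val map : ('a → 'b) →   'a list → 'b list")
#   returns "val map : ('a -> 'b) -> 'a list -> 'b list".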

def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.)."""
    # For val signatures: extract the function name before the first :
    match = re.match(r'val\s+([a-zA-Z0-9_\']+)\s*:', sig_content)
    if match:
        return match.group(1)

    # For type signatures: extract the type name
    match = re.match(r'type\s+([a-zA-Z0-9_\']+)(?:\s|\[|$)', sig_content)
    if match:
        return match.group(1)

    # For module signatures: extract the module name
    match = re.match(r'module\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For class signatures: extract the class name
    match = re.match(r'class\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For exception signatures: extract the exception name
    match = re.match(r'exception\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    return None

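# Illustrative examples of the extraction (hypothetical signatures):
#   extract_signature_name("val map : ('a -> 'b) -> 'a list -> 'b list")  -> "map"
#   extract_signature_name("type t")                                      -> "t"
#   extract_signature_name("exception Empty")                             -> "Empty"
#   extract_signature_name("val ( + ) : int -> int -> int")               -> None (operators are not matched)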

def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content."""
    signatures = []

    # Get all the odoc-spec divs
    spec_divs = content_soup.find_all("div", class_="odoc-spec")

    for spec in spec_divs:
        sig_id = None
        sig_type = None
        sig_content = None
        doc_content = None

        # Find the actual signature
        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # Get the ID for cross-referencing
            sig_id = sig_div.get("id", "")

            # Determine the type of signature (type, val, module, etc.)
            sig_type_span = sig_div.find("span", class_="keyword")
            if sig_type_span:
                sig_type = sig_type_span.get_text().strip()

            # Get the full code content
            code_tag = sig_div.find("code")
            if code_tag:
                # Extract the full OCaml signature text:
                # convert all spans to plain text while preserving structure
                for span in code_tag.find_all("span"):
                    span.replace_with(span.get_text())

                sig_content = clean_signature_text(code_tag.get_text())

        # Find documentation for this signature
        doc_div = spec.find("div", class_="spec-doc")
        if doc_div:
            # Process paragraphs and lists for documentation
            doc_parts = []

            # Process regular paragraphs
            for p in doc_div.find_all("p"):
                # Clean up code references in the paragraph
                for code in p.find_all("code"):
                    # Convert links within code tags to plain text
                    for a in code.find_all("a"):
                        a.replace_with(a.get_text())
                    # Keep the code tag formatting
                    code_text = code.get_text()
                    code.string = code_text

                # Clean up the paragraph text
                p_text = clean_signature_text(p.get_text()).strip()
                if p_text:
                    doc_parts.append(p_text)

            # Process bulleted lists
            for ul in doc_div.find_all("ul"):
                for li in ul.find_all("li"):
                    # Check if it's a special tag like @raises, @returns, etc.
                    tag_span = li.find("span", class_="at-tag")
                    if tag_span:
                        tag_name = tag_span.get_text().strip()
                        # Remove the tag span from consideration
                        tag_span.extract()
                        # Get the rest of the content
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"@{tag_name} {li_text}")
                    else:
                        # Regular list item
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"- {li_text}")

            # Process code examples
            for pre in doc_div.find_all("pre"):
                code = pre.find("code")
                if code:
                    # Default to OCaml; use a language-* class if one is present
                    lang = "ocaml"
                    for cls in code.get("class", []):
                        if cls.startswith("language-"):
                            lang = cls.replace("language-", "")

                    # Preserve indentation and line breaks in code blocks
                    code_text = code.get_text()
                    doc_parts.append(f"```{lang}\n{code_text}\n```")

            if doc_parts:
                doc_content = "\n".join(doc_parts)

        # Only add signatures that have content
        if sig_type and sig_content:
            # Extract the name of the element (function name, type name, etc.)
            name = extract_signature_name(sig_content)

            # Build the full signature record
            signature = {
                "id": sig_id,
                "type": sig_type,
                "name": name,
                "content": sig_content,
                "doc": doc_content
            }
            signatures.append(signature)

    return signatures

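# Each returned signature is a dict shaped like (illustrative values):
#   {"id": "val-map", "type": "val", "name": "map",
#    "content": "val map : ('a -> 'b) -> 'a t -> 'b t",
#    "doc": "map f t applies f to each element of t."}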

def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information."""
    md_lines = []

    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    # Add module preamble documentation if available
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Organize signatures by type
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    def append_section(title, sigs):
        """Append a section heading plus each signature and its documentation."""
        md_lines.append(f"## {title}")
        for sig in sigs:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")
            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
            md_lines.append("")

    # Emit the common signature kinds in a fixed order
    known_sections = [
        ("type", "Types"),
        ("exception", "Exceptions"),
        ("val", "Values"),
        ("module", "Modules"),
        ("class", "Classes"),
    ]
    for sig_type, title in known_sections:
        if sig_type in sig_by_type:
            append_section(title, sig_by_type[sig_type])

    # Process any remaining signature kinds
    known_types = {sig_type for sig_type, _ in known_sections}
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in known_types:
            append_section(f"{sig_type.capitalize()}s", sigs)

    return "\n".join(md_lines)

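# The markdown emitted for a single module looks roughly like this (illustrative):
#   # Module `Foo.Bar`
#   **Path:** Library foo > Foo > Bar
#
#   ## Values
#
#   ### `val map : ('a -> 'b) -> 'a t -> 'b t`
#   map f t applies f to each element of t.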

def read_json_file(file_path):
    """
    Read a JSON file with robust error handling for encoding issues.

    Args:
        file_path: Path to the JSON file

    Returns:
        Content of the JSON file as a string, or None if there was an error
    """
    # Try UTF-8 first (the most common encoding)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Try other encodings if UTF-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
            return None


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files."""
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Handle index/support JSON files separately
        if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
            # For index.html.json, check whether it is module documentation
            if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
                json_content = read_json_file(json_file)
                if json_content:
                    try:
                        # Try to parse the module info
                        module_info = extract_module_info(json_content)
                        signatures = parse_module_signature(module_info["content"])

                        # Determine the package name and version from the path
                        package_name, package_version = determine_package_info(json_file, package_parts, module_info)

                        # Use the package name as the hierarchy key
                        package_key = package_name
                        if package_version != "unknown":
                            # Add version information to module_info for display in markdown
                            module_info["package_version"] = package_version

                        hierarchy[package_key].append({
                            "file": json_file,
                            "module_info": module_info,
                            "signatures": signatures,
                            "path_parts": package_parts
                        })
                    except Exception as e:
                        print(f"Error processing {json_file}: {e}", file=sys.stderr)

            continue

        # Try to parse other JSON files (non-index.html.json)
        json_content = read_json_file(json_file)
        if json_content:
            try:
                module_info = extract_module_info(json_content)
                signatures = parse_module_signature(module_info["content"])

                # Determine the package name from the path
                package_name, _ = determine_package_info(json_file, package_parts, module_info)

                hierarchy[package_name].append({
                    "file": json_file,
                    "module_info": module_info,
                    "signatures": signatures,
                    "path_parts": package_parts
                })
            except Exception as e:
                print(f"Error processing {json_file}: {e}", file=sys.stderr)

    return hierarchy

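# The resulting mapping groups module entries by package/library name, e.g.
# (illustrative): {"foo": [{"file": ..., "module_info": ..., "signatures": ...,
# "path_parts": ...}], "bar": [...]}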

def determine_package_info(file_path, path_parts, module_info):
    """
    Determine the package name and version from the file path and module info.

    Args:
        file_path: The full file path
        path_parts: Parts of the path
        module_info: Extracted module information

    Returns:
        Tuple of (package_name, package_version)
    """
    package_name = "unknown"
    package_version = "unknown"

    # Try to extract from breadcrumbs if available
    if module_info["breadcrumbs"] and any("Library" in crumb for crumb in module_info["breadcrumbs"]):
        for crumb in module_info["breadcrumbs"]:
            if "Library" in crumb:
                # Extract the library name from the breadcrumb
                match = re.search(r'Library\s+(.+)', crumb)
                if match:
                    package_name = match.group(1).strip()

    # Look for a test/package-name/version pattern in the path
    file_path_parts = Path(file_path).resolve().parts
    for i, part in enumerate(file_path_parts):
        if part == "test" and i + 2 < len(file_path_parts):
            # We found a test directory; extract the package name and version
            package_name = file_path_parts[i + 1]
            package_version = file_path_parts[i + 2]
            break

    # If still unknown, fall back to using the first part of the path
    if package_name == "unknown" and len(path_parts) > 0:
        package_name = path_parts[0]

    # Last resort - use the module name or "unknown"
    if package_name == "unknown":
        package_name = module_info["name"] if module_info["name"] else "unknown"

    return package_name, package_version

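# For example (hypothetical path): a file under .../test/mylib/1.2.3/... would
# yield ("mylib", "1.2.3"), since the function looks for a "test" path component
# followed by <package>/<version> components.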

def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation."""
    # Sort by breadcrumb length first (shorter = higher in the hierarchy),
    # then alphabetically within the same level
    return sorted(modules, key=lambda x: (
        len(x["module_info"]["breadcrumbs"]),
        x["module_info"]["breadcrumbs"][-1] if x["module_info"]["breadcrumbs"] else ""
    ))


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library."""
    md_lines = []

    md_lines.append(f"# Library: {lib_name}")
    md_lines.append("")

    # Sort modules hierarchically
    sorted_modules = sort_modules_hierarchically(modules)

    for module in sorted_modules:
        module_md = generate_markdown(module["module_info"], module["signatures"])
        md_lines.append(module_md)
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    """
    Main entry point for the script.

    Usage examples:

        # Process all packages in a directory
        python odoc2llm.py /path/to/odoc/output

        # Process all packages and specify the output file
        python odoc2llm.py /path/to/odoc/output --output documentation.md

        # Process a specific package only
        python odoc2llm.py /path/to/odoc/output --package package-name

        # Enable verbose output
        python odoc2llm.py /path/to/odoc/output --verbose
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc-generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build the module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for a specific package, or for all packages
    if args.package and args.package not in hierarchy:
        print(f"Warning: package {args.package} not found; generating documentation for all packages",
              file=sys.stderr)

    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write the markdown to the output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()