Take OCaml odoc output into MCP
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (OCaml documentation generator)
and produces a single Markdown file with the essential module structure and signatures
formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
"""

import argparse
import html
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path

from bs4 import BeautifulSoup


# Page-kind prefix at the start of a header.  "Module type" is listed before
# "Module" so the longer alternative wins; ``\b`` stops "Class" from matching
# the beginning of an identifier such as "Classifier".
_HEADER_KIND_RE = re.compile(r"^(Module type|Module|Class)\b")

# Same prefixes, used to strip the kind from a header that has no <code> tag.
_HEADER_PREFIX_RE = re.compile(r"^(?:Module type|Module|Class)\s+")

# Characters normalised by clean_signature_text().
# NOTE(review): the literal characters of the original replace() chain were
# lost (they read as empty strings, which makes str.replace() insert the
# replacement between *every* character).  This table is a best-effort
# reconstruction of the four substitutions ('' / '-' / '-' / '->'); confirm
# against real odoc output.
_SIGNATURE_CHAR_MAP = str.maketrans({
    "\u200b": "",    # zero-width space
    "\u2011": "-",   # non-breaking hyphen
    "\u2212": "-",   # minus sign
    "\u2192": "->",  # rightwards arrow
})

# Runs of two or more whitespace characters that are not adjacent to a newline.
_MULTI_SPACE_RE = re.compile(r'(?<!\n)\s{2,}(?!\n)')

# One pattern per signature keyword; group 1 is the declared name.
_SIGNATURE_NAME_PATTERNS = (
    # val foo : ...   /   val (+) : ...
    re.compile(r"val\s+(\([^)]+\)|[a-zA-Z0-9_']+)\s*:"),
    # type t   /   type 'a t   /   type ('a, 'b) t   (type parameters skipped)
    re.compile(
        r"type\s+(?:(?:[+-]?'[a-zA-Z0-9_]+|\([^)]*\))\s*)?"
        r"([a-zA-Z_][a-zA-Z0-9_']*)(?:\s|\[|$)"
    ),
    # module M   /   module type S
    re.compile(r"module\s+(?:type\s+)?([a-zA-Z0-9_']+)"),
    # class c   /   class type c   /   class virtual c
    re.compile(r"class\s+(?:type\s+)?(?:virtual\s+)?([a-zA-Z0-9_']+)"),
    # exception E
    re.compile(r"exception\s+([a-zA-Z0-9_']+)"),
)

# Sections emitted first, in this order, by generate_markdown().
_SECTION_ORDER = (
    ("type", "Types"),
    ("exception", "Exceptions"),
    ("val", "Values"),
    ("module", "Modules"),
    ("class", "Classes"),
)

# File names that are never treated as module documentation.
_SKIPPED_BASENAMES = frozenset({"sidebar.json", "status.json", "sherlodoc_db.js"})


def extract_module_info(json_content):
    """Extract module information from odoc JSON content.

    Args:
        json_content: Text of one odoc JSON file.  The keys read are
            ``header``, ``breadcrumbs``, ``content`` and ``preamble``;
            missing or ``null`` values are treated as empty.

    Returns:
        A dict with keys ``name`` (str), ``type`` ("Module", "Module type"
        or "Class"), ``breadcrumbs`` (list of str), ``content`` (the parsed
        ``content`` HTML as a BeautifulSoup object) and ``preamble`` (the raw
        preamble HTML string).

    Raises:
        json.JSONDecodeError: If ``json_content`` is not valid JSON.
    """
    data = json.loads(json_content)

    header_soup = BeautifulSoup(data.get("header") or "", "html.parser")
    header_text = header_soup.get_text().strip()

    # Only the leading words of the header decide the kind; a plain substring
    # test would misclassify e.g. a module named "Classifier" as a class.
    kind_match = _HEADER_KIND_RE.match(header_text)
    module_type = kind_match.group(1) if kind_match else "Module"

    # Prefer the name inside <code>; otherwise strip the kind prefix.
    code_tag = header_soup.find("code")
    if code_tag is not None:
        module_name = code_tag.get_text().strip()
    else:
        module_name = _HEADER_PREFIX_RE.sub("", header_text)

    breadcrumbs = []
    for crumb in data.get("breadcrumbs") or []:
        raw_name = crumb.get("name", "")
        if not raw_name:
            continue
        crumb_text = BeautifulSoup(raw_name, "html.parser").get_text().strip()
        # Drop the backticks around a library name: Library `foo` -> Library foo
        crumb_text = re.sub(r'Library\s+`([^`]+)`', r'Library \1', crumb_text)
        breadcrumbs.append(crumb_text)

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": BeautifulSoup(data.get("content") or "", "html.parser"),
        "preamble": data.get("preamble") or "",
    }


def clean_signature_text(text: str) -> str:
    """Clean up signature text for better readability.

    Normalises the characters listed in ``_SIGNATURE_CHAR_MAP`` to plain
    ASCII and collapses runs of two or more whitespace characters (that are
    not adjacent to a newline) into a single space.
    """
    text = text.translate(_SIGNATURE_CHAR_MAP)
    return _MULTI_SPACE_RE.sub(' ', text)


def extract_signature_name(sig_content: str) -> str | None:
    """Extract the name of a signature (function name, type name, etc.).

    Recognises ``val``, ``type``, ``module``, ``class`` and ``exception``
    signatures that start at the beginning of ``sig_content``.  Type
    parameters (``'a``, ``('a, 'b)``) and the ``type`` / ``virtual``
    modifiers after ``module`` / ``class`` are skipped so that the declared
    identifier is returned rather than the modifier.

    Returns:
        The declared name, or ``None`` when no pattern matches.
    """
    for pattern in _SIGNATURE_NAME_PATTERNS:
        match = pattern.match(sig_content)
        if match:
            return match.group(1)
    return None


def _extract_doc_text(doc_div):
    """Flatten one ``spec-doc`` div into plain text, or None if it is empty.

    Paragraphs come first, then bulleted list items, then code examples
    (this grouping, rather than document order, is the established output
    format).
    """
    doc_parts = []

    # Regular paragraphs.  A <p> nested in a bulleted item is skipped here
    # because the item's own text (collected below) already contains it.
    for p in doc_div.find_all("p"):
        enclosing_li = p.find_parent("li")
        if enclosing_li is not None and enclosing_li.find_parent("ul") is not None:
            continue
        p_text = clean_signature_text(p.get_text()).strip()
        if p_text:
            doc_parts.append(p_text)

    # Bulleted lists.  With nested lists the same <li> is reachable from more
    # than one <ul>; remember what was emitted so each item appears once.
    seen_items = set()
    for ul in doc_div.find_all("ul"):
        for li in ul.find_all("li"):
            if id(li) in seen_items:
                continue
            seen_items.add(id(li))

            tag_span = li.find("span", class_="at-tag")
            if tag_span is not None:
                # Special tag such as @raises / @returns: pull the tag name
                # out so it is not repeated in the item text.
                tag_name = tag_span.get_text().strip()
                tag_span.extract()
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"@{tag_name} {li_text}")
            else:
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"- {li_text}")

    # Code examples, emitted as fenced blocks with indentation preserved.
    for pre in doc_div.find_all("pre"):
        code = pre.find("code")
        if code is None:
            continue
        lang = "ocaml"  # Default to OCaml
        # "class" is a list of tokens; look for one like "language-xyz".
        for cls in code.get("class", []):
            if cls.startswith("language-"):
                lang = cls.removeprefix("language-")
                break
        doc_parts.append(f"```{lang}\n{code.get_text()}\n```")

    return "\n".join(doc_parts) if doc_parts else None


def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content.

    Args:
        content_soup: BeautifulSoup tree of a module page's ``content``.

    Returns:
        A list of dicts, one per ``div.odoc-spec`` that has both a keyword
        span and a ``<code>`` body, with keys ``id``, ``type`` (the keyword,
        e.g. "val"), ``name`` (may be None), ``content`` (cleaned signature
        text) and ``doc`` (flattened documentation text or None).
    """
    signatures = []

    for spec in content_soup.find_all("div", class_="odoc-spec"):
        sig_div = spec.find("div", class_="spec")
        if sig_div is None:
            continue

        keyword_span = sig_div.find("span", class_="keyword")
        code_tag = sig_div.find("code")
        if keyword_span is None or code_tag is None:
            continue

        sig_type = keyword_span.get_text().strip()
        # get_text() already flattens nested spans into plain text.
        sig_content = clean_signature_text(code_tag.get_text())
        if not sig_type or not sig_content:
            continue

        doc_div = spec.find("div", class_="spec-doc")
        doc_content = _extract_doc_text(doc_div) if doc_div is not None else None

        signatures.append({
            "id": sig_div.get("id", ""),  # anchor id, kept for cross-referencing
            "type": sig_type,
            "name": extract_signature_name(sig_content),
            "content": sig_content,
            "doc": doc_content,
        })

    return signatures


def _render_section(title, sigs):
    """Return the Markdown lines for one ``## title`` section."""
    lines = [f"## {title}"]
    for sig in sigs:
        # A Markdown heading must fit on one line, so fold any line breaks
        # or padding in the signature into single spaces.
        heading = " ".join(sig["content"].split())
        lines.append("")
        lines.append(f"### `{heading}`")

        if sig["doc"]:
            lines.append("")
            lines.append(sig["doc"])
    lines.append("")
    return lines


def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information.

    Args:
        module_info: Dict as returned by ``extract_module_info``.
        signatures: List as returned by ``parse_module_signature``.

    Returns:
        The Markdown text: a title and breadcrumb path, the preamble text if
        any, then one section per signature kind (types, exceptions, values,
        modules, classes, followed by any other kinds in first-seen order).
    """
    md_lines = []

    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    # Module preamble documentation, reduced to plain text
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Group signatures by keyword, preserving their order within each group
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    # Well-known kinds first, in a fixed order
    for keyword, title in _SECTION_ORDER:
        if keyword in sig_by_type:
            md_lines.extend(_render_section(title, sig_by_type[keyword]))

    # Then everything else, titled after its keyword
    known_keywords = {keyword for keyword, _ in _SECTION_ORDER}
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in known_keywords:
            md_lines.extend(_render_section(f"{sig_type.capitalize()}s", sigs))

    return "\n".join(md_lines)


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files.

    Args:
        json_files: Iterable of paths to odoc JSON files.
        root_dir: Directory the paths are made relative to; the first
            component of each relative path is used as the package name.

    Returns:
        A ``defaultdict(list)`` mapping package name to a list of dicts with
        keys ``file``, ``module_info``, ``signatures`` and ``path_parts``.
        Files that cannot be read or parsed are reported on stderr and
        skipped.
    """
    hierarchy = defaultdict(list)

    for json_file in json_files:
        package_parts = os.path.relpath(json_file, root_dir).split(os.sep)

        # Skip irrelevant JSON files
        if package_parts[-1] in _SKIPPED_BASENAMES:
            continue

        # A file sitting directly in root_dir has no package component.
        if len(package_parts) < 2:
            continue

        # Best-effort: one bad file must not abort the whole run.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = f.read()
            module_info = extract_module_info(json_content)
            signatures = parse_module_signature(module_info["content"])
        except Exception as e:
            print(f"Error processing {json_file}: {e}", file=sys.stderr)
            continue

        # Group by package/library
        hierarchy[package_parts[0]].append({
            "file": json_file,
            "module_info": module_info,
            "signatures": signatures,
            "path_parts": package_parts,
        })

    return hierarchy


def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation.

    Modules are ordered by breadcrumb depth (shallower first) and then
    alphabetically by their last breadcrumb; a module without breadcrumbs
    sorts as depth 0 with an empty name.  Returns a new list.
    """
    def hierarchy_key(module):
        crumbs = module["module_info"]["breadcrumbs"]
        return (len(crumbs), crumbs[-1] if crumbs else "")

    return sorted(modules, key=hierarchy_key)


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library.

    Emits a ``# Library: <lib_name>`` title followed by the Markdown of each
    module (hierarchically sorted), each terminated by a ``---`` rule.
    """
    md_lines = [f"# Library: {lib_name}", ""]

    for module in sort_modules_hierarchically(modules):
        md_lines.append(generate_markdown(module["module_info"], module["signatures"]))
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    """Command-line entry point: parse arguments, convert, write the output.

    Exits with status 1 when the input directory is invalid or when
    ``--package`` names a package that was not found.
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))
    # os.walk order depends on the filesystem; sort so the output is stable.
    json_files.sort()

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    if not hierarchy:
        print(f"Warning: no module documentation found under {html_dir}", file=sys.stderr)

    # Generate markdown for all or specific package
    if args.package:
        # An unknown package is an error; silently dumping every package
        # instead would hide a typo in the package name.
        if args.package not in hierarchy:
            available = ", ".join(sorted(hierarchy)) or "(none)"
            print(
                f"Error: package {args.package!r} not found. Available packages: {available}",
                file=sys.stderr,
            )
            sys.exit(1)
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            markdown_parts.append(generate_markdown_library(lib_name, modules))
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()