Take OCaml odoc output into MCP
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (OCaml documentation generator)
and produces a single Markdown file with the essential module structure and signatures
formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
"""

import os
import sys
import json
import re
from bs4 import BeautifulSoup
from collections import defaultdict
import argparse
from pathlib import Path
import html


def extract_module_info(json_content):
    """Extract module information from odoc JSON content."""
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        # Return a minimal structure that won't cause errors downstream
        return {
            "name": "Unknown",
            "type": "Module",
            "breadcrumbs": [],
            "content": BeautifulSoup("", "html.parser"),
            "preamble": ""
        }

    # Extract module name and type from header
    header = data.get("header", "")
    soup = BeautifulSoup(header, "html.parser")
    header_text = soup.get_text().strip()

    # Determine module type and name
    module_type = "Module"
    if "Module type" in header_text:
        module_type = "Module type"
    elif "Class" in header_text:
        module_type = "Class"

    # Extract the actual module name
    module_name = ""
    code_tag = soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to header text with type prefix removed
        module_name = re.sub(r'^(Module|Module type|Class)\s+', '', header_text)

    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if name:
            soup = BeautifulSoup(name, "html.parser")
            clean_name = soup.get_text().strip()
            # Clean up the breadcrumb text
            clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
            breadcrumbs.append(clean_name)

    # Extract module content
    content = data.get("content", "")
    soup = BeautifulSoup(content, "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": soup,
        "preamble": data.get("preamble", "")
    }


def clean_signature_text(text):
    """Clean up signature text for better readability."""
    # Replace special Unicode spaces, dashes and arrows with ASCII equivalents
    text = text.replace('\u00a0', ' ').replace('\u2011', '-').replace('\u2013', '-').replace('\u2192', '->')

    # Replace multiple spaces with a single space, except in code blocks
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)

    return text


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.)."""
    # For val signatures: extract function name before the first :
    match = re.match(r'val\s+([a-zA-Z0-9_\']+)\s*:', sig_content)
    if match:
        return match.group(1)

    # For type signatures: extract type name
    match = re.match(r'type\s+([a-zA-Z0-9_\']+)(?:\s|\[|$)', sig_content)
    if match:
        return match.group(1)

    # For module signatures: extract module name
    match = re.match(r'module\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For class signatures: extract class name
    match = re.match(r'class\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For exception signatures: extract exception name
    match = re.match(r'exception\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    return None


def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content."""
    signatures = []

    # Get all the odoc-spec divs
    spec_divs = content_soup.find_all("div", class_="odoc-spec")

    for spec in spec_divs:
        sig_id = None
        sig_type = None
        sig_content = None
        doc_content = None

        # Find the actual signature
        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # Get the ID for cross-referencing
            sig_id = sig_div.get("id", "")

            # Determine the type of signature (type, val, module, etc.)
            sig_type_span = sig_div.find("span", class_="keyword")
            if sig_type_span:
                sig_type = sig_type_span.get_text().strip()

            # Get the full code content
            code_tag = sig_div.find("code")
            if code_tag:
                # Extract the full OCaml signature text properly
                # We'll convert all spans to plain text while preserving structure
                for span in code_tag.find_all("span"):
                    span.replace_with(span.get_text())

                sig_content = clean_signature_text(code_tag.get_text())

        # Find documentation for this signature
        doc_div = spec.find("div", class_="spec-doc")
        if doc_div:
            # Process paragraphs and lists for documentation
            doc_parts = []

            # Process regular paragraphs
            for p in doc_div.find_all("p"):
                # Clean up code references in paragraph
                for code in p.find_all("code"):
                    # Convert links within code tags to plain text
                    for a in code.find_all("a"):
                        a.replace_with(a.get_text())
                    # Keep the code tag formatting
                    code_text = code.get_text()
                    code.string = code_text

                # Clean up the paragraph text
                p_text = clean_signature_text(p.get_text()).strip()
                if p_text:
                    doc_parts.append(p_text)

            # Process bulleted lists
            for ul in doc_div.find_all("ul"):
                for li in ul.find_all("li"):
                    # Check if it's a special tag like @raises, @returns, etc.
                    tag_span = li.find("span", class_="at-tag")
                    if tag_span:
                        tag_name = tag_span.get_text().strip()
                        # Remove the tag span from consideration
                        tag_span.extract()
                        # Get the rest of the content
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"@{tag_name} {li_text}")
                    else:
                        # Regular list item
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"- {li_text}")

            # Process code examples
            for pre in doc_div.find_all("pre"):
                code = pre.find("code")
                if code:
                    # Get the language from a language-* class if available,
                    # defaulting to OCaml
                    lang = "ocaml"
                    for cls in code.get("class", []):
                        if cls.startswith("language-"):
                            lang = cls.replace("language-", "")

                    # Preserve indentation and line breaks in code blocks
                    code_text = code.get_text()
                    doc_parts.append(f"```{lang}\n{code_text}\n```")

            if doc_parts:
                doc_content = "\n".join(doc_parts)

        # Only add signatures that have content
        if sig_type and sig_content:
            # Extract the name of the element (function name, type name, etc.)
            name = extract_signature_name(sig_content)

            # Build the full signature
            signature = {
                "id": sig_id,
                "type": sig_type,
                "name": name,
                "content": sig_content,
                "doc": doc_content
            }
            signatures.append(signature)

    return signatures


def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information."""
    md_lines = []

    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    # Add module preamble documentation if available
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Organize signatures by type
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    # Process types first
    if "type" in sig_by_type:
        md_lines.append("## Types")
        for sig in sig_by_type["type"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process exceptions
    if "exception" in sig_by_type:
        md_lines.append("## Exceptions")
        for sig in sig_by_type["exception"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process values (functions)
    if "val" in sig_by_type:
        md_lines.append("## Values")
        for sig in sig_by_type["val"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process modules
    if "module" in sig_by_type:
        md_lines.append("## Modules")
        for sig in sig_by_type["module"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process classes
    if "class" in sig_by_type:
        md_lines.append("## Classes")
        for sig in sig_by_type["class"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process remaining signature types
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in ["type", "val", "module", "class", "exception"]:
            md_lines.append(f"## {sig_type.capitalize()}s")
            for sig in sigs:
                md_lines.append("")
                md_lines.append(f"### `{sig['content']}`")

                # Add documentation if available
                if sig["doc"]:
                    md_lines.append("")
                    md_lines.append(sig["doc"])
            md_lines.append("")

    return "\n".join(md_lines)


def read_json_file(file_path):
    """
    Read a JSON file with robust error handling for encoding issues.

    Args:
        file_path: Path to the JSON file

    Returns:
        Content of the JSON file as a string, or None if there was an error
    """
    # Try UTF-8 first (most common encoding)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Try other encodings if UTF-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
            return None


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files."""
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Skip irrelevant JSON files
        if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
            # For index.html.json, check if it's a module documentation
            if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
                json_content = read_json_file(json_file)
                if json_content:
                    try:
                        # Try to parse the module info
                        module_info = extract_module_info(json_content)
                        signatures = parse_module_signature(module_info["content"])

                        # Determine package name and version from path
                        package_name, package_version = determine_package_info(json_file, package_parts, module_info)

                        # Use package name and version for the hierarchy key
                        package_key = f"{package_name}"
                        if package_version != "unknown":
                            # Add version information to module_info for display in markdown
                            module_info["package_version"] = package_version

                        hierarchy[package_key].append({
                            "file": json_file,
                            "module_info": module_info,
                            "signatures": signatures,
                            "path_parts": package_parts
                        })
                    except Exception as e:
                        print(f"Error processing {json_file}: {e}", file=sys.stderr)

            continue

        # Try to parse other JSON files (non-index.html.json)
        json_content = read_json_file(json_file)
        if json_content:
            try:
                module_info = extract_module_info(json_content)
                signatures = parse_module_signature(module_info["content"])

                # Determine package name from path (the version is not needed here)
                package_name, _ = determine_package_info(json_file, package_parts, module_info)

                hierarchy[package_name].append({
                    "file": json_file,
                    "module_info": module_info,
                    "signatures": signatures,
                    "path_parts": package_parts
                })
            except Exception as e:
                print(f"Error processing {json_file}: {e}", file=sys.stderr)

    return hierarchy


def determine_package_info(file_path, path_parts, module_info):
    """
    Determine package name and version from file path and module info.

    Args:
        file_path: The full file path
        path_parts: Parts of the path
        module_info: Extracted module information

    Returns:
        Tuple of (package_name, package_version)
    """
    package_name = "unknown"
    package_version = "unknown"

    # Try to extract from breadcrumbs if available
    if module_info["breadcrumbs"] and any("Library" in crumb for crumb in module_info["breadcrumbs"]):
        for crumb in module_info["breadcrumbs"]:
            if "Library" in crumb:
                # Extract library name from the breadcrumb
                match = re.search(r'Library\s+(.+)', crumb)
                if match:
                    package_name = match.group(1).strip()

    # Look for test/package-name/version pattern in the path
    file_path_parts = Path(file_path).resolve().parts
    for i, part in enumerate(file_path_parts):
        if part == "test" and i + 2 < len(file_path_parts):
            # We found a test directory, extract package name and version
            package_name = file_path_parts[i + 1]
            package_version = file_path_parts[i + 2]
            break

    # If still unknown, fall back to using the first part of the path
    if package_name == "unknown" and len(path_parts) > 0:
        package_name = path_parts[0]

    # Last resort - use module name or "unknown"
    if package_name == "unknown":
        package_name = module_info["name"] if module_info["name"] else "unknown"

    return package_name, package_version


def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation."""
    # First sort by breadcrumb length (shorter = higher in hierarchy)
    # Then sort alphabetically within the same level
    return sorted(modules, key=lambda x: (
        len(x["module_info"]["breadcrumbs"]),
        x["module_info"]["breadcrumbs"][-1] if x["module_info"]["breadcrumbs"] else ""
    ))


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library."""
    md_lines = []

    md_lines.append(f"# Library: {lib_name}")
    md_lines.append("")

    # Sort modules hierarchically
    sorted_modules = sort_modules_hierarchically(modules)

    for module in sorted_modules:
        module_md = generate_markdown(module["module_info"], module["signatures"])
        md_lines.append(module_md)
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    """
    Main entry point for the script.

    Usage examples:

    # Process all packages in a directory
    python odoc2llm.py /path/to/odoc/output

    # Process all packages and specify output file
    python odoc2llm.py /path/to/odoc/output --output documentation.md

    # Process a specific package only
    python odoc2llm.py /path/to/odoc/output --package package-name

    # Enable verbose output
    python odoc2llm.py /path/to/odoc/output --verbose
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for all or specific package
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
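For a quick sanity check of the pipeline, the sketch below feeds a small, hand-written odoc-style JSON document through `extract_module_info`, `parse_module_signature`, and `generate_markdown` and prints the resulting Markdown. The `Greeting` module, its HTML fragments, and the assumption that the script is importable as `odoc2llm` (i.e. saved as `odoc2llm.py` on the import path) are illustrative only, not real odoc-driver output.

```python
# Minimal sketch: exercise the conversion pipeline on a hand-written,
# odoc-shaped JSON document (illustrative data, not real odoc-driver output).
import json

# Assumes the script above is saved as odoc2llm.py on the import path.
from odoc2llm import extract_module_info, parse_module_signature, generate_markdown

sample = {
    "header": "<h1>Module <code>Greeting</code></h1>",
    "breadcrumbs": [{"name": "Library <code>greeting</code>"}, {"name": "Greeting"}],
    "preamble": "<p>Helpers for greeting people.</p>",
    "content": (
        '<div class="odoc-spec">'
        '<div class="spec value" id="val-hello">'
        '<code><span class="keyword">val</span> hello : string -&gt; string</code>'
        '</div>'
        '<div class="spec-doc"><p>Return a greeting for the given name.</p></div>'
        '</div>'
    ),
}

# Parse the fake module page and render it the same way the CLI does.
module_info = extract_module_info(json.dumps(sample))
signatures = parse_module_signature(module_info["content"])
print(generate_markdown(module_info, signatures))
```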