#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes the JSON files generated by odoc-driver (the OCaml
documentation generator) and produces a single Markdown file containing the
essential module structure and signatures, formatted so that LLMs can reason
about OCaml codebases.
"""

import os
import sys
import json
import re
from bs4 import BeautifulSoup
from collections import defaultdict
import argparse
from pathlib import Path
import html


def extract_module_info(json_content):
    """Extract module information from odoc JSON content."""
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}", file=sys.stderr)
        # Return a minimal structure that won't cause errors downstream
        return {
            "name": "Unknown",
            "type": "Module",
            "breadcrumbs": [],
            "content": BeautifulSoup("", "html.parser"),
            "preamble": "",
        }

    # Extract the module name and type from the header
    header = data.get("header", "")
    soup = BeautifulSoup(header, "html.parser")
    header_text = soup.get_text().strip()

    # Determine the module type and name
    module_type = "Module"
    if "Module type" in header_text:
        module_type = "Module type"
    elif "Class" in header_text:
        module_type = "Class"

    # Extract the actual module name
    module_name = ""
    code_tag = soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to the header text with the type prefix removed
        # ("Module type" must come before "Module" in the alternation)
        module_name = re.sub(r'^(Module type|Module|Class)\s+', '', header_text)

    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if name:
            soup = BeautifulSoup(name, "html.parser")
            clean_name = soup.get_text().strip()
            # Clean up the breadcrumb text
            clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
            breadcrumbs.append(clean_name)

    # Extract the module content
    content = data.get("content", "")
    soup = BeautifulSoup(content, "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": soup,
        "preamble": data.get("preamble", ""),
    }


def clean_signature_text(text):
    """Clean up signature text for better readability."""
    # Replace special arrow and dash characters with plain ASCII equivalents
    text = text.replace('\u2060', '').replace('−', '-').replace('‑', '-').replace('→', '->')
    # Collapse runs of spaces between non-space characters into a single space
    # (the exact pattern here is an assumption)
    text = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', text)
    return text
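

# NOTE: best-effort sketch of the odoc signature parser, reconstructed from how
# its results are consumed below: a list of dicts with "type", "content" and
# "doc" keys. The CSS class names ("odoc-spec", "spec-doc") and the
# keyword-based kind detection are assumptions about odoc's HTML output and may
# need adjusting to the markup odoc-driver actually emits.
def parse_module_signature(content_soup):
    """Extract signature items (types, vals, modules, ...) from odoc HTML content."""
    signatures = []
    # OCaml keywords that open a signature item, longest first so that
    # "module type" is matched before "module".
    kinds = ["module type", "module", "type", "val", "exception", "class", "include"]

    for spec in content_soup.find_all("div", class_="odoc-spec"):
        # The signature itself is the code portion of the spec block
        code = spec.find("code")
        if code is None:
            continue
        content = clean_signature_text(code.get_text()).strip()
        if not content:
            continue

        # Determine the kind of item from the leading keyword of the signature
        kind = "other"
        for keyword in kinds:
            if content == keyword or content.startswith(keyword + " "):
                kind = keyword
                break

        # The prose documentation, if any, sits in a sibling "spec-doc" block
        doc_div = spec.find("div", class_="spec-doc")
        doc = clean_signature_text(doc_div.get_text()).strip() if doc_div else ""

        signatures.append({"type": kind, "content": content, "doc": doc})

    return signatures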
".join(module_info["breadcrumbs"]) md_lines.append(f"# {module_info['type']} `{module_info['name']}`") md_lines.append(f"**Path:** {breadcrumb_path}") md_lines.append("") # Add module preamble documentation if available if module_info["preamble"]: preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser") preamble_text = clean_signature_text(preamble_soup.get_text()).strip() if preamble_text: md_lines.append(preamble_text) md_lines.append("") # Organize signatures by type sig_by_type = defaultdict(list) for sig in signatures: sig_by_type[sig["type"]].append(sig) # Process types first if "type" in sig_by_type: md_lines.append("## Types") for sig in sig_by_type["type"]: md_lines.append("") md_lines.append(f"### `{sig['content']}`") # Add documentation if available if sig["doc"]: md_lines.append("") md_lines.append(sig["doc"]) md_lines.append("") # Process exceptions if "exception" in sig_by_type: md_lines.append("## Exceptions") for sig in sig_by_type["exception"]: md_lines.append("") md_lines.append(f"### `{sig['content']}`") # Add documentation if available if sig["doc"]: md_lines.append("") md_lines.append(sig["doc"]) md_lines.append("") # Process values (functions) if "val" in sig_by_type: md_lines.append("## Values") for sig in sig_by_type["val"]: md_lines.append("") md_lines.append(f"### `{sig['content']}`") # Add documentation if available if sig["doc"]: md_lines.append("") md_lines.append(sig["doc"]) md_lines.append("") # Process modules if "module" in sig_by_type: md_lines.append("## Modules") for sig in sig_by_type["module"]: md_lines.append("") md_lines.append(f"### `{sig['content']}`") # Add documentation if available if sig["doc"]: md_lines.append("") md_lines.append(sig["doc"]) md_lines.append("") # Process classes if "class" in sig_by_type: md_lines.append("## Classes") for sig in sig_by_type["class"]: md_lines.append("") md_lines.append(f"### `{sig['content']}`") # Add documentation if available if sig["doc"]: md_lines.append("") md_lines.append(sig["doc"]) md_lines.append("") # Process remaining signature types for sig_type, sigs in sig_by_type.items(): if sig_type not in ["type", "val", "module", "class", "exception"]: md_lines.append(f"## {sig_type.capitalize()}s") for sig in sigs: md_lines.append("") md_lines.append(f"### `{sig['content']}`") # Add documentation if available if sig["doc"]: md_lines.append("") md_lines.append(sig["doc"]) md_lines.append("") return "\n".join(md_lines) def read_json_file(file_path): """ Read a JSON file with robust error handling for encoding issues. 


def read_json_file(file_path):
    """
    Read a JSON file with robust error handling for encoding issues.

    Args:
        file_path: Path to the JSON file

    Returns:
        Content of the JSON file as a string, or None if there was an error
    """
    # Try UTF-8 first (the most common encoding)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Fall back to latin-1 if UTF-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
            return None


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files."""
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Skip irrelevant JSON files
        if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
            # For index.html.json, check whether it is module documentation
            if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
                json_content = read_json_file(json_file)
                if json_content:
                    try:
                        # Try to parse the module info
                        module_info = extract_module_info(json_content)
                        signatures = parse_module_signature(module_info["content"])
                        # Determine the package name and version from the path
                        package_name, package_version = determine_package_info(json_file, package_parts, module_info)
                        # Use the package name as the hierarchy key
                        package_key = package_name
                        if package_version != "unknown":
                            # Record the version on module_info for display in the markdown
                            module_info["package_version"] = package_version
                        hierarchy[package_key].append({
                            "file": json_file,
                            "module_info": module_info,
                            "signatures": signatures,
                            "path_parts": package_parts,
                        })
                    except Exception as e:
                        print(f"Error processing {json_file}: {e}", file=sys.stderr)
            continue

        # Try to parse the other JSON files (non-index.html.json)
        json_content = read_json_file(json_file)
        if json_content:
            try:
                module_info = extract_module_info(json_content)
                signatures = parse_module_signature(module_info["content"])
                # Determine the package name from the path
                package_name = determine_package_name(package_parts, module_info)
                hierarchy[package_name].append({
                    "file": json_file,
                    "module_info": module_info,
                    "signatures": signatures,
                    "path_parts": package_parts,
                })
            except Exception as e:
                print(f"Error processing {json_file}: {e}", file=sys.stderr)

    return hierarchy
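

# NOTE: determine_package_name is called by build_module_hierarchy for
# non-index pages. The helper below is an assumed, minimal sketch: it mirrors
# the name-detection logic of determine_package_info (defined next) but without
# the version lookup, since only a name is needed at its call site.
def determine_package_name(path_parts, module_info):
    """Best-effort package/library name for a documentation page."""
    # Prefer an explicit "Library <name>" breadcrumb when one is present
    for crumb in module_info["breadcrumbs"]:
        match = re.search(r'Library\s+(.+)', crumb)
        if match:
            return match.group(1).strip()
    # Otherwise fall back to the first path component, then to the module name
    if path_parts:
        return path_parts[0]
    return module_info["name"] if module_info["name"] else "unknown"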


def determine_package_info(file_path, path_parts, module_info):
    """
    Determine package name and version from file path and module info.

    Args:
        file_path: The full file path
        path_parts: Parts of the path
        module_info: Extracted module information

    Returns:
        Tuple of (package_name, package_version)
    """
    package_name = "unknown"
    package_version = "unknown"

    # Try to extract the library name from the breadcrumbs if available
    if module_info["breadcrumbs"] and any("Library" in crumb for crumb in module_info["breadcrumbs"]):
        for crumb in module_info["breadcrumbs"]:
            if "Library" in crumb:
                # Extract the library name from the breadcrumb
                match = re.search(r'Library\s+(.+)', crumb)
                if match:
                    package_name = match.group(1).strip()

    # Look for a test/<package-name>/<version> pattern in the path
    file_path_parts = Path(file_path).resolve().parts
    for i, part in enumerate(file_path_parts):
        if part == "test" and i + 2 < len(file_path_parts):
            # Found a test directory: extract the package name and version
            package_name = file_path_parts[i + 1]
            package_version = file_path_parts[i + 2]
            break

    # If still unknown, fall back to the first part of the relative path
    if package_name == "unknown" and len(path_parts) > 0:
        package_name = path_parts[0]

    # Last resort: use the module name, or "unknown"
    if package_name == "unknown":
        package_name = module_info["name"] if module_info["name"] else "unknown"

    return package_name, package_version


def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation."""
    # Sort by breadcrumb length first (shorter = higher in the hierarchy),
    # then alphabetically within the same level.
    return sorted(modules, key=lambda x: (
        len(x["module_info"]["breadcrumbs"]),
        x["module_info"]["breadcrumbs"][-1] if x["module_info"]["breadcrumbs"] else ""
    ))


def generate_markdown_library(lib_name, modules):
    """Generate Markdown for a specific library."""
    md_lines = []
    md_lines.append(f"# Library: {lib_name}")
    md_lines.append("")

    # Sort modules hierarchically
    sorted_modules = sort_modules_hierarchically(modules)
    for module in sorted_modules:
        module_md = generate_markdown(module["module_info"], module["signatures"])
        md_lines.append(module_md)
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    """
    Main entry point for the script.

    Usage examples:

        # Process all packages in a directory
        python odoc2llm.py /path/to/odoc/output

        # Process all packages and specify the output file
        python odoc2llm.py /path/to/odoc/output --output documentation.md

        # Process a specific package only
        python odoc2llm.py /path/to/odoc/output --package package-name

        # Enable verbose output
        python odoc2llm.py /path/to/odoc/output --verbose
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc-generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)
    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build the module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for a specific package, or for all of them
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        if args.package:
            print(f"Warning: package {args.package} not found; processing all packages", file=sys.stderr)
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")
        markdown = "\n".join(markdown_parts)

    # Write the markdown to the output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()