Take OCaml odoc output into MCP

vibe

Changed files
+477
+6
CLAUDE.md
···
+
I wish to turn JSON files output by odoc-driver (an OCaml documentation generator) into succinct Markdown that is a good input to a coding model such as you. Look at _html/mirage-crypto/mirage-crypto/Mirage_crypto/DES/CTR/index.html.json as one such example, with more being in _html/, but be aware there are thousands of files. Write me an odoc2llm.py Python script that uses Beautiful Soup and JSON parsing to crunch up just the relevant signatures and crosslinks into a _single_ markdown file from the _html directory.
+471
odoc2llm.py
···
+
#!/usr/bin/env python3
+
# /// script
+
# requires-python = ">=3.11"
+
# dependencies = [
+
# "bs4",
+
# ]
+
# ///
+
"""
+
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs
+
+
This script processes JSON files generated by odoc-driver (OCaml documentation generator)
+
and produces a single Markdown file with the essential module structure and signatures
+
formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
+
"""
+
+
import os
+
import sys
+
import json
+
import re
+
from bs4 import BeautifulSoup
+
from collections import defaultdict
+
import argparse
+
from pathlib import Path
+
import html
+
+
+
def extract_module_info(json_content):
    """Parse one odoc JSON page into a dict of module metadata.

    Returns a dict with keys:
        name:        the module/class identifier from the page header
        type:        "Module", "Module type", or "Class"
        breadcrumbs: cleaned breadcrumb texts giving the module's path
        content:     BeautifulSoup tree of the page body HTML
        preamble:    raw preamble HTML string (may be empty)
    """
    data = json.loads(json_content)

    # The page header carries both the kind of item and its name.
    header_soup = BeautifulSoup(data.get("header", ""), "html.parser")
    header_text = header_soup.get_text().strip()

    if "Module type" in header_text:
        kind = "Module type"
    elif "Class" in header_text:
        kind = "Class"
    else:
        kind = "Module"

    code_tag = header_soup.find("code")
    if code_tag:
        name = code_tag.get_text().strip()
    else:
        # No <code> element: strip the kind prefix off the plain header text.
        name = re.sub(r'^(Module|Module type|Class)\s+', '', header_text)

    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        raw = crumb.get("name", "")
        if raw:
            text = BeautifulSoup(raw, "html.parser").get_text().strip()
            # Normalize "Library `foo`" to "Library foo".
            breadcrumbs.append(re.sub(r'Library\s+`([^`]+)`', r'Library \1', text))

    return {
        "name": name,
        "type": kind,
        "breadcrumbs": breadcrumbs,
        "content": BeautifulSoup(data.get("content", ""), "html.parser"),
        "preamble": data.get("preamble", ""),
    }
+
+
+
def clean_signature_text(text):
    """Normalize a signature string pulled out of odoc HTML.

    ASCII-ifies odoc's special characters (word joiner, minus/hyphen variants,
    the Unicode arrow) and collapses interior runs of whitespace to a single
    space while leaving newline-adjacent whitespace alone.
    """
    substitutions = {'\u2060': '', '\u2212': '-', '\u2011': '-', '\u2192': '->'}
    for needle, replacement in substitutions.items():
        text = text.replace(needle, replacement)

    # Collapse 2+ whitespace chars, but not runs touching a newline, so code
    # block layout survives.
    return re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)
+
+
+
def extract_signature_name(sig_content):
    """Best-effort extraction of the declared identifier from an OCaml
    signature line (e.g. "encrypt" from "val encrypt : key -> ...").

    Returns the name as a string, or None when no known declaration form
    matches. Operator names like "val (+)" are not handled — callers fall
    back to None for those.
    """
    # "module type" must be tried before plain "module"; otherwise the word
    # "type" itself is captured as the module name (bug in the original).
    match = re.match(r"module\s+type\s+([a-zA-Z0-9_']+)", sig_content)
    if match:
        return match.group(1)

    # val declarations: the name sits between "val" and the first ":".
    match = re.match(r"val\s+([a-zA-Z0-9_']+)\s*:", sig_content)
    if match:
        return match.group(1)

    # type declarations may carry a parameter first, e.g. "type 'a t" or
    # "type ('a, 'b) t" — skip it so the type constructor name is captured
    # rather than the type variable.
    match = re.match(r"type\s+(?:(?:'\w+|\([^)]*\))\s+)?([a-zA-Z0-9_']+)(?:\s|\[|$)",
                     sig_content)
    if match:
        return match.group(1)

    # module / class / exception all share the shape "<keyword> <Name>".
    for keyword in ("module", "class", "exception"):
        match = re.match(rf"{keyword}\s+([a-zA-Z0-9_']+)", sig_content)
        if match:
            return match.group(1)

    return None
+
+
+
def parse_module_signature(content_soup):
    """Extract the OCaml signature items from an odoc page body.

    Args:
        content_soup: BeautifulSoup tree of the page's "content" HTML.

    Returns:
        List of dicts with keys: id (anchor id), type (leading keyword such
        as "val"/"type"/"module"), name (best-effort identifier or None),
        content (cleaned signature text), doc (Markdown-ish doc string or
        None). Specs lacking a keyword or code content are skipped.
    """
    signatures = []

    for spec in content_soup.find_all("div", class_="odoc-spec"):
        sig_id = None
        sig_type = None
        sig_content = None

        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # Anchor id, used for cross-referencing.
            sig_id = sig_div.get("id", "")

            # The leading keyword span ("val", "type", ...) classifies the item.
            keyword_span = sig_div.find("span", class_="keyword")
            if keyword_span:
                sig_type = keyword_span.get_text().strip()

            code_tag = sig_div.find("code")
            if code_tag:
                # Flatten all markup spans to plain text so get_text() yields
                # the raw OCaml signature with its structure intact.
                for span in code_tag.find_all("span"):
                    span.replace_with(span.get_text())
                sig_content = clean_signature_text(code_tag.get_text())

        doc_div = spec.find("div", class_="spec-doc")
        doc_content = _extract_doc(doc_div) if doc_div else None

        # Only keep specs that have both a keyword and signature text.
        if sig_type and sig_content:
            signatures.append({
                "id": sig_id,
                "type": sig_type,
                "name": extract_signature_name(sig_content),
                "content": sig_content,
                "doc": doc_content,
            })

    return signatures


def _extract_doc(doc_div):
    """Render a spec-doc div (paragraphs, lists, code examples) to text.

    Returns the joined doc text, or None when the div yields nothing.
    """
    doc_parts = []

    # Regular paragraphs: flatten cross-reference links inside inline code.
    for p in doc_div.find_all("p"):
        for code in p.find_all("code"):
            for a in code.find_all("a"):
                a.replace_with(a.get_text())
            code.string = code.get_text()
        p_text = clean_signature_text(p.get_text()).strip()
        if p_text:
            doc_parts.append(p_text)

    # Bulleted lists; odoc tags such as @raises/@returns are marked with an
    # "at-tag" span and rendered as "@tag rest-of-item".
    for ul in doc_div.find_all("ul"):
        for li in ul.find_all("li"):
            tag_span = li.find("span", class_="at-tag")
            if tag_span:
                tag_name = tag_span.get_text().strip()
                tag_span.extract()  # drop the tag span before reading the rest
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"@{tag_name} {li_text}")
            else:
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"- {li_text}")

    # Code examples: emit fenced blocks, preserving indentation/line breaks.
    for pre in doc_div.find_all("pre"):
        code = pre.find("code")
        if code:
            # BUG FIX: the original tested `"language-" in code.get("class", [""])`,
            # i.e. membership of the literal string in the class *list*, which
            # only matches a class named exactly "language-" — so the language
            # always fell back to "ocaml". Scan the classes properly instead.
            lang = "ocaml"  # odoc examples default to OCaml
            for cls in code.get("class") or []:
                if cls.startswith("language-"):
                    lang = cls[len("language-"):]
                    break
            doc_parts.append(f"```{lang}\n{code.get_text()}\n```")

    return "\n".join(doc_parts) if doc_parts else None
+
+
+
# Section order for the generated markdown: known signature kinds first, in a
# fixed order, then any remaining kinds in encounter order.
_SECTION_ORDER = [
    ("type", "Types"),
    ("exception", "Exceptions"),
    ("val", "Values"),
    ("module", "Modules"),
    ("class", "Classes"),
]


def _append_section(md_lines, heading, sigs):
    """Append one '## heading' section listing each signature and its doc."""
    md_lines.append(f"## {heading}")
    for sig in sigs:
        md_lines.append("")
        md_lines.append(f"### `{sig['content']}`")
        if sig["doc"]:
            md_lines.append("")
            md_lines.append(sig["doc"])
        md_lines.append("")


def generate_markdown(module_info, signatures):
    """Generate Markdown for one module from its parsed info and signatures.

    The original had five byte-identical section blocks (types, exceptions,
    values, modules, classes); they are deduplicated into _append_section
    driven by _SECTION_ORDER. Output content is unchanged.
    """
    md_lines = [
        f"# {module_info['type']} `{module_info['name']}`",
        f"**Path:** {' > '.join(module_info['breadcrumbs'])}",
        "",
    ]

    # Module-level preamble documentation, if any.
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Bucket signatures by their leading keyword.
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    known = {keyword for keyword, _ in _SECTION_ORDER}
    for keyword, heading in _SECTION_ORDER:
        if keyword in sig_by_type:
            _append_section(md_lines, heading, sig_by_type[keyword])

    # Any remaining signature kinds get a generically-titled section.
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in known:
            _append_section(md_lines, f"{sig_type.capitalize()}s", sigs)

    return "\n".join(md_lines)
+
+
+
def build_module_hierarchy(json_files, root_dir):
    """Group parsed module pages by their top-level package directory.

    Args:
        json_files: paths of *.html.json files found under root_dir.
        root_dir: the _html root the paths are relative to.

    Returns:
        dict mapping package name -> list of entries, each with keys
        "file", "module_info", "signatures", "path_parts".

    The original duplicated the parse-and-append logic across two branches
    and special-cased a skip list ("sidebar.json", "status.json",
    "sherlodoc_db.js") of names that can never appear here, since the caller
    only collects files ending in ".html.json". Collapsed into one path;
    files that fail to parse are reported on stderr and skipped.
    """
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Only files nested under a package directory describe modules; a
        # top-level file has no package to attach to (the original parsed
        # such files, then discarded the result).
        if len(package_parts) < 2:
            continue
        # Defensive: auxiliary odoc-driver metadata carries no signatures.
        if package_parts[-1] in ("sidebar.json", "status.json", "sherlodoc_db.js"):
            continue

        try:
            json_content = Path(json_file).read_text(encoding="utf-8")
            module_info = extract_module_info(json_content)
            signatures = parse_module_signature(module_info["content"])
        except Exception as e:
            # Best effort: one malformed page must not abort the whole run.
            print(f"Error processing {json_file}: {e}", file=sys.stderr)
            continue

        hierarchy[package_parts[0]].append({
            "file": json_file,
            "module_info": module_info,
            "signatures": signatures,
            "path_parts": package_parts,
        })

    return hierarchy
+
+
+
def sort_modules_hierarchically(modules):
    """Order modules for top-down presentation.

    Fewer breadcrumbs means higher in the hierarchy, so parents come before
    their children; ties break alphabetically on the last breadcrumb.
    """
    def hierarchy_key(entry):
        crumbs = entry["module_info"]["breadcrumbs"]
        last = crumbs[-1] if crumbs else ""
        return (len(crumbs), last)

    return sorted(modules, key=hierarchy_key)
+
+
+
def generate_markdown_library(lib_name, modules):
    """Render one library: a title followed by each module's markdown,
    separated by horizontal rules, in hierarchical order."""
    parts = [f"# Library: {lib_name}", ""]
    for entry in sort_modules_hierarchically(modules):
        parts.append(generate_markdown(entry["module_info"], entry["signatures"]))
        parts.append("\n---\n")
    return "\n".join(parts)
+
+
+
def main():
    """CLI entry point: scan an odoc _html directory and write one Markdown file.

    Improvements over the original: uses Path.rglob instead of a manual
    os.walk loop (the file already uses pathlib), sorts the file list so the
    output is deterministic across filesystems, drops the redundant
    exists()+is_dir() double-check, and warns on stderr when --package names
    an unknown package instead of silently emitting everything (the
    fall-back behavior itself is unchanged).
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)
    if not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Sorted for deterministic output ordering across runs.
    json_files = sorted(str(p) for p in html_dir.rglob('*.html.json'))
    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    hierarchy = build_module_hierarchy(json_files, html_dir)
    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        if args.package:
            print(f"Warning: package {args.package!r} not found; emitting all packages",
                  file=sys.stderr)
        # Combine all packages, alphabetically by name.
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...",
                      file=sys.stderr)
            markdown_parts.append(generate_markdown_library(lib_name, modules))
            markdown_parts.append("\n\n")
        markdown = "\n".join(markdown_parts)

    Path(args.output).write_text(markdown, encoding='utf-8')
    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()