···
1
+
#!/usr/bin/env python3
3
+
# requires-python = ">=3.11"
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (OCaml documentation generator)
and produces a single Markdown file with the essential module structure and signatures,
formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
"""
20
+
from bs4 import BeautifulSoup
21
+
from collections import defaultdict
23
+
from pathlib import Path
27
+
def extract_module_info(json_content):
    """Extract module information from odoc JSON content.

    NOTE(review): rebuilt from a diff-view extract that had lost its
    indentation and several short lines. The ``if code_tag`` / ``else``
    guard, the ``breadcrumbs`` initialiser and the ``return`` wrapper are
    reconstructions -- confirm against the upstream file.

    Args:
        json_content: Text of one odoc JSON page. The keys read are
            "header", "breadcrumbs", "content" and "preamble"; all are
            optional.

    Returns:
        A dict with keys "name" (str), "type" ("Module", "Module type" or
        "Class"), "breadcrumbs" (list of str), "content" (the parsed
        BeautifulSoup tree of the page body) and "preamble" (raw string as
        found in the JSON).

    Raises:
        json.JSONDecodeError: If ``json_content`` is not valid JSON.
    """
    data = json.loads(json_content)

    # Extract module name and type from header
    header = data.get("header", "")
    header_soup = BeautifulSoup(header, "html.parser")
    header_text = header_soup.get_text().strip()

    # Determine module type and name. The kind is read from the start of the
    # header, longest alternative first: a plain substring test would tag
    # "Module Classify" as a class, and trying "Module" before "Module type"
    # would leave a stray "type " glued to the name.
    module_type = "Module"
    kind_match = re.match(r'(Module type|Module|Class)\b', header_text)
    if kind_match:
        module_type = kind_match.group(1)

    # Extract the actual module name
    code_tag = header_soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to header text with type prefix removed
        module_name = re.sub(r'^(Module type|Module|Class)\s+', '', header_text)

    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        clean_name = BeautifulSoup(name, "html.parser").get_text().strip()
        # Clean up the breadcrumb text
        clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
        breadcrumbs.append(clean_name)

    # Extract module content
    content = data.get("content", "")
    content_soup = BeautifulSoup(content, "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        # Consumed by build_module_hierarchy, which hands it to
        # parse_module_signature as the parsed page body.
        "content": content_soup,
        "preamble": data.get("preamble", "")
    }
76
+
# Single-pass character normalisation used by clean_signature_text.
# NOTE(review): the extract showed a leading ``.replace('', '')``, which is a
# no-op; presumably an invisible character was lost in extraction. The
# zero-width entries below are a best guess at that intent -- confirm.
_SIGNATURE_CHAR_TABLE = str.maketrans({
    "−": "-",         # typographic minus
    "‑": "-",         # typographic hyphen
    "→": "->",        # arrow glyph -> ASCII arrow
    "\u2060": None,   # WORD JOINER
    "\u200b": None,   # ZERO WIDTH SPACE
    "\ufeff": None,   # ZERO WIDTH NO-BREAK SPACE / BOM
})


def clean_signature_text(text):
    """Clean up signature text for better readability.

    Typographic arrows and dashes are rewritten to ASCII, zero-width
    characters are dropped, and runs of two or more whitespace characters
    are collapsed to a single space.

    NOTE(review): rebuilt from a diff-view extract; the final ``return`` line
    was missing. Returning the text unstripped is a reconstruction (most
    callers strip the result themselves) -- confirm.

    Args:
        text: Raw text taken from the odoc HTML.

    Returns:
        The normalised string. Leading and trailing whitespace is not
        removed here.
    """
    # Replace special arrow characters with ->
    text = text.translate(_SIGNATURE_CHAR_TABLE)

    # Replace multiple spaces with a single space. The lookarounds only stop
    # a match from starting right after, or ending right before, a newline;
    # a run that merely contains newlines is still collapsed.
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)

    return text
87
+
# Patterns tried in order by extract_signature_name; group 1 is the name.
_SIGNATURE_NAME_PATTERNS = (
    # val signatures: identifier before the first ':'
    re.compile(r"val\s+([a-zA-Z0-9_']+)\s*:"),
    # val signatures for operators, e.g. ``val ( >>= ) : ...``
    re.compile(r"val\s+\(\s*([^\s()]+)\s*\)\s*:"),
    # type signatures: skip ``nonrec`` and an optional parameter such as
    # ``'a``, ``+'a``, ``_`` or ``('a, 'b)`` so the constructor name is
    # captured rather than its parameter
    re.compile(
        r"type\s+(?:nonrec\s+)?"
        r"(?:(?:\([^)]*\)|[+\-!]*'[A-Za-z0-9_]+|_)\s+)?"
        r"([a-zA-Z0-9_']+)(?:\s|\[|$)"
    ),
    # module signatures, including ``module type S``
    re.compile(r"module\s+(?:type\s+)?([a-zA-Z0-9_']+)"),
    # class signatures, including ``class type``, ``virtual`` and ``['a]``
    re.compile(
        r"class\s+(?:type\s+)?(?:virtual\s+)?(?:\[[^\]]*\]\s+)?([a-zA-Z0-9_']+)"
    ),
    # exception signatures
    re.compile(r"exception\s+([a-zA-Z0-9_']+)"),
)


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.).

    NOTE(review): rebuilt from a diff-view extract in which every
    ``if match:`` guard and the final fall-through return were missing.
    Returning ``None`` when nothing matches is a reconstruction -- confirm.

    Args:
        sig_content: Signature text such as ``"val map : ..."`` or
            ``"type 'a t = ..."``. Leading whitespace is ignored.

    Returns:
        The declared identifier (for an operator value, the operator symbol
        without parentheses), or ``None`` when the text is empty or starts
        with no recognised keyword.
    """
    if not sig_content:
        return None

    # The patterns are anchored at the start, so stray leading whitespace
    # would otherwise defeat every one of them.
    candidate = sig_content.lstrip()
    for pattern in _SIGNATURE_NAME_PATTERNS:
        match = pattern.match(candidate)
        if match:
            return match.group(1)

    return None
117
+
def _render_spec_doc(doc_div):
    """Flatten one odoc ``spec-doc`` div into plain text with Markdown fences.

    Paragraphs come first, then list items (``@tag`` items get an ``@``
    prefix, others a ``- `` bullet), then ``<pre>`` code blocks as fenced
    code. Returns ``None`` when nothing was collected.
    """
    doc_parts = []

    # Process regular paragraphs
    for p in doc_div.find_all("p"):
        # Clean up code references in paragraph
        for code in p.find_all("code"):
            # Convert links within code tags to plain text
            for a in code.find_all("a"):
                a.replace_with(a.get_text())
            # Keep the code tag formatting
            code.string = code.get_text()

        # Clean up the paragraph text
        p_text = clean_signature_text(p.get_text()).strip()
        if p_text:
            doc_parts.append(p_text)

    # Process bulleted lists
    for ul in doc_div.find_all("ul"):
        for li in ul.find_all("li"):
            # Check if it's a special tag like @raises, @returns, etc.
            tag_span = li.find("span", class_="at-tag")
            if tag_span:
                tag_name = tag_span.get_text().strip()
                # Remove the tag span from consideration
                # NOTE(review): the removing statement was missing from the
                # extract; extract() is a reconstruction -- confirm.
                tag_span.extract()
                # Get the rest of the content
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"@{tag_name} {li_text}")
            else:
                # Regular list item
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"- {li_text}")

    # Process code examples
    for pre in doc_div.find_all("pre"):
        code = pre.find("code")
        if code:
            # Get the language class if available. Each class is inspected
            # on its own: testing whether the literal "language-" is a
            # member of the class list can never succeed for a value such
            # as "language-ocaml".
            lang = "ocaml"  # Default to OCaml
            for cls in code.get("class") or []:
                if cls.startswith("language-"):
                    lang = cls.removeprefix("language-")
                    break

            # Preserve indentation and line breaks in code blocks
            code_text = code.get_text()
            doc_parts.append(f"```{lang}\n{code_text}\n```")

    if doc_parts:
        return "\n".join(doc_parts)
    return None


def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content.

    NOTE(review): rebuilt from a diff-view extract that had lost its
    indentation and many short lines. The guards and the "type", "name",
    "id" and "doc" keys of the result dict are reconstructions (only
    "content" was visible) -- confirm against the upstream file.

    Args:
        content_soup: Parsed HTML of a page body (a BeautifulSoup tree). It
            is modified in place while text is being extracted.

    Returns:
        A list with one dict per ``div.odoc-spec`` that has both a keyword
        and signature text. Keys: "type" (keyword text), "name" (from
        extract_signature_name), "id" (the spec div's id, "" if absent),
        "content" (cleaned signature text) and "doc" (rendered
        documentation, or ``None``).
    """
    signatures = []

    # Get all the odoc-spec divs
    for spec in content_soup.find_all("div", class_="odoc-spec"):
        # Find the actual signature
        sig_div = spec.find("div", class_="spec")
        if not sig_div:
            continue

        # Get the ID for cross-referencing
        sig_id = sig_div.get("id", "")

        # Determine the type of signature (type, val, module, etc.)
        sig_type_span = sig_div.find("span", class_="keyword")
        # Get the full code content
        code_tag = sig_div.find("code")
        if not sig_type_span or not code_tag:
            continue
        sig_type = sig_type_span.get_text().strip()

        # Extract the full OCaml signature text properly
        # We'll convert all spans to plain text while preserving structure
        for span in code_tag.find_all("span"):
            span.replace_with(span.get_text())

        # Strip so the start-anchored regexes in extract_signature_name can
        # match even when the markup has leading whitespace.
        sig_content = clean_signature_text(code_tag.get_text()).strip()

        # Find documentation for this signature
        doc_div = spec.find("div", class_="spec-doc")
        doc_content = _render_spec_doc(doc_div) if doc_div else None

        # Only add signatures that have content
        if sig_type and sig_content:
            # Extract the name of the element (function name, type name, etc.)
            name = extract_signature_name(sig_content)

            # Build the full signature
            signatures.append({
                "type": sig_type,
                "name": name,
                "id": sig_id,
                "content": sig_content,
                "doc": doc_content,
            })

    return signatures
226
+
# Signature keywords that get a dedicated section, in output order.
_MARKDOWN_SECTIONS = (
    ("type", "## Types"),
    ("exception", "## Exceptions"),
    ("val", "## Values"),
    ("module", "## Modules"),
    ("class", "## Classes"),
)


def _append_signature_section(md_lines, heading, sigs):
    """Append one section: the heading, then each signature and its doc."""
    md_lines.append(heading)
    for sig in sigs:
        md_lines.append("")
        md_lines.append(f"### `{sig['content']}`")

        # Add documentation if available
        if sig.get("doc"):
            md_lines.append("")
            md_lines.append(sig["doc"])
    md_lines.append("")


def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information.

    NOTE(review): rebuilt from a diff-view extract that had lost its
    indentation and several lines (the ``md_lines`` initialiser, the
    ``if sig["doc"]`` guards, the inner loop of the catch-all section).
    Where each section's trailing blank line sits is a reconstruction; it
    only affects blank lines in the output -- confirm.

    Args:
        module_info: Dict with "type", "name", "breadcrumbs" (list of str)
            and "preamble" (HTML string, may be empty).
        signatures: Iterable of dicts with "type", "content" and "doc".

    Returns:
        The Markdown text for one module: a title, the breadcrumb path, the
        optional preamble, then one section per signature kind (types,
        exceptions, values, modules, classes, then any other kinds in
        first-seen order).
    """
    md_lines = []

    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    # Add module preamble documentation if available
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Organize signatures by type
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    # Well-known kinds first, in a fixed order
    for sig_type, heading in _MARKDOWN_SECTIONS:
        if sig_type in sig_by_type:
            _append_signature_section(md_lines, heading, sig_by_type[sig_type])

    # Process remaining signature types
    known_types = {sig_type for sig_type, _ in _MARKDOWN_SECTIONS}
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in known_types:
            _append_signature_section(
                md_lines, f"## {sig_type.capitalize()}s", sigs
            )

    return "\n".join(md_lines)
331
+
# File names that never describe a module and are skipped outright.
_NON_MODULE_FILES = frozenset({"sidebar.json", "status.json", "sherlodoc_db.js"})


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files.

    NOTE(review): rebuilt from a diff-view extract that had lost its
    indentation and several lines (``try:``, ``continue``, closing brackets,
    the final ``return``). One dict key per entry sat on a missing line;
    "file" is a reconstruction -- confirm against the upstream file.

    The extract showed two identical parse-and-append blocks, one for
    ``index.html.json`` below the root and one for every other file. They
    are merged here. Files directly in ``root_dir`` were parsed but never
    stored, so they are now skipped before parsing; the only visible
    difference is that parse errors for those files are no longer printed.

    Args:
        json_files: Iterable of paths to odoc JSON pages.
        root_dir: Directory the paths are made relative to. The first
            component of each relative path is used as the package name.

    Returns:
        A ``defaultdict(list)`` mapping package name to a list of dicts
        with "file", "module_info", "signatures" and "path_parts". Files
        that fail to load or parse are reported on stderr and left out.
    """
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Skip irrelevant JSON files
        if package_parts[-1] in _NON_MODULE_FILES:
            continue

        # A file directly under the root has no package component to group by.
        if len(package_parts) < 2:
            continue

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = f.read()

            # Try to parse the module info
            module_info = extract_module_info(json_content)
            signatures = parse_module_signature(module_info["content"])
        except Exception as e:
            # Best effort: one broken page must not abort the whole run.
            print(f"Error processing {json_file}: {e}", file=sys.stderr)
            continue

        # Group by package/library
        package_name = package_parts[0]
        hierarchy[package_name].append({
            "file": json_file,
            "module_info": module_info,
            "signatures": signatures,
            "path_parts": package_parts
        })

    return hierarchy
388
+
def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation.

    NOTE(review): rebuilt from a diff-view extract in which the ``sorted``
    call had lost its closing brackets.

    Args:
        modules: Iterable of dicts, each with a "module_info" dict holding a
            "breadcrumbs" list of strings.

    Returns:
        A new list ordered by breadcrumb depth (shorter = higher in the
        hierarchy), then alphabetically by the last breadcrumb. Modules
        with no breadcrumbs sort first. The sort is stable, so ties keep
        their input order.
    """
    def _hierarchy_key(module):
        crumbs = module["module_info"]["breadcrumbs"]
        return (len(crumbs), crumbs[-1] if crumbs else "")

    return sorted(modules, key=_hierarchy_key)
398
+
def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library.

    NOTE(review): rebuilt from a diff-view extract that had lost its
    indentation and the ``md_lines`` initialiser.

    Args:
        lib_name: Library/package name used in the top-level heading.
        modules: Iterable of dicts with "module_info" and "signatures", as
            accepted by sort_modules_hierarchically and generate_markdown.

    Returns:
        One Markdown string: a "# Library: <name>" heading, then each
        module's Markdown in hierarchical order, each followed by a
        horizontal rule.
    """
    md_lines = [f"# Library: {lib_name}", ""]

    # Sort modules hierarchically
    for module in sort_modules_hierarchically(modules):
        md_lines.append(generate_markdown(module["module_info"], module["signatures"]))
        md_lines.append("\n---\n")

    return "\n".join(md_lines)
417
+
def main():
    """Command-line entry point: convert an odoc output tree to one Markdown file.

    NOTE(review): rebuilt from a diff-view extract in which the ``def`` line,
    the ``if args.verbose:`` guards, ``sys.exit(1)``, the ``json_files``
    initialiser, the inner ``for file in files`` loop, the ``else:`` branch
    and ``f.write(markdown)`` were all missing. Which messages are
    verbose-only is a reconstruction -- confirm against the upstream file.

    Reads ``html_dir`` plus the ``--output``, ``--package`` and ``--verbose``
    options, collects every ``*.html.json`` file under ``html_dir``, and
    writes the generated Markdown to the output path. Exits with status 1
    if ``html_dir`` is not a directory.
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(html_dir)
        for file in files
        if file.endswith('.html.json')
    ]

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for all or specific package
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        if args.package:
            # Falling back to every package used to be silent, which made a
            # mistyped --package look like it had worked.
            print(
                f"Warning: package {args.package} not found; generating all packages",
                file=sys.stderr,
            )

        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)
470
+
if __name__ == "__main__":