Take OCaml odoc output into MCP
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (the OCaml documentation
generator) and produces a single Markdown file with the essential module structure
and signatures, formatted so that LLMs can reason about OCaml codebases.
"""

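# Typical invocation (a sketch: the `uv run` form assumes uv is available to
# resolve the inline script metadata above; the `_html/` path is illustrative):
#
#   uv run odoc2llm.py _html/
#   uv run odoc2llm.py _html/ --package mylib -o mylib.md --verbose
#
# The flags map directly to the argparse options defined in main() below.
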
import argparse
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path

from bs4 import BeautifulSoup


def extract_module_info(json_content):
    """Extract module information from odoc JSON content."""
    data = json.loads(json_content)

    # Extract module name and type from header
    header = data.get("header", "")
    soup = BeautifulSoup(header, "html.parser")
    header_text = soup.get_text().strip()

    # Determine module type and name
    module_type = "Module"
    if "Module type" in header_text:
        module_type = "Module type"
    elif "Class" in header_text:
        module_type = "Class"

    # Extract the actual module name
    module_name = ""
    code_tag = soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to header text with type prefix removed
        module_name = re.sub(r'^(Module|Module type|Class)\s+', '', header_text)

    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if name:
            soup = BeautifulSoup(name, "html.parser")
            clean_name = soup.get_text().strip()
            # Clean up the breadcrumb text
            clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
            breadcrumbs.append(clean_name)

    # Extract module content
    content = data.get("content", "")
    soup = BeautifulSoup(content, "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": soup,
        "preamble": data.get("preamble", "")
    }


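# For orientation, the fields extract_module_info reads from each *.html.json
# file look roughly like this (an abbreviated, illustrative shape rather than
# the full odoc-driver schema):
#
#   {
#       "header":      "<h1>Module <code>Foo.Bar</code> ...</h1>",
#       "breadcrumbs": [{"name": "foo"}, {"name": "Foo"}, {"name": "Bar"}],
#       "preamble":    "<p>Short module synopsis.</p>",
#       "content":     "<div class=\"odoc-spec\"> ... </div>"
#   }
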
def clean_signature_text(text):
    """Clean up signature text for better readability."""
    # Normalize unicode minus/hyphen variants and arrows so signatures read as plain "->"
    text = text.replace('−', '-').replace('‑', '-').replace('→', '->')

    # Collapse runs of whitespace into a single space, except immediately around newlines
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)

    return text


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.)."""
    # For val signatures: extract function name before the first :
    match = re.match(r'val\s+([a-zA-Z0-9_\']+)\s*:', sig_content)
    if match:
        return match.group(1)

    # For type signatures: extract type name
    match = re.match(r'type\s+([a-zA-Z0-9_\']+)(?:\s|\[|$)', sig_content)
    if match:
        return match.group(1)

    # For module signatures: extract module name
    match = re.match(r'module\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For class signatures: extract class name
    match = re.match(r'class\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For exception signatures: extract exception name
    match = re.match(r'exception\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    return None


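# Illustrative behaviour of extract_signature_name on hand-written signature
# strings (the examples are made up, not taken from any particular library):
#
#   extract_signature_name("val map : ('a -> 'b) -> 'a list -> 'b list")  # "map"
#   extract_signature_name("type t = Leaf | Node of t * t")               # "t"
#   extract_signature_name("exception Timeout")                           # "Timeout"
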
def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content."""
    signatures = []

    # Get all the odoc-spec divs
    spec_divs = content_soup.find_all("div", class_="odoc-spec")

    for spec in spec_divs:
        sig_id = None
        sig_type = None
        sig_content = None
        doc_content = None

        # Find the actual signature
        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # Get the ID for cross-referencing
            sig_id = sig_div.get("id", "")

            # Determine the kind of signature (type, val, module, etc.)
            sig_type_span = sig_div.find("span", class_="keyword")
            if sig_type_span:
                sig_type = sig_type_span.get_text().strip()

            # Get the full code content
            code_tag = sig_div.find("code")
            if code_tag:
                # Extract the full OCaml signature text by flattening all spans
                # to plain text while preserving their order
                for span in code_tag.find_all("span"):
                    span.replace_with(span.get_text())

                sig_content = clean_signature_text(code_tag.get_text())

        # Find documentation for this signature
        doc_div = spec.find("div", class_="spec-doc")
        if doc_div:
            # Collect paragraphs, lists and code examples for the documentation
            doc_parts = []

            # Process regular paragraphs
            for p in doc_div.find_all("p"):
                # Clean up code references in the paragraph
                for code in p.find_all("code"):
                    # Convert links within code tags to plain text
                    for a in code.find_all("a"):
                        a.replace_with(a.get_text())
                    # Keep the code tag formatting
                    code_text = code.get_text()
                    code.string = code_text

                # Clean up the paragraph text
                p_text = clean_signature_text(p.get_text()).strip()
                if p_text:
                    doc_parts.append(p_text)

            # Process bulleted lists
            for ul in doc_div.find_all("ul"):
                for li in ul.find_all("li"):
                    # Check whether it is a special tag like @raises, @returns, etc.
                    tag_span = li.find("span", class_="at-tag")
                    if tag_span:
                        tag_name = tag_span.get_text().strip()
                        # Remove the tag span from consideration
                        tag_span.extract()
                        # Get the rest of the content
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"@{tag_name} {li_text}")
                    else:
                        # Regular list item
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"- {li_text}")

            # Process code examples
            for pre in doc_div.find_all("pre"):
                code = pre.find("code")
                if code:
                    # Pick up the language from a "language-*" class if present,
                    # defaulting to OCaml
                    lang = "ocaml"
                    for cls in code.get("class", []):
                        if cls.startswith("language-"):
                            lang = cls.replace("language-", "")

                    # Preserve indentation and line breaks in code blocks
                    code_text = code.get_text()
                    doc_parts.append(f"```{lang}\n{code_text}\n```")

            if doc_parts:
                doc_content = "\n".join(doc_parts)

        # Only add signatures that have content
        if sig_type and sig_content:
            # Extract the name of the element (function name, type name, etc.)
            name = extract_signature_name(sig_content)

            # Build the full signature record
            signature = {
                "id": sig_id,
                "type": sig_type,
                "name": name,
                "content": sig_content,
                "doc": doc_content
            }
            signatures.append(signature)

    return signatures


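# Each record appended to `signatures` is a plain dict; for a value binding it
# would look roughly like this (field values are illustrative):
#
#   {"id": "val-map", "type": "val", "name": "map",
#    "content": "val map : ('a -> 'b) -> 'a t -> 'b t",
#    "doc": "map f t applies f to every element of t."}
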
def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information."""
    md_lines = []

    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    # Add module preamble documentation if available
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Organize signatures by type
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    # Process types first
    if "type" in sig_by_type:
        md_lines.append("## Types")
        for sig in sig_by_type["type"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process exceptions
    if "exception" in sig_by_type:
        md_lines.append("## Exceptions")
        for sig in sig_by_type["exception"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process values (functions)
    if "val" in sig_by_type:
        md_lines.append("## Values")
        for sig in sig_by_type["val"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process modules
    if "module" in sig_by_type:
        md_lines.append("## Modules")
        for sig in sig_by_type["module"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process classes
    if "class" in sig_by_type:
        md_lines.append("## Classes")
        for sig in sig_by_type["class"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process remaining signature types
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in ["type", "val", "module", "class", "exception"]:
            md_lines.append(f"## {sig_type.capitalize()}s")
            for sig in sigs:
                md_lines.append("")
                md_lines.append(f"### `{sig['content']}`")

                # Add documentation if available
                if sig["doc"]:
                    md_lines.append("")
                    md_lines.append(sig["doc"])
            md_lines.append("")

    return "\n".join(md_lines)


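# The Markdown emitted for one module follows this rough outline (a sketch of
# the shape only; the exact headings come from generate_markdown above):
#
#   # Module `Foo.Bar`
#   **Path:** foo > Foo > Bar
#
#   ## Types
#
#   ### `type t`
#   Documentation paragraph for t, if any.
#
#   ## Values
#
#   ### `val create : unit -> t`
#   @returns a fresh t.
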
def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files."""
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Skip irrelevant JSON files
        if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
            # For index.html.json, check whether it is module documentation
            if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
                try:
                    with open(json_file, 'r', encoding='utf-8') as f:
                        json_content = f.read()

                    # Try to parse the module info
                    module_info = extract_module_info(json_content)
                    signatures = parse_module_signature(module_info["content"])

                    # Group by package/library
                    if len(package_parts) > 1:
                        package_name = package_parts[0]
                        hierarchy[package_name].append({
                            "file": json_file,
                            "module_info": module_info,
                            "signatures": signatures,
                            "path_parts": package_parts
                        })
                except Exception as e:
                    print(f"Error processing {json_file}: {e}", file=sys.stderr)

            continue

        # Try to parse other JSON files (non-index.html.json)
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = f.read()

            module_info = extract_module_info(json_content)
            signatures = parse_module_signature(module_info["content"])

            # Group by package/library
            if len(package_parts) > 1:
                package_name = package_parts[0]
                hierarchy[package_name].append({
                    "file": json_file,
                    "module_info": module_info,
                    "signatures": signatures,
                    "path_parts": package_parts
                })
        except Exception as e:
            print(f"Error processing {json_file}: {e}", file=sys.stderr)

    return hierarchy


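# The resulting hierarchy maps each top-level package directory name to the
# modules found beneath it, e.g. (shape only, names illustrative):
#
#   {"foo": [{"file": "<html_dir>/foo/Foo/index.html.json",
#             "module_info": {...},
#             "signatures": [...],
#             "path_parts": ["foo", "Foo", "index.html.json"]}]}
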
def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation."""
    # First sort by breadcrumb length (shorter = higher in hierarchy),
    # then sort alphabetically within the same level
    return sorted(modules, key=lambda x: (
        len(x["module_info"]["breadcrumbs"]),
        x["module_info"]["breadcrumbs"][-1] if x["module_info"]["breadcrumbs"] else ""
    ))


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library."""
    md_lines = []

    md_lines.append(f"# Library: {lib_name}")
    md_lines.append("")

    # Sort modules hierarchically
    sorted_modules = sort_modules_hierarchically(modules)

    for module in sorted_modules:
        module_md = generate_markdown(module["module_info"], module["signatures"])
        md_lines.append(module_md)
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f" - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for all or a specific package
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()