Take OCaml odoc output into MCP
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (OCaml documentation generator)
and produces a single Markdown file with the essential module structure and signatures
formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
"""

import os
import sys
import json
import re
from bs4 import BeautifulSoup
from collections import defaultdict
import argparse
from pathlib import Path
import html


def extract_module_info(json_content):
    """Extract module information from odoc JSON content."""
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        # Return a minimal structure that won't cause errors downstream
        return {
            "name": "Unknown",
            "type": "Module",
            "breadcrumbs": [],
            "content": BeautifulSoup("", "html.parser"),
            "preamble": ""
        }

    # Extract module name and type from header
    header = data.get("header", "")
    soup = BeautifulSoup(header, "html.parser")
    header_text = soup.get_text().strip()

    # Determine module type and name
    module_type = "Module"
    if "Module type" in header_text:
        module_type = "Module type"
    elif "Class" in header_text:
        module_type = "Class"

    # Extract the actual module name
    module_name = ""
    code_tag = soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        # Fall back to header text with type prefix removed
        module_name = re.sub(r'^(Module|Module type|Class)\s+', '', header_text)

    # Extract breadcrumbs for context
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if name:
            soup = BeautifulSoup(name, "html.parser")
            clean_name = soup.get_text().strip()
            # Clean up the breadcrumb text
            clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
            breadcrumbs.append(clean_name)

    # Extract module content
    content = data.get("content", "")
    soup = BeautifulSoup(content, "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": soup,
        "preamble": data.get("preamble", "")
    }


def clean_signature_text(text):
    """Clean up signature text for better readability."""
    # Replace special Unicode spaces, dashes and arrows with ASCII equivalents
    text = text.replace('\u00a0', ' ').replace('\u2011', '-').replace('\u2013', '-').replace('\u2192', '->')

    # Replace multiple spaces with a single space, except in code blocks
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)

    return text


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.)."""
    # For val signatures: extract function name before the first :
    match = re.match(r'val\s+([a-zA-Z0-9_\']+)\s*:', sig_content)
    if match:
        return match.group(1)

    # For type signatures: extract type name
    match = re.match(r'type\s+([a-zA-Z0-9_\']+)(?:\s|\[|$)', sig_content)
    if match:
        return match.group(1)

    # For module signatures: extract module name
    match = re.match(r'module\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For class signatures: extract class name
    match = re.match(r'class\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    # For exception signatures: extract exception name
    match = re.match(r'exception\s+([a-zA-Z0-9_\']+)', sig_content)
    if match:
        return match.group(1)

    return None


def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content."""
    signatures = []

    # Get all the odoc-spec divs
    spec_divs = content_soup.find_all("div", class_="odoc-spec")

    for spec in spec_divs:
        sig_id = None
        sig_type = None
        sig_content = None
        doc_content = None

        # Find the actual signature
        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # Get the ID for cross-referencing
            sig_id = sig_div.get("id", "")

            # Determine the type of signature (type, val, module, etc.)
            sig_type_span = sig_div.find("span", class_="keyword")
            if sig_type_span:
                sig_type = sig_type_span.get_text().strip()

            # Get the full code content
            code_tag = sig_div.find("code")
            if code_tag:
                # Extract the full OCaml signature text properly
                # We'll convert all spans to plain text while preserving structure
                for span in code_tag.find_all("span"):
                    span.replace_with(span.get_text())

                sig_content = clean_signature_text(code_tag.get_text())

        # Find documentation for this signature
        doc_div = spec.find("div", class_="spec-doc")
        if doc_div:
            # Process paragraphs and lists for documentation
            doc_parts = []

            # Process regular paragraphs
            for p in doc_div.find_all("p"):
                # Clean up code references in paragraph
                for code in p.find_all("code"):
                    # Convert links within code tags to plain text
                    for a in code.find_all("a"):
                        a.replace_with(a.get_text())
                    # Keep the code tag formatting
                    code_text = code.get_text()
                    code.string = code_text

                # Clean up the paragraph text
                p_text = clean_signature_text(p.get_text()).strip()
                if p_text:
                    doc_parts.append(p_text)

            # Process bulleted lists
            for ul in doc_div.find_all("ul"):
                for li in ul.find_all("li"):
                    # Check if it's a special tag like @raises, @returns, etc.
                    tag_span = li.find("span", class_="at-tag")
                    if tag_span:
                        tag_name = tag_span.get_text().strip()
                        # Remove the tag span from consideration
                        tag_span.extract()
                        # Get the rest of the content
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"@{tag_name} {li_text}")
                    else:
                        # Regular list item
                        li_text = clean_signature_text(li.get_text()).strip()
                        doc_parts.append(f"- {li_text}")

            # Process code examples
            for pre in doc_div.find_all("pre"):
                code = pre.find("code")
                if code:
                    # Get the language from a language-* class if available,
                    # defaulting to OCaml
                    lang = "ocaml"
                    for cls in code.get("class", []):
                        if cls.startswith("language-"):
                            lang = cls.replace("language-", "")

                    # Preserve indentation and line breaks in code blocks
                    code_text = code.get_text()
                    doc_parts.append(f"```{lang}\n{code_text}\n```")

            if doc_parts:
                doc_content = "\n".join(doc_parts)

        # Only add signatures that have content
        if sig_type and sig_content:
            # Extract the name of the element (function name, type name, etc.)
            name = extract_signature_name(sig_content)

            # Build the full signature
            signature = {
                "id": sig_id,
                "type": sig_type,
                "name": name,
                "content": sig_content,
                "doc": doc_content
            }
            signatures.append(signature)

    return signatures


def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information."""
    md_lines = []

    # Module header with breadcrumbs
    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    # Add module preamble documentation if available
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Organize signatures by type
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    # Process types first
    if "type" in sig_by_type:
        md_lines.append("## Types")
        for sig in sig_by_type["type"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process exceptions
    if "exception" in sig_by_type:
        md_lines.append("## Exceptions")
        for sig in sig_by_type["exception"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process values (functions)
    if "val" in sig_by_type:
        md_lines.append("## Values")
        for sig in sig_by_type["val"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process modules
    if "module" in sig_by_type:
        md_lines.append("## Modules")
        for sig in sig_by_type["module"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process classes
    if "class" in sig_by_type:
        md_lines.append("## Classes")
        for sig in sig_by_type["class"]:
            md_lines.append("")
            md_lines.append(f"### `{sig['content']}`")

            # Add documentation if available
            if sig["doc"]:
                md_lines.append("")
                md_lines.append(sig["doc"])
        md_lines.append("")

    # Process remaining signature types
    for sig_type, sigs in sig_by_type.items():
        if sig_type not in ["type", "val", "module", "class", "exception"]:
            md_lines.append(f"## {sig_type.capitalize()}s")
            for sig in sigs:
                md_lines.append("")
                md_lines.append(f"### `{sig['content']}`")

                # Add documentation if available
                if sig["doc"]:
                    md_lines.append("")
                    md_lines.append(sig["doc"])
            md_lines.append("")

    return "\n".join(md_lines)


def read_json_file(file_path):
    """
    Read a JSON file with robust error handling for encoding issues.

    Args:
        file_path: Path to the JSON file

    Returns:
        Content of the JSON file as a string, or None if there was an error
    """
    # Try UTF-8 first (most common encoding)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Try other encodings if UTF-8 fails
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
            return None


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files."""
    hierarchy = defaultdict(list)

    for json_file in json_files:
        rel_path = os.path.relpath(json_file, root_dir)
        package_parts = rel_path.split(os.sep)

        # Skip irrelevant JSON files
        if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
            # For index.html.json, check if it's a module documentation
            if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
                json_content = read_json_file(json_file)
                if json_content:
                    try:
                        # Try to parse the module info
                        module_info = extract_module_info(json_content)
                        signatures = parse_module_signature(module_info["content"])

                        # Determine package name and version from path
                        package_name, package_version = determine_package_info(json_file, package_parts, module_info)

                        # Use package name and version for the hierarchy key
                        package_key = f"{package_name}"
                        if package_version != "unknown":
                            # Add version information to module_info for display in markdown
                            module_info["package_version"] = package_version

                        hierarchy[package_key].append({
                            "file": json_file,
                            "module_info": module_info,
                            "signatures": signatures,
                            "path_parts": package_parts
                        })
                    except Exception as e:
                        print(f"Error processing {json_file}: {e}", file=sys.stderr)

            continue

        # Try to parse other JSON files (non-index.html.json)
        json_content = read_json_file(json_file)
        if json_content:
            try:
                module_info = extract_module_info(json_content)
                signatures = parse_module_signature(module_info["content"])

                # Determine package name from path (the version is not needed here)
                package_name, _ = determine_package_info(json_file, package_parts, module_info)

                hierarchy[package_name].append({
                    "file": json_file,
                    "module_info": module_info,
                    "signatures": signatures,
                    "path_parts": package_parts
                })
            except Exception as e:
                print(f"Error processing {json_file}: {e}", file=sys.stderr)

    return hierarchy


def determine_package_info(file_path, path_parts, module_info):
    """
    Determine package name and version from file path and module info.

    Args:
        file_path: The full file path
        path_parts: Parts of the path
        module_info: Extracted module information

    Returns:
        Tuple of (package_name, package_version)
    """
    package_name = "unknown"
    package_version = "unknown"

    # Try to extract from breadcrumbs if available
    if module_info["breadcrumbs"] and any("Library" in crumb for crumb in module_info["breadcrumbs"]):
        for crumb in module_info["breadcrumbs"]:
            if "Library" in crumb:
                # Extract library name from the breadcrumb
                match = re.search(r'Library\s+(.+)', crumb)
                if match:
                    package_name = match.group(1).strip()

    # Look for test/package-name/version pattern in the path
    file_path_parts = Path(file_path).resolve().parts
    for i, part in enumerate(file_path_parts):
        if part == "test" and i + 2 < len(file_path_parts):
            # We found a test directory, extract package name and version
            package_name = file_path_parts[i + 1]
            package_version = file_path_parts[i + 2]
            break

    # If still unknown, fall back to using the first part of the path
    if package_name == "unknown" and len(path_parts) > 0:
        package_name = path_parts[0]

    # Last resort - use module name or "unknown"
    if package_name == "unknown":
        package_name = module_info["name"] if module_info["name"] else "unknown"

    return package_name, package_version


def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation."""
    # First sort by breadcrumb length (shorter = higher in hierarchy)
    # Then sort alphabetically within the same level
    return sorted(modules, key=lambda x: (
        len(x["module_info"]["breadcrumbs"]),
        x["module_info"]["breadcrumbs"][-1] if x["module_info"]["breadcrumbs"] else ""
    ))


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library."""
    md_lines = []

    md_lines.append(f"# Library: {lib_name}")
    md_lines.append("")

    # Sort modules hierarchically
    sorted_modules = sort_modules_hierarchically(modules)

    for module in sorted_modules:
        module_md = generate_markdown(module["module_info"], module["signatures"])
        md_lines.append(module_md)
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    """
    Main entry point for the script.

    Usage examples:

    # Process all packages in a directory
    python odoc2llm.py /path/to/odoc/output

    # Process all packages and specify output file
    python odoc2llm.py /path/to/odoc/output --output documentation.md

    # Process a specific package only
    python odoc2llm.py /path/to/odoc/output --package package-name

    # Enable verbose output
    python odoc2llm.py /path/to/odoc/output --verbose
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files
    json_files = []
    for root, _, files in os.walk(html_dir):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for all or specific package
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
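For a quick sanity check of the pipeline, the sketch below feeds a small, hand-written odoc-style JSON document through `extract_module_info`, `parse_module_signature`, and `generate_markdown` and prints the resulting Markdown. The `Greeting` module, its HTML fragments, and the assumption that the script is importable as `odoc2llm` (i.e. saved as `odoc2llm.py` on the import path) are illustrative only, not real odoc-driver output.

```python
# Minimal sketch: exercise the conversion pipeline on a hand-written,
# odoc-shaped JSON document (illustrative data, not real odoc-driver output).
import json

# Assumes the script above is saved as odoc2llm.py on the import path.
from odoc2llm import extract_module_info, parse_module_signature, generate_markdown

sample = {
    "header": "<h1>Module <code>Greeting</code></h1>",
    "breadcrumbs": [{"name": "Library <code>greeting</code>"}, {"name": "Greeting"}],
    "preamble": "<p>Helpers for greeting people.</p>",
    "content": (
        '<div class="odoc-spec">'
        '<div class="spec value" id="val-hello">'
        '<code><span class="keyword">val</span> hello : string -&gt; string</code>'
        '</div>'
        '<div class="spec-doc"><p>Return a greeting for the given name.</p></div>'
        '</div>'
    ),
}

# Parse the fake module page and render it the same way the CLI does.
module_info = extract_module_info(json.dumps(sample))
signatures = parse_module_signature(module_info["content"])
print(generate_markdown(module_info, signatures))
```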