Take OCaml odoc output into MCP

fix paths

Changed files
+303 -46
+176 -19
odoc2json.py
···
import os
import json
import re
+
import time
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
import argparse
from pathlib import Path
-
def _list_subdirs(directory: str) -> List[str]:
    """Return the names of the immediate subdirectories of *directory*, sorted."""
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # package detection below deterministic across platforms and runs.
    return sorted(
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )


def extract_package_info(path: str, mode: str = 'full',
                         override_package_name: Optional[str] = None,
                         override_package_version: Optional[str] = None) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    Explicit overrides always take precedence over anything detected from
    the directory layout. A value that can be neither overridden nor
    detected is reported as "unknown".

    Args:
        path: Path to the odoc output directory
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version

    Returns:
        Tuple of (package_name, package_version)
    """
    package_name = override_package_name or "unknown"
    package_version = override_package_version or "unknown"

    # If we have both overrides, no need to analyze the path at all.
    if override_package_name and override_package_version:
        return package_name, package_version

    # Use Path for more reliable path parsing.
    parts = list(Path(path).resolve().parts)

    if mode == 'single':
        # In single package mode the package name is the directory name,
        # unless the directory contains a subdirectory, in which case the
        # (alphabetically first) subdirectory name is used instead.
        if not override_package_name and parts:
            package_name = parts[-1]
            try:
                subdirs = _list_subdirs(path)
            except OSError as e:
                # Unreadable directory: keep the name taken from the path.
                print(f"Error accessing directory {path}: {str(e)}")
            else:
                if subdirs:
                    package_name = subdirs[0]

    elif mode == 'full':
        # Expected layout: <path>/package-name/package-version/...
        try:
            subdirs = _list_subdirs(path)
            if not override_package_name:
                for subdir in subdirs:
                    # A subdirectory that itself contains directories looks
                    # like package-name/version; report it as the package.
                    # The version stays at its default/override here; callers
                    # may refine it per file.
                    if _list_subdirs(os.path.join(path, subdir)):
                        return subdir, package_version
        except OSError as e:
            # Handle cases where we can't access the directory.
            print(f"Error accessing directory {path}: {str(e)}")
            return package_name, package_version

        # No package structure below `path`: assume `path` itself is
        # .../package-name/package-version and read both from its tail.
        if len(parts) >= 3:
            if not override_package_name:
                package_name = parts[-2]
            if not override_package_version:
                package_version = parts[-1]

    return package_name, package_version
def parse_html_content(content: str) -> List[Dict[str, Any]]:
···
Returns:
List of dictionaries containing extracted information
"""
-
with open(file_path, 'r', encoding='utf-8') as f:
+
# Extract package and version from file path if not already properly set
+
if package_version == "unknown" or package_name == "unknown":
+
# Check if this file is in a test directory structure
+
file_path_parts = Path(file_path).resolve().parts
+
+
# Look for test/package-name/version pattern in the path
+
for i, part in enumerate(file_path_parts):
+
if part == "test" and i + 2 < len(file_path_parts):
+
# We found a test directory, extract package name and version
+
package_name = file_path_parts[i + 1]
+
package_version = file_path_parts[i + 2]
+
break
+
+
try:
+
with open(file_path, 'r', encoding='utf-8') as f:
+
try:
+
data = json.load(f)
+
except json.JSONDecodeError:
+
print(f"Error decoding JSON from {file_path}")
+
return []
+
except UnicodeDecodeError:
+
# Try opening with a different encoding or with errors='ignore'
try:
-
data = json.load(f)
-
except json.JSONDecodeError:
-
print(f"Error decoding JSON from {file_path}")
+
with open(file_path, 'r', encoding='latin-1') as f:
+
try:
+
data = json.load(f)
+
except json.JSONDecodeError:
+
print(f"Error decoding JSON from {file_path} with latin-1 encoding")
+
return []
+
except Exception as e:
+
print(f"Error reading {file_path}: {str(e)}")
return []
if 'content' not in data:
···
return items
-
def process_directory(directory: str) -> List[Dict[str, Any]]:
+
def process_directory(directory: str, mode: str = 'full',
+
override_package_name: Optional[str] = None,
+
override_package_version: Optional[str] = None) -> List[Dict[str, Any]]:
"""
Process all JSON files in a directory recursively.
Args:
directory: Path to the directory containing odoc JSON files
+
mode: Operating mode - 'full' for full packages list, 'single' for a single package
+
override_package_name: Optional override for package name
+
override_package_version: Optional override for package version
Returns:
List of all extracted items from all files
"""
all_items = []
-
package_name, package_version = extract_package_info(directory)
+
package_name, package_version = extract_package_info(
+
directory,
+
mode=mode,
+
override_package_name=override_package_name,
+
override_package_version=override_package_version
+
)
+
+
# First count total files to process for progress tracking
+
total_files = 0
+
for root, _, files in os.walk(directory):
+
for file in files:
+
if file.endswith('.html.json'):
+
total_files += 1
+
+
if total_files == 0:
+
print(f"No .html.json files found in {directory}")
+
return all_items
+
+
mode_str = f"single package mode" if mode == 'single' else "full packages mode"
+
print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
+
+
# Process each file with progress indicator
+
processed_files = 0
+
extracted_items = 0
for root, _, files in os.walk(directory):
for file in files:
···
file_path = os.path.join(root, file)
items = process_json_file(file_path, package_name, package_version)
all_items.extend(items)
+
+
# Update progress
+
processed_files += 1
+
extracted_items += len(items)
+
+
# Print progress every 100 files or on the last file
+
if processed_files % 100 == 0 or processed_files == total_files:
+
percent = (processed_files / total_files) * 100
+
print(f"Progress: {processed_files}/{total_files} files ({percent:.1f}%) - {extracted_items} items extracted",
+
end="\r", flush=True)
+
print(f"\nCompleted processing {processed_files} files - extracted {extracted_items} items total.")
return all_items
def main():
    """
    Command-line entry point: convert an odoc JSON tree into one JSON file.

    Usage examples:

        # Process in full mode (multiple packages)
        python odoc2json.py /path/to/odoc/output output.json

        # Process a single package with automatic detection
        python odoc2json.py /path/to/odoc/package output.json --mode single

        # Process with explicit package name and version
        python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
    """
    cli = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    cli.add_argument('input_dir', help='Directory containing odoc JSON output')
    cli.add_argument('output_file', help='Output JSON file path')
    cli.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    cli.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    cli.add_argument(
        '--mode',
        choices=['full', 'single'],
        default='full',
        help='Run mode: "full" for complete list of packages, "single" for a single package',
    )
    cli.add_argument('--package-name', help='Override the package name (useful in single mode)')
    cli.add_argument('--package-version', help='Override the package version (useful in single mode)')
    args = cli.parse_args()

    # Timing covers extraction and writing, not argument parsing.
    began = time.time()
    print(f"Starting extraction from {args.input_dir} in {args.mode} mode")

    # Walk the input directory and collect every extracted record.
    items = process_directory(
        args.input_dir,
        mode=args.mode,
        override_package_name=args.package_name,
        override_package_version=args.package_version,
    )

    # Serialise the records; --pretty only switches on indentation.
    print(f"Writing {len(items)} items to {args.output_file}...")
    indent = 2 if args.pretty else None
    with open(args.output_file, 'w', encoding='utf-8') as out:
        json.dump(items, out, indent=indent, ensure_ascii=False)

    elapsed_time = time.time() - began
    print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds")
    print(f"Output saved to {args.output_file}")
if __name__ == "__main__":
+127 -27
odoc2llm.py
···
def extract_module_info(json_content):
"""Extract module information from odoc JSON content."""
-
data = json.loads(json_content)
+
try:
+
data = json.loads(json_content)
+
except json.JSONDecodeError as e:
+
print(f"JSON decode error: {e}")
+
# Return a minimal structure that won't cause errors downstream
+
return {
+
"name": "Unknown",
+
"type": "Module",
+
"breadcrumbs": [],
+
"content": BeautifulSoup("", "html.parser"),
+
"preamble": ""
+
}
# Extract module name and type from header
header = data.get("header", "")
···
return "\n".join(md_lines)
+
def read_json_file(file_path):
    """
    Read a JSON file with robust error handling for encoding issues.

    The file is decoded as UTF-8 first; if that fails with a decoding
    error it is re-read as latin-1. I/O failures (missing file, permission
    denied, path is a directory, ...) are reported on stderr instead of
    being raised, so one bad file does not abort a whole run.

    Args:
        file_path: Path to the JSON file

    Returns:
        Content of the JSON file as a string, or None if there was an error
    """
    # UTF-8 is tried first (most common encoding); latin-1 is the fallback.
    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Not valid in this encoding; try the next candidate.
            continue
        except OSError as e:
            print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
            return None
    return None
+
+
def build_module_hierarchy(json_files, root_dir):
"""Build a hierarchical structure from all the JSON files."""
hierarchy = defaultdict(list)
···
if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
# For index.html.json, check if it's a module documentation
if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
-
try:
-
with open(json_file, 'r', encoding='utf-8') as f:
-
json_content = f.read()
-
-
# Try to parse the module info
-
module_info = extract_module_info(json_content)
-
signatures = parse_module_signature(module_info["content"])
-
-
# Group by package/library
-
if len(package_parts) > 1:
-
package_name = package_parts[0]
-
hierarchy[package_name].append({
+
json_content = read_json_file(json_file)
+
if json_content:
+
try:
+
# Try to parse the module info
+
module_info = extract_module_info(json_content)
+
signatures = parse_module_signature(module_info["content"])
+
+
# Determine package name and version from path
+
package_name, package_version = determine_package_info(json_file, package_parts, module_info)
+
+
# Use package name and version for the hierarchy key
+
package_key = f"{package_name}"
+
if package_version != "unknown":
+
# Add version information to module_info for display in markdown
+
module_info["package_version"] = package_version
+
+
hierarchy[package_key].append({
"file": json_file,
"module_info": module_info,
"signatures": signatures,
"path_parts": package_parts
})
-
except Exception as e:
-
print(f"Error processing {json_file}: {e}", file=sys.stderr)
+
except Exception as e:
+
print(f"Error processing {json_file}: {e}", file=sys.stderr)
continue
# Try to parse other JSON files (non-index.html.json)
-
try:
-
with open(json_file, 'r', encoding='utf-8') as f:
-
json_content = f.read()
-
-
module_info = extract_module_info(json_content)
-
signatures = parse_module_signature(module_info["content"])
-
-
# Group by package/library
-
if len(package_parts) > 1:
-
package_name = package_parts[0]
+
json_content = read_json_file(json_file)
+
if json_content:
+
try:
+
module_info = extract_module_info(json_content)
+
signatures = parse_module_signature(module_info["content"])
+
+
# Determine package name from path
+
package_name = determine_package_name(package_parts, module_info)
+
hierarchy[package_name].append({
"file": json_file,
"module_info": module_info,
"signatures": signatures,
"path_parts": package_parts
})
-
except Exception as e:
-
print(f"Error processing {json_file}: {e}", file=sys.stderr)
+
except Exception as e:
+
print(f"Error processing {json_file}: {e}", file=sys.stderr)
return hierarchy
+
def determine_package_info(file_path, path_parts, module_info):
    """
    Work out which package (and version) a documentation file belongs to.

    Sources are consulted in this order, later ones overriding earlier ones
    where noted:

    1. A "Library <name>" breadcrumb in ``module_info["breadcrumbs"]``
       (the last such breadcrumb wins).
    2. A ``test/<package-name>/<version>`` run of components in the resolved
       file path; the first match overrides the breadcrumb name and is the
       only source of a version.
    3. The first element of ``path_parts``, if the name is still unknown.
    4. ``module_info["name"]``, as a last resort.

    Args:
        file_path: The full file path
        path_parts: Parts of the path
        module_info: Extracted module information

    Returns:
        Tuple of (package_name, package_version); either may be "unknown"
    """
    name = "unknown"
    version = "unknown"

    # Breadcrumbs: pick up the library name, if one is mentioned.
    for crumb in module_info["breadcrumbs"] or []:
        if "Library" not in crumb:
            continue
        found = re.search(r'Library\s+(.+)', crumb)
        if found:
            name = found.group(1).strip()

    # Path: look for the first "test" component followed by two more parts.
    resolved = Path(file_path).resolve().parts
    for here, after, after_that in zip(resolved, resolved[1:], resolved[2:]):
        if here == "test":
            name, version = after, after_that
            break

    # Fall back to the leading path component.
    if name == "unknown" and len(path_parts) >= 1:
        name = path_parts[0]

    # Last resort: the module's own name, when it has one.
    if name == "unknown":
        name = module_info["name"] or "unknown"

    return name, version
+
+
def sort_modules_hierarchically(modules):
"""Sort modules to ensure proper hierarchical presentation."""
# First sort by breadcrumb length (shorter = higher in hierarchy)
···
def main():
+
"""
+
Main entry point for the script.
+
+
Usage examples:
+
+
# Process all packages in a directory
+
python odoc2llm.py /path/to/odoc/output
+
+
# Process all packages and specify output file
+
python odoc2llm.py /path/to/odoc/output --output documentation.md
+
+
# Process a specific package only
+
python odoc2llm.py /path/to/odoc/output --package package-name
+
+
# Enable verbose output
+
python odoc2llm.py /path/to/odoc/output --verbose
+
"""
parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')