#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "beautifulsoup4",
# ]
# ///
"""
odoc2json.py - Convert odoc JSON output to structured JSON records

This script parses the JSON output files from odoc-driver (an OCaml documentation
generator) and converts them into structured JSON records that include package name,
version, and each function signature with associated documentation.

The output is intended for further processing, analysis, and search over OCaml type
signatures, especially for loading into columnar formats like Parquet.
"""

import argparse
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple

from bs4 import BeautifulSoup


def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    The path is expected to end in ".../package_name/version". The filesystem
    root is never reported as a package name or version.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version). A component that cannot be
        determined from the path is reported as "unknown".
    """
    # Use Path for more reliable path parsing
    p = Path(path).resolve()

    # resolve() always yields an absolute path, so the first entry of p.parts
    # is the root anchor rather than a real directory name; drop it so that
    # it cannot be mistaken for a package name.
    parts = [part for part in p.parts if part != p.anchor]

    if len(parts) >= 2:
        # The package name is the second-to-last component and the version
        # is the last component.
        return parts[-2], parts[-1]
    if len(parts) == 1:
        # If only one component, assume it's the package name
        return parts[0], "unknown"
    return "unknown", "unknown"


def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information. Every entry has
        a 'kind' key; 'name', 'signature' and 'documentation' are present only
        when they could be extracted from the corresponding block.
    """
    known_kinds = ('type', 'value', 'module', 'class', 'exception', 'constructor')

    soup = BeautifulSoup(content, 'html.parser')
    result: List[Dict[str, Any]] = []

    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        # Get the spec element (contains the signature)
        spec_elem = spec.find(class_="spec")
        if spec_elem is None:
            continue

        # Determine the kind of element from the first recognised CSS class
        kind = next(
            (cls for cls in spec_elem.get('class', []) if cls in known_kinds),
            None,
        )
        if not kind:
            continue

        item: Dict[str, Any] = {'kind': kind}

        # Extract the signature first to use for name extraction if needed
        code_elem = spec_elem.find('code')
        signature = code_elem.get_text() if code_elem is not None else ""

        # First try to get name from anchor ID
        name = None
        anchor = spec_elem.find('a', class_="anchor")
        if anchor is not None and anchor.get('id'):
            item_id = anchor.get('id')
            # Clean up the ID to get the name
            name = item_id.split('.')[-1] if '.' in item_id else item_id
            # Remove prefixes like 'type-', 'val-', etc.
            name = re.sub(r'^(type|val|module|class|exception)-', '', name)

        # For values (functions), extract the name from signature as a fallback
        # This handles cases where the anchor doesn't contain the function name
        if kind == 'value' and not name and signature:
            # Look for "val name :" pattern in the signature. OCaml
            # identifiers may contain primes (e.g. x'), which \w alone
            # would not match.
            val_match = re.search(r"val\s+([\w']+)\s*:", signature)
            if val_match:
                name = val_match.group(1)

        if name:
            item['name'] = name

        # Add the processed signature
        if signature:
            # Replace newlines and multiple whitespace with a single space
            signature = re.sub(r'\s+', ' ', signature)
            item['signature'] = signature.strip()

        # Extract documentation
        doc_elem = spec.find(class_="spec-doc")
        if doc_elem is not None:
            # Get the raw HTML content and remove all HTML tags
            html_content = str(doc_elem)
            # First, convert <br> tags to spaces so adjacent lines do not
            # run together once the tags are stripped
            html_content = re.sub(r'<br\s*/?\s*>', ' ', html_content)
            # Parse the modified HTML
            soup_doc = BeautifulSoup(html_content, 'html.parser')
            # Get text with all whitespace normalized
            doc = soup_doc.get_text()
            # Replace all newlines and multiple spaces with a single space
            doc = re.sub(r'\s+', ' ', doc)
            item['documentation'] = doc.strip()

        # Add the item to our results
        result.append(item)

    return result


def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information. An empty list
        is returned when the file cannot be decoded, is not a JSON object, or
        has no 'content' entry.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A file that is not valid UTF-8 is skipped the same way as one that
        # is not valid JSON, so one bad file does not abort a whole run.
        print(f"Error decoding JSON from {file_path}")
        return []

    if not isinstance(data, dict) or 'content' not in data:
        return []

    # Extract module path from breadcrumbs, ignoring entries without a name
    # (a missing name would otherwise break the join below)
    module_path = [
        crumb['name']
        for crumb in data.get('breadcrumbs') or []
        if isinstance(crumb, dict)
        and crumb.get('kind') == 'module'
        and crumb.get('name')
    ]

    # Fall back to the name of the containing directory when the breadcrumbs
    # do not name any module
    module_name = ".".join(module_path) if module_path else os.path.basename(os.path.dirname(file_path))

    # Extract items from the content
    items = parse_html_content(data['content'])

    # Add package and module information to each item
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name

        # Create a full path for the item that includes the item name
        # - module_name: just the module hierarchy (e.g., "Math.Operations")
        # - full_path: complete path including item name (e.g., "Math.Operations.add")
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items


def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively.

    Directories and files are visited in sorted order so that the same input
    tree always produces the records in the same order.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    all_items: List[Dict[str, Any]] = []
    package_name, package_version = extract_package_info(directory)

    for root, dirs, files in os.walk(directory):
        # Sorting in place tells os.walk which order to descend in; without
        # it the order depends on the underlying filesystem.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.html.json'):
                file_path = os.path.join(root, file)
                items = process_json_file(file_path, package_name, package_version)
                all_items.extend(items)

    return all_items


def main() -> None:
    """
    Command-line entry point.

    Parses the arguments, processes every odoc JSON file under the input
    directory and writes the collected records to the output file as JSON.
    Exits with a usage error when the input directory does not exist.
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = parser.parse_args()

    # os.walk yields nothing for a missing directory, which would otherwise
    # produce an empty output file without any indication of the mistake.
    if not os.path.isdir(args.input_dir):
        parser.error(f"input_dir is not a directory: {args.input_dir}")

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # Write the output; indent=None is json.dump's compact default
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=2 if args.pretty else None, ensure_ascii=False)

    print(f"Processed {len(items)} items and saved to {args.output_file}")


if __name__ == "__main__":