Take OCaml odoc output into MCP

Add a version that outputs structured JSON records suitable for loading into Parquet.

Changed files
+240
+240
odoc2json.py
···
+
#!/usr/bin/env python3
+
# /// script
+
# requires-python = ">=3.11"
+
# dependencies = [
+
# "bs4",
+
# ]
+
# ///
+
"""
+
odoc2json.py - Convert odoc JSON output to structured JSON records
+
+
This script parses the JSON output files from odoc-driver (an OCaml documentation
+
generator) and converts them into structured JSON records that include package name,
+
version, and each function signature with associated documentation.
+
+
The output is intended for further processing, analysis, and search over OCaml type
+
signatures, especially for loading into columnar formats like Parquet.
+
"""
+
+
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup
+
+
+
def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Derive (package_name, package_version) from an odoc output directory path.

    The odoc-driver layout ends in ".../package_name/version", so the last two
    path components are taken as the name and the version respectively.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version); "unknown" fills any
        component that cannot be determined.
    """
    # Resolve first so relative paths and "." / ".." segments are normalized.
    components = Path(path).resolve().parts

    if len(components) >= 2:
        # Layout ".../package_name/version": version last, name second-to-last.
        return components[-2], components[-1]
    if len(components) == 1:
        # A single component is taken to be the package name alone.
        return components[0], "unknown"
    return "unknown", "unknown"
+
+
+
def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Extract signatures and documentation from odoc-generated HTML markup.

    Walks every "odoc-spec" block in the markup and pulls out the item's
    kind (type, value, module, ...), its name, its full signature text,
    and any attached prose documentation.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information
    """
    recognized_kinds = ('type', 'value', 'module', 'class', 'exception', 'constructor')

    def squash_ws(text: str) -> str:
        # Collapse every run of whitespace (including newlines) to one space.
        return re.sub(r'\s+', ' ', text).strip()

    markup = BeautifulSoup(content, 'html.parser')
    records: List[Dict[str, Any]] = []

    # Each "odoc-spec" block describes one declaration (function, type, ...).
    for block in markup.find_all(class_="odoc-spec"):
        decl = block.find(class_="spec")
        if not decl:
            continue

        # The spec's CSS classes identify what kind of declaration this is.
        kind = next((c for c in decl.get('class', []) if c in recognized_kinds), None)
        if not kind:
            continue

        entry: Dict[str, Any] = {'kind': kind}

        # Raw signature text; also used as a fallback source for the name.
        code_node = decl.find('code')
        raw_signature = code_node.get_text() if code_node else ""

        # Preferred name source: the anchor element's id attribute.
        name = None
        anchor = decl.find('a', class_="anchor")
        if anchor and anchor.get('id'):
            anchor_id = anchor.get('id')
            # Keep only the last dotted segment, then drop kind prefixes
            # such as "type-" or "val-".
            name = anchor_id.split('.')[-1] if '.' in anchor_id else anchor_id
            name = re.sub(r'^(type|val|module|class|exception)-', '', name)

        # Fallback for values whose anchor lacks the function name:
        # pull the identifier out of the "val <name> : ..." signature.
        if kind == 'value' and not name and raw_signature:
            val_match = re.search(r'val\s+(\w+)\s*:', raw_signature)
            if val_match:
                name = val_match.group(1)

        if name:
            entry['name'] = name

        if raw_signature:
            entry['signature'] = squash_ws(raw_signature)

        # Documentation: turn <br> tags into spaces first so line breaks
        # survive as word separators, then strip all remaining markup.
        doc_node = block.find(class_="spec-doc")
        if doc_node:
            flattened = re.sub(r'<br\s*/?\s*>', ' ', str(doc_node))
            plain_text = BeautifulSoup(flattened, 'html.parser').get_text()
            entry['documentation'] = squash_ws(plain_text)

        records.append(entry)

    return records
+
+
+
def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information; empty if the
        file is not valid JSON or has no "content" field.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Diagnostics go to stderr so they never pollute piped/captured
            # output (fix: this previously printed to stdout).
            print(f"Error decoding JSON from {file_path}", file=sys.stderr)
            return []

    if 'content' not in data:
        return []

    # Reconstruct the dotted module path (e.g. "Math.Operations") from the
    # breadcrumb trail. Skip crumbs without a name — appending None here
    # previously crashed the ".".join below with a TypeError.
    module_path = []
    if 'breadcrumbs' in data:
        for crumb in data['breadcrumbs']:
            if crumb.get('kind') == 'module' and crumb.get('name'):
                module_path.append(crumb['name'])

    # Fall back to the containing directory name when no module breadcrumbs exist.
    module_name = ".".join(module_path) if module_path else os.path.basename(os.path.dirname(file_path))

    # Extract items from the rendered HTML content.
    items = parse_html_content(data['content'])

    # Attach package and module metadata to each extracted item.
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name

        # Create a full path for the item that includes the item name:
        # - module_name: just the module hierarchy (e.g., "Math.Operations")
        # - full_path: complete path including item name (e.g., "Math.Operations.add")
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items
+
+
+
def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Recursively collect items from every odoc JSON file under a directory.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    pkg, version = extract_package_info(directory)
    collected: List[Dict[str, Any]] = []

    # Only files produced by odoc-driver (".html.json") are of interest.
    for root, _dirs, filenames in os.walk(directory):
        collected.extend(
            item
            for fname in filenames
            if fname.endswith('.html.json')
            for item in process_json_file(os.path.join(root, fname), pkg, version)
        )

    return collected
+
+
+
def main():
    """CLI entry point: parse arguments, convert a directory, write the JSON."""
    arg_parser = argparse.ArgumentParser(
        description='Convert odoc JSON to structured JSON records')
    arg_parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    arg_parser.add_argument('output_file', help='Output JSON file path')
    arg_parser.add_argument('--pretty', action='store_true',
                            help='Pretty-print the JSON output')
    opts = arg_parser.parse_args()

    # Walk the input tree and collect every extracted record.
    records = process_directory(opts.input_dir)

    # Serialize; indent only when pretty-printing was requested
    # (indent=None is json.dump's compact default).
    with open(opts.output_file, 'w', encoding='utf-8') as out:
        json.dump(records, out, ensure_ascii=False,
                  indent=2 if opts.pretty else None)

    print(f"Processed {len(records)} items and saved to {opts.output_file}")
+
+
+
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()