···
+
import argparse
import json
import multiprocessing as mp
import os
import time
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup
+
def extract_package_info(path: str, mode: str = 'full',
                         override_package_name: Optional[str] = None,
                         override_package_version: Optional[str] = None) -> Tuple[str, str]:
    """Extract package name and version from the path.

    Args:
        path: Path to the odoc output directory.
        mode: Operating mode - 'full' for full packages list, 'single' for a single package.
        override_package_name: Optional override for package name.
        override_package_version: Optional override for package version.

    Returns:
        Tuple of (package_name, package_version). Either element may be
        "unknown" when it cannot be determined from overrides or the path.
    """
    # Always prioritize explicit overrides if provided.
    package_name = override_package_name if override_package_name else "unknown"
    package_version = override_package_version if override_package_version else "unknown"

    # If we have both overrides, no need to analyze the path at all.
    if override_package_name and override_package_version:
        return package_name, package_version

    try:
        # Use Path for more reliable path parsing.
        p = Path(path).resolve()
        parts = p.parts

        if mode == 'single':
            # In single package mode, the package name is typically the
            # directory name itself.
            if not override_package_name and parts:
                package_name = parts[-1]
        else:
            # In full mode, look at the directory structure:
            # expected layout is .../package-name/package-version/...
            subdirs = [d for d in os.listdir(path)
                       if os.path.isdir(os.path.join(path, d))]

            # If we have subdirectories that might be package names, check
            # whether the first one contains version-like subdirectories.
            if subdirs and not override_package_name:
                subdir = subdirs[0]
                version_dirs = [d for d in os.listdir(os.path.join(path, subdir))
                                if os.path.isdir(os.path.join(path, subdir, d))]
                if version_dirs:
                    # This subdirectory contains potential version directories,
                    # so it's likely a package.  Files are processed at the
                    # per-file level elsewhere, so return a default version that
                    # gets overridden during actual file processing.
                    return subdir, "unknown"

            # Otherwise determine package/version from the path itself.
            # Structure may be test/package-name/version/... — guard the
            # negative index so short paths don't raise IndexError.
            if len(parts) >= 3 and (parts[-3] == "test" or "test" in str(p)):
                package_name = parts[-2] if not override_package_name else package_name
                package_version = parts[-1] if not override_package_version else package_version
            elif len(parts) >= 2:
                # Standard structure: .../package-name/package-version/
                package_name = parts[-2] if not override_package_name else package_name
                package_version = parts[-1] if not override_package_version else package_version
    except (FileNotFoundError, PermissionError) as e:
        # Handle cases where we can't access the directory: fall back to
        # whatever name/version we have determined so far.
        print(f"Error accessing directory {path}: {str(e)}")

    return package_name, package_version
def parse_html_content(content: str) -> List[Dict[str, Any]]:
···
List of dictionaries containing extracted information
+
# Extract package and version from file path if not already properly set
+
if package_version == "unknown" or package_name == "unknown":
+
# Check if this file is in a test directory structure
+
file_path_parts = Path(file_path).resolve().parts
+
# Look for test/package-name/version pattern in the path
+
for i, part in enumerate(file_path_parts):
+
if part == "test" and i + 2 < len(file_path_parts):
+
# We found a test directory, extract package name and version
+
package_name = file_path_parts[i + 1]
+
package_version = file_path_parts[i + 2]
+
with open(file_path, 'r', encoding='utf-8') as f:
+
except json.JSONDecodeError:
+
print(f"Error decoding JSON from {file_path}")
+
except UnicodeDecodeError:
+
# Try opening with a different encoding or with errors='ignore'
+
with open(file_path, 'r', encoding='latin-1') as f:
+
except json.JSONDecodeError:
+
print(f"Error decoding JSON from {file_path} with latin-1 encoding")
+
print(f"Error reading {file_path}: {str(e)}")
if 'content' not in data:
···
+
def worker_process_files(file_batch, package_name, package_version):
    """Worker function to process a batch of files in parallel.

    Args:
        file_batch: List of files to process.
        package_name: Name of the package.
        package_version: Version of the package.

    Returns:
        List of all extracted items from all files in the batch.
    """
    batch_items = []
    for file_path in file_batch:
        items = process_json_file(file_path, package_name, package_version)
        batch_items.extend(items)
    return batch_items
+
def collect_json_files(directory):
    """Collect all odoc ``.html.json`` files in a directory recursively.

    Args:
        directory: Path to the directory to search.

    Returns:
        List of absolute/relative file paths (joined with ``directory``).
    """
    json_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))
    return json_files
+
def process_directory(directory: str, mode: str = 'full',
                      override_package_name: Optional[str] = None,
                      override_package_version: Optional[str] = None,
                      num_workers: int = 1) -> List[Dict[str, Any]]:
    """Process all JSON files in a directory recursively using multiple processes.

    Args:
        directory: Path to the directory containing odoc JSON files.
        mode: Operating mode - 'full' for full packages list, 'single' for a single package.
        override_package_name: Optional override for package name.
        override_package_version: Optional override for package version.
        num_workers: Number of worker processes to use.

    Returns:
        List of all extracted items from all files (empty when no
        ``.html.json`` files are found).
    """
    package_name, package_version = extract_package_info(
        directory, mode=mode,
        override_package_name=override_package_name,
        override_package_version=override_package_version,
    )

    # Collect all JSON files.
    json_files = collect_json_files(directory)
    total_files = len(json_files)
    if not json_files:
        print(f"No .html.json files found in {directory}")
        return []

    mode_str = "single package mode" if mode == 'single' else "full packages mode"
    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
    print(f"Using {num_workers} worker processes")

    # Create partial function with fixed package name and version.
    process_batch = partial(worker_process_files,
                            package_name=package_name,
                            package_version=package_version)

    start_time = time.time()
    all_items = []
    if num_workers > 1:
        # Split files into batches, one chunk per worker (at least 1 file each).
        batch_size = max(1, total_files // num_workers)
        batches = [json_files[i:i + batch_size]
                   for i in range(0, total_files, batch_size)]

        # Process batches in parallel with a multiprocessing Pool.
        with mp.Pool(processes=num_workers) as pool:
            results = pool.map(process_batch, batches)
        for batch_result in results:
            all_items.extend(batch_result)
    else:
        # Single worker: process everything in this process, no pool overhead.
        all_items = process_batch(json_files)

    elapsed_time = time.time() - start_time
    print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds")
    print(f"Extracted {len(all_items)} items total")
    return all_items
+
def main():
    """Main entry point for the script.

    Examples:
        # Process in full mode (multiple packages)
        python odoc2json.py /path/to/odoc/output output.json

        # Process a single package with automatic detection
        python odoc2json.py /path/to/odoc/package output.json --mode single

        # Process with explicit package name and version
        python odoc2json.py /path/to/odoc/package output.json --mode single \\
            --package-name package-name --package-version 5.0.0

        # Process with multiple cores
        python odoc2json.py /path/to/odoc/output output.json --workers 8
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    parser.add_argument('--mode', choices=['full', 'single'], default='full',
                        help='Run mode: "full" for complete list of packages, "single" for a single package')
    parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
    parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
    parser.add_argument('--workers', type=int, default=mp.cpu_count(),
                        help=f'Number of worker processes (default: {mp.cpu_count()})')
    args = parser.parse_args()

    start_time = time.time()
    print(f"Starting extraction from {args.input_dir} in {args.mode} mode")

    # Process all files in the directory with multiple workers.
    items = process_directory(
        args.input_dir,
        mode=args.mode,
        override_package_name=args.package_name,
        override_package_version=args.package_version,
        num_workers=args.workers,
    )

    print(f"Writing {len(items)} items to {args.output_file}...")
    with open(args.output_file, 'w', encoding='utf-8') as f:
        # --pretty selects indented output; otherwise write compact JSON.
        if args.pretty:
            json.dump(items, f, indent=2, ensure_ascii=False)
        else:
            json.dump(items, f, ensure_ascii=False)

    elapsed_time = time.time() - start_time
    print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds")
    print(f"Output saved to {args.output_file}")
if __name__ == "__main__":