···
+import multiprocessing as mp
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
+from functools import partial

-def extract_package_info(path: str) -> Tuple[str, str]:
+def extract_package_info(path: str, mode: str = 'full',
+                         override_package_name: Optional[str] = None,
+                         override_package_version: Optional[str] = None) -> Tuple[str, str]:
    Extract package name and version from the path.

        path: Path to the odoc output directory
+        mode: Operating mode - 'full' for full packages list, 'single' for a single package
+        override_package_name: Optional override for package name
+        override_package_version: Optional override for package version

        Tuple of (package_name, package_version)
+    # Always prioritize explicit overrides if provided
+    if override_package_name:
+        package_name = override_package_name
+    else:
+        package_name = "unknown"
+
+    if override_package_version:
+        package_version = override_package_version
+    else:
+        package_version = "unknown"
+
+    # If we have both overrides, no need to analyze path
+    if override_package_name and override_package_version:
+        return package_name, package_version
    # Use Path for more reliable path parsing
-    # If the path is in the format ".../package_name/version/..."
-    if len(parts) >= 2:
-        # The package name is typically the second-to-last component
-        # The version is typically the last component
-        return parts[-2], parts[-1]
-    elif len(parts) == 1:
-        # If only one component, assume it's the package name
-        return parts[0], "unknown"
-
-    return "unknown", "unknown"
+    try:
+        if mode == 'single':
+            # In single package mode, the package name is typically the directory name
+            if not override_package_name and parts:
+                # Extract package name from the last part of the path
+                package_name = parts[-1]
+
+                # Check if there's a subdirectory in the path that seems like a package name
+                subdir = next((d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))), None)
+                if subdir:
+                    package_name = subdir
+
+        elif mode == 'full':
+            # In full mode, we need to look at the directory structure more carefully
+            # For test/ directory, the structure is test/package-name/package-version/
+
+            # First, check if the directory structure matches the expected pattern
+            # Look for subdirectories in the current path
+            subdirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
+
+            # If we have subdirectories that might be package names
+            if subdirs and not override_package_name:
+                # For each subdirectory (potential package name), check if it contains version subdirectories
+                for subdir in subdirs:
+                    version_dirs = [d for d in os.listdir(os.path.join(path, subdir))
+                                    if os.path.isdir(os.path.join(path, subdir, d))]
+
+                    # If this subdirectory contains potential version directories, it's likely a package
+                    if version_dirs:
+                        # We'll use the current file's path to determine which package and version it belongs to
+                        # We're processing files at the specific file level elsewhere, so here we just return
+                        # default values which will be overridden during actual file processing
+                        return subdir, "unknown"
+
+            # If we found no package structure or we're processing a file already in a package context
+            # In this case, we'll determine package/version from the path of the file being processed
+            if len(parts) >= 3:
+                # Path structure might be test/package-name/version/...
+                # Check if the first part is "test"
+                if parts[-3] == "test" or "test" in str(p):
+                    package_name = parts[-2] if not override_package_name else package_name
+                    package_version = parts[-1] if not override_package_version else package_version
+                else:
+                    # Standard structure: .../package-name/package-version/...
+                    package_name = parts[-2] if not override_package_name else package_name
+                    package_version = parts[-1] if not override_package_version else package_version
+    except (FileNotFoundError, PermissionError) as e:
+        # Handle cases where we can't access the directory
+        print(f"Error accessing directory {path}: {str(e)}")
+
+    return package_name, package_version
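
For orientation, a rough sketch of how the reworked extract_package_info is expected to behave; the path and version below are invented for illustration, and the un-overridden result depends on what the directory scan actually finds on disk:

    # Hypothetical call on a test/<package>/<version> layout; names are made up.
    name, version = extract_package_info("test/foo/1.2.3", mode='full')
    # Likely ("foo", "unknown") from the directory scan, or ("foo", "1.2.3")
    # from the path fallback, depending on the on-disk structure.

    # Explicit overrides short-circuit the path analysis entirely.
    name, version = extract_package_info("test/foo/1.2.3", mode='full',
                                          override_package_name="foo",
                                          override_package_version="1.2.3")
    # -> ("foo", "1.2.3")
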
def parse_html_content(content: str) -> List[Dict[str, Any]]:
···
        List of dictionaries containing extracted information
-        with open(file_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-    except json.JSONDecodeError:
-        print(f"Error decoding JSON from {file_path}")
+    # Extract package and version from file path if not already properly set
+    if package_version == "unknown" or package_name == "unknown":
+        # Check if this file is in a test directory structure
+        file_path_parts = Path(file_path).resolve().parts
+
+        # Look for test/package-name/version pattern in the path
+        for i, part in enumerate(file_path_parts):
+            if part == "test" and i + 2 < len(file_path_parts):
+                # We found a test directory, extract package name and version
+                package_name = file_path_parts[i + 1]
+                package_version = file_path_parts[i + 2]
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except json.JSONDecodeError:
+        print(f"Error decoding JSON from {file_path}")
+    except UnicodeDecodeError:
+        # Try opening with a different encoding or with errors='ignore'
+        try:
+            with open(file_path, 'r', encoding='latin-1') as f:
+                data = json.load(f)
+        except json.JSONDecodeError:
+            print(f"Error decoding JSON from {file_path} with latin-1 encoding")
+    except Exception as e:
+        print(f"Error reading {file_path}: {str(e)}")

    if 'content' not in data:
···
-def process_directory(directory: str) -> List[Dict[str, Any]]:
+def worker_process_files(file_batch, package_name, package_version):
+    """
+    Worker function to process a batch of files in parallel.
+
+        file_batch: List of files to process
+        package_name: Name of the package
+        package_version: Version of the package
+
+        List of all extracted items from all files in the batch
+    """
+    batch_items = []
+    for file_path in file_batch:
+        items = process_json_file(file_path, package_name, package_version)
+        batch_items.extend(items)
+    return batch_items
+
+
+def collect_json_files(directory):
+    """
+    Collect all JSON files in a directory recursively.
+
+        directory: Path to the directory to search
+    """
+    json_files = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.html.json'):
+                json_files.append(os.path.join(root, file))
+    return json_files
+
+
+def process_directory(directory: str, mode: str = 'full',
+                      override_package_name: Optional[str] = None,
+                      override_package_version: Optional[str] = None,
+                      num_workers: int = 1) -> List[Dict[str, Any]]:
    """
-    Process all JSON files in a directory recursively.
+    Process all JSON files in a directory recursively using multiple processes.

        directory: Path to the directory containing odoc JSON files
+        mode: Operating mode - 'full' for full packages list, 'single' for a single package
+        override_package_name: Optional override for package name
+        override_package_version: Optional override for package version
+        num_workers: Number of worker processes to use

        List of all extracted items from all files
    """
+    package_name, package_version = extract_package_info(
+        directory,
+        mode=mode,
+        override_package_name=override_package_name,
+        override_package_version=override_package_version
+    )
+
+    # Collect all JSON files
+    json_files = collect_json_files(directory)
+    total_files = len(json_files)
+
+    if total_files == 0:
+        print(f"No .html.json files found in {directory}")
+        return []
+
+    mode_str = "single package mode" if mode == 'single' else "full packages mode"
+    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
+    print(f"Using {num_workers} worker processes")
+
+    # Split files into batches for workers
+    batches = []
+    batch_size = max(1, total_files // num_workers)
+    for i in range(0, total_files, batch_size):
+        batches.append(json_files[i:i + batch_size])
+
+    # Create partial function with fixed package name and version
+    process_batch = partial(worker_process_files, package_name=package_name, package_version=package_version)
+
+    # Process batches in parallel
+    start_time = time.time()
-    package_name, package_version = extract_package_info(directory)
-    for root, _, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.html.json'):
-                file_path = os.path.join(root, file)
-                items = process_json_file(file_path, package_name, package_version)
-                all_items.extend(items)
+    all_items = []
+    if num_workers > 1:
+        # Use multiprocessing Pool
+        with mp.Pool(processes=num_workers) as pool:
+            # Submit all batches to the pool
+            results = pool.map(process_batch, batches)
+            # Collect all results
+            for batch_result in results:
+                all_items.extend(batch_result)
+    else:
+        # Single process mode
+        all_items = process_batch(json_files)
+
+    elapsed_time = time.time() - start_time
+    print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds")
+    print(f"Extracted {len(all_items)} items total")
+    """
+    Main entry point for the script.
+
+    # Process in full mode (multiple packages)
+    python odoc2json.py /path/to/odoc/output output.json
+
+    # Process a single package with automatic detection
+    python odoc2json.py /path/to/odoc/package output.json --mode single
+
+    # Process with explicit package name and version
+    python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
+
+    # Process with multiple cores
+    python odoc2json.py /path/to/odoc/output output.json --workers 8
+    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
+    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
+    parser.add_argument('--mode', choices=['full', 'single'], default='full',
+                        help='Run mode: "full" for complete list of packages, "single" for a single package')
+    parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
+    parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
+    parser.add_argument('--workers', type=int, default=mp.cpu_count(),
+                        help=f'Number of worker processes (default: {mp.cpu_count()})')
    args = parser.parse_args()
-    # Process all files in the directory
-    items = process_directory(args.input_dir)
+    start_time = time.time()
+    print(f"Starting extraction from {args.input_dir} in {args.mode} mode")
+
+    # Process all files in the directory with multiple workers
+    items = process_directory(
+        args.input_dir,
+        mode=args.mode,
+        override_package_name=args.package_name,
+        override_package_version=args.package_version,
+        num_workers=args.workers
+    )
+
+    print(f"Writing {len(items)} items to {args.output_file}...")
    with open(args.output_file, 'w', encoding='utf-8') as f:
        if args.pretty:
            json.dump(items, f, indent=2, ensure_ascii=False)
        else:
            json.dump(items, f, ensure_ascii=False)
-    print(f"Processed {len(items)} items and saved to {args.output_file}")
+    elapsed_time = time.time() - start_time
+    print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds")
+    print(f"Output saved to {args.output_file}")
if __name__ == "__main__":