commit dd481f9974fd400bf234c4d273cef076adcbf394 · anil.recoil.org/odoc-mcp

+176 -19

odoc2json.py

···

       19
       19
        
       import os

     

       20
       20
        
       import json

     

       21
       21
        
       import re

     

       22
       22
       +
       import time

     

       22
       23
        
       from bs4 import BeautifulSoup

     

       23
       24
        
       from typing import Dict, List, Any, Optional, Tuple

     

       24
       25
        
       import argparse

     

       25
       26
        
       from pathlib import Path

     

       26
       27
        
       

     

       27
       28
        
       

     

       28
       28
       -
       def extract_package_info(path: str, mode: str = 'full',
                                override_package_name: Optional[str] = None,
                                override_package_version: Optional[str] = None) -> Tuple[str, str]:
           """
           Work out the package name and version for an odoc output directory.

           Explicit overrides always win. Whatever is not overridden is guessed
           from the directory layout, and falls back to "unknown" when no guess
           can be made.

           Args:
               path: Path to the odoc output directory
               mode: Operating mode - 'full' for full packages list, 'single' for a single package
               override_package_name: Optional override for package name
               override_package_version: Optional override for package version

           Returns:
               Tuple of (package_name, package_version)
           """
           package_name = override_package_name or "unknown"
           package_version = override_package_version or "unknown"

           # With both values supplied there is nothing left to infer, so the
           # filesystem is never touched.
           if override_package_name and override_package_version:
               return package_name, package_version

           parts = list(Path(path).resolve().parts)

           def _subdirectories(directory: str) -> List[str]:
               # Sorted so the result does not depend on os.listdir()'s arbitrary order.
               return sorted(
                   entry for entry in os.listdir(directory)
                   if os.path.isdir(os.path.join(directory, entry))
               )

           try:
               if mode == 'single':
                   if not override_package_name and parts:
                       # Default to the directory's own name ...
                       package_name = parts[-1]
                       # ... but prefer the first subdirectory when one exists.
                       subdirs = _subdirectories(path)
                       if subdirs:
                           package_name = subdirs[0]

               elif mode == 'full':
                   subdirs = _subdirectories(path)

                   if not override_package_name:
                       # Expected layout is <path>/<package-name>/<package-version>/...
                       # The first package directory that itself contains directories
                       # is reported; the version is refined per file by the caller,
                       # so only an explicit override is returned for it here.
                       for subdir in subdirs:
                           if _subdirectories(os.path.join(path, subdir)):
                               return subdir, package_version

                   # No package layout below `path`: treat `path` itself as
                   # .../<package-name>/<package-version>.
                   if len(parts) >= 3:
                       if not override_package_name:
                           package_name = parts[-2]
                       if not override_package_version:
                           package_version = parts[-1]
           except OSError as e:
               # Missing, unreadable or non-directory paths: keep whatever has been
               # determined so far instead of aborting the whole run.
               print(f"Error accessing directory {path}: {str(e)}")

           return package_name, package_version

     

       52
       114
        
       

     

       53
       115
        
       

     

       54
       116
        
       def parse_html_content(content: str) -> List[Dict[str, Any]]:

     
···

       154
       216
        
           Returns:

     

       155
       217
        
               List of dictionaries containing extracted information

     

       156
       218
        
           """

     

       157
       157
       -
           with open(file_path, 'r', encoding='utf-8') as f:

     

       219
       219
       +
           # Extract package and version from file path if not already properly set

     

       220
       220
       +
           if package_version == "unknown" or package_name == "unknown":

     

       221
       221
       +
               # Check if this file is in a test directory structure

     

       222
       222
       +
               file_path_parts = Path(file_path).resolve().parts

     

       223
       223
       +
               

     

       224
       224
       +
               # Look for test/package-name/version pattern in the path

     

       225
       225
       +
               for i, part in enumerate(file_path_parts):

     

       226
       226
       +
                   if part == "test" and i + 2 < len(file_path_parts):

     

       227
       227
       +
                       # We found a test directory, extract package name and version

     

       228
       228
       +
                       package_name = file_path_parts[i + 1]

     

       229
       229
       +
                       package_version = file_path_parts[i + 2]

     

       230
       230
       +
                       break

     

       231
       231
       +
           

     

       232
       232
       +
           try:

     

       233
       233
       +
               with open(file_path, 'r', encoding='utf-8') as f:

     

       234
       234
       +
                   try:

     

       235
       235
       +
                       data = json.load(f)

     

       236
       236
       +
                   except json.JSONDecodeError:

     

       237
       237
       +
                       print(f"Error decoding JSON from {file_path}")

     

       238
       238
       +
                       return []

     

       239
       239
       +
           except UnicodeDecodeError:

     

       240
       240
       +
               # Try opening with a different encoding or with errors='ignore'

     

       158
       241
        
               try:

     

       159
       159
       -
                   data = json.load(f)

     

       160
       160
       -
               except json.JSONDecodeError:

     

       161
       161
       -
                   print(f"Error decoding JSON from {file_path}")

     

       242
       242
       +
                   with open(file_path, 'r', encoding='latin-1') as f:

     

       243
       243
       +
                       try:

     

       244
       244
       +
                           data = json.load(f)

     

       245
       245
       +
                       except json.JSONDecodeError:

     

       246
       246
       +
                           print(f"Error decoding JSON from {file_path} with latin-1 encoding")

     

       247
       247
       +
                           return []

     

       248
       248
       +
               except Exception as e:

     

       249
       249
       +
                   print(f"Error reading {file_path}: {str(e)}")

     

       162
       250
        
                   return []

     

       163
       251
        
           

     

       164
       252
        
           if 'content' not in data:

     
···

       193
       281
        
           return items

     

       194
       282
        
       

     

       195
       283
        
       

     

       196
       196
       -
       def process_directory(directory: str) -> List[Dict[str, Any]]:

     

       284
       284
       +
       def process_directory(directory: str, mode: str = 'full',

     

       285
       285
       +
                            override_package_name: Optional[str] = None,

     

       286
       286
       +
                            override_package_version: Optional[str] = None) -> List[Dict[str, Any]]:

     

       197
       287
        
           """

     

       198
       288
        
           Process all JSON files in a directory recursively.

     

       199
       289
        
           

     

       200
       290
        
           Args:

     

       201
       291
        
               directory: Path to the directory containing odoc JSON files

     

       292
       292
       +
               mode: Operating mode - 'full' for full packages list, 'single' for a single package

     

       293
       293
       +
               override_package_name: Optional override for package name

     

       294
       294
       +
               override_package_version: Optional override for package version

     

       202
       295
        
               

     

       203
       296
        
           Returns:

     

       204
       297
        
               List of all extracted items from all files

     

       205
       298
        
           """

     

       206
       299
        
           all_items = []

     

       207
       207
       -
           package_name, package_version = extract_package_info(directory)

     

       300
       300
       +
           package_name, package_version = extract_package_info(

     

       301
       301
       +
               directory, 

     

       302
       302
       +
               mode=mode,

     

       303
       303
       +
               override_package_name=override_package_name,

     

       304
       304
       +
               override_package_version=override_package_version

     

       305
       305
       +
           )

     

       306
       306
       +
           

     

       307
       307
       +
           # First count total files to process for progress tracking

     

       308
       308
       +
           total_files = 0

     

       309
       309
       +
           for root, _, files in os.walk(directory):

     

       310
       310
       +
               for file in files:

     

       311
       311
       +
                   if file.endswith('.html.json'):

     

       312
       312
       +
                       total_files += 1

     

       313
       313
       +
           

     

       314
       314
       +
           if total_files == 0:

     

       315
       315
       +
               print(f"No .html.json files found in {directory}")

     

       316
       316
       +
               return all_items

     

       317
       317
       +
           

     

       318
       318
       +
           mode_str = f"single package mode" if mode == 'single' else "full packages mode"

     

       319
       319
       +
           print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")

     

       320
       320
       +
           

     

       321
       321
       +
           # Process each file with progress indicator

     

       322
       322
       +
           processed_files = 0

     

       323
       323
       +
           extracted_items = 0

     

       208
       324
        
           

     

       209
       325
        
           for root, _, files in os.walk(directory):

     

       210
       326
        
               for file in files:

     
···

       212
       328
        
                       file_path = os.path.join(root, file)

     

       213
       329
        
                       items = process_json_file(file_path, package_name, package_version)

     

       214
       330
        
                       all_items.extend(items)

     

       331
       331
       +
                       

     

       332
       332
       +
                       # Update progress

     

       333
       333
       +
                       processed_files += 1

     

       334
       334
       +
                       extracted_items += len(items)

     

       335
       335
       +
                       

     

       336
       336
       +
                       # Print progress every 100 files or on the last file

     

       337
       337
       +
                       if processed_files % 100 == 0 or processed_files == total_files:

     

       338
       338
       +
                           percent = (processed_files / total_files) * 100

     

       339
       339
       +
                           print(f"Progress: {processed_files}/{total_files} files ({percent:.1f}%) - {extracted_items} items extracted", 

     

       340
       340
       +
                                 end="\r", flush=True)

     

       215
       341
        
           

     

       342
       342
       +
           print(f"\nCompleted processing {processed_files} files - extracted {extracted_items} items total.")

     

       216
       343
        
           return all_items

     

       217
       344
        
       

     

       218
       345
        
       

     

       219
       346
        
       def main():
           """
           Main entry point for the script.

           Parses the command line, extracts records from every odoc JSON file
           under the input directory, and writes them to the output file as a
           single JSON array.

           Usage examples:

           # Process in full mode (multiple packages)
           python odoc2json.py /path/to/odoc/output output.json

           # Process a single package with automatic detection
           python odoc2json.py /path/to/odoc/package output.json --mode single

           # Process with explicit package name and version
           python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
           """
           parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
           parser.add_argument('input_dir', help='Directory containing odoc JSON output')
           parser.add_argument('output_file', help='Output JSON file path')
           parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
           parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
           parser.add_argument('--mode', choices=['full', 'single'], default='full',
                               help='Run mode: "full" for complete list of packages, "single" for a single package')
           parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
           parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
           args = parser.parse_args()

           start_time = time.time()
           print(f"Starting extraction from {args.input_dir} in {args.mode} mode")

           # --verbose used to be accepted and then ignored; it now reports the
           # effective configuration before any work starts.
           if args.verbose:
               print(f"Input directory: {os.path.abspath(args.input_dir)}")
               print(f"Package name override: {args.package_name or '(auto-detect)'}")
               print(f"Package version override: {args.package_version or '(auto-detect)'}")
               print(f"Pretty-print output: {args.pretty}")

           # Process all files in the directory
           items = process_directory(
               args.input_dir,
               mode=args.mode,
               override_package_name=args.package_name,
               override_package_version=args.package_version
           )

           # Write the output; indent=None is json.dump's compact default.
           print(f"Writing {len(items)} items to {args.output_file}...")
           with open(args.output_file, 'w', encoding='utf-8') as f:
               json.dump(items, f, indent=2 if args.pretty else None, ensure_ascii=False)

           elapsed_time = time.time() - start_time
           print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds")
           print(f"Output saved to {args.output_file}")

     

       237
       394
        
       

     

       238
       395
        
       

     

       239
       396
        
       if __name__ == "__main__":

+127 -27

odoc2llm.py

···

       26
       26
        
       

     

       27
       27
        
       def extract_module_info(json_content):

     

       28
       28
        
           """Extract module information from odoc JSON content."""

     

       29
       29
       -
           data = json.loads(json_content)

     

       29
       29
       +
           try:

     

       30
       30
       +
               data = json.loads(json_content)

     

       31
       31
       +
           except json.JSONDecodeError as e:

     

       32
       32
       +
               print(f"JSON decode error: {e}")

     

       33
       33
       +
               # Return a minimal structure that won't cause errors downstream

     

       34
       34
       +
               return {

     

       35
       35
       +
                   "name": "Unknown",

     

       36
       36
       +
                   "type": "Module",

     

       37
       37
       +
                   "breadcrumbs": [],

     

       38
       38
       +
                   "content": BeautifulSoup("", "html.parser"),

     

       39
       39
       +
                   "preamble": ""

     

       40
       40
       +
               }

     

       30
       41
        
           

     

       31
       42
        
           # Extract module name and type from header

     

       32
       43
        
           header = data.get("header", "")

     
···

       328
       339
        
           return "\n".join(md_lines)

     

       329
       340
        
       

     

       330
       341
        
       

     

       342
       342
       +
       def read_json_file(file_path):
           """
           Read a JSON file as text, tolerating encoding and I/O problems.

           The file is decoded as UTF-8 first and as latin-1 if that fails. The
           text is returned as-is; it is not parsed here.

           Args:
               file_path: Path to the JSON file

           Returns:
               Content of the JSON file as a string, or None if there was an error
           """
           # UTF-8 is tried first; latin-1 maps every byte to a character, so it
           # serves as the fallback that cannot fail on decoding.
           for encoding in ('utf-8', 'latin-1'):
               try:
                   with open(file_path, 'r', encoding=encoding) as f:
                       return f.read()
               except UnicodeDecodeError:
                   continue
               except OSError as e:
                   # Missing or unreadable files are reported, not raised, so the
                   # documented "None on error" contract holds on the first attempt too.
                   print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
                   return None
           return None

     

       364
       364
       +
       

     

       365
       365
       +
       

     

       331
       366
        
       def build_module_hierarchy(json_files, root_dir):

     

       332
       367
        
           """Build a hierarchical structure from all the JSON files."""

     

       333
       368
        
           hierarchy = defaultdict(list)

     
···

       340
       375
        
               if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:

     

       341
       376
        
                   # For index.html.json, check if it's a module documentation

     

       342
       377
        
                   if package_parts[-1] == "index.html.json" and len(package_parts) > 1:

     

       343
       343
       -
                       try:

     

       344
       344
       -
                           with open(json_file, 'r', encoding='utf-8') as f:

     

       345
       345
       -
                               json_content = f.read()

     

       346
       346
       -
                           

     

       347
       347
       -
                           # Try to parse the module info

     

       348
       348
       -
                           module_info = extract_module_info(json_content)

     

       349
       349
       -
                           signatures = parse_module_signature(module_info["content"])

     

       350
       350
       -
                           

     

       351
       351
       -
                           # Group by package/library

     

       352
       352
       -
                           if len(package_parts) > 1:

     

       353
       353
       -
                               package_name = package_parts[0]

     

       354
       354
       -
                               hierarchy[package_name].append({

     

       378
       378
       +
                       json_content = read_json_file(json_file)

     

       379
       379
       +
                       if json_content:

     

       380
       380
       +
                           try:

     

       381
       381
       +
                               # Try to parse the module info

     

       382
       382
       +
                               module_info = extract_module_info(json_content)

     

       383
       383
       +
                               signatures = parse_module_signature(module_info["content"])

     

       384
       384
       +
                               

     

       385
       385
       +
                               # Determine package name and version from path

     

       386
       386
       +
                               package_name, package_version = determine_package_info(json_file, package_parts, module_info)

     

       387
       387
       +
                               

     

       388
       388
       +
                               # Use package name and version for the hierarchy key

     

       389
       389
       +
                               package_key = f"{package_name}"

     

       390
       390
       +
                               if package_version != "unknown":

     

       391
       391
       +
                                   # Add version information to module_info for display in markdown

     

       392
       392
       +
                                   module_info["package_version"] = package_version

     

       393
       393
       +
                               

     

       394
       394
       +
                               hierarchy[package_key].append({

     

       355
       395
        
                                   "file": json_file,

     

       356
       396
        
                                   "module_info": module_info,

     

       357
       397
        
                                   "signatures": signatures,

     

       358
       398
        
                                   "path_parts": package_parts

     

       359
       399
        
                               })

     

       360
       360
       -
                       except Exception as e:

     

       361
       361
       -
                           print(f"Error processing {json_file}: {e}", file=sys.stderr)

     

       400
       400
       +
                           except Exception as e:

     

       401
       401
       +
                               print(f"Error processing {json_file}: {e}", file=sys.stderr)

     

       362
       402
        
                   

     

       363
       403
        
                   continue

     

       364
       404
        
               

     

       365
       405
        
               # Try to parse other JSON files (non-index.html.json)

     

       366
       366
       -
               try:

     

       367
       367
       -
                   with open(json_file, 'r', encoding='utf-8') as f:

     

       368
       368
       -
                       json_content = f.read()

     

       369
       369
       -
                   

     

       370
       370
       -
                   module_info = extract_module_info(json_content)

     

       371
       371
       -
                   signatures = parse_module_signature(module_info["content"])

     

       372
       372
       -
                   

     

       373
       373
       -
                   # Group by package/library

     

       374
       374
       -
                   if len(package_parts) > 1:

     

       375
       375
       -
                       package_name = package_parts[0]

     

       406
       406
       +
               json_content = read_json_file(json_file)

     

       407
       407
       +
               if json_content:

     

       408
       408
       +
                   try:

     

       409
       409
       +
                       module_info = extract_module_info(json_content)

     

       410
       410
       +
                       signatures = parse_module_signature(module_info["content"])

     

       411
       411
       +
                       

     

       412
       412
       +
                       # Determine package name from path

     

       413
       413
       +
                       package_name = determine_package_name(package_parts, module_info)

     

       414
       414
       +
                       

     

       376
       415
        
                       hierarchy[package_name].append({

     

       377
       416
        
                           "file": json_file,

     

       378
       417
        
                           "module_info": module_info,

     

       379
       418
        
                           "signatures": signatures,

     

       380
       419
        
                           "path_parts": package_parts

     

       381
       420
        
                       })

     

       382
       382
       -
               except Exception as e:

     

       383
       383
       -
                   print(f"Error processing {json_file}: {e}", file=sys.stderr)

     

       421
       421
       +
                   except Exception as e:

     

       422
       422
       +
                       print(f"Error processing {json_file}: {e}", file=sys.stderr)

     

       384
       423
        
           

     

       385
       424
        
           return hierarchy

     

       386
       425
        
       

     

       387
       426
        
       

     

       427
       427
       +
       def determine_package_info(file_path, path_parts, module_info):
           """
           Determine package name and version from file path and module info.

           The package name is taken from the first source below that yields one,
           except that a ``test/<package>/<version>/`` directory pattern found in
           the resolved file path always overrides a name read from breadcrumbs:

           1. a breadcrumb of the form ``Library <name>`` (the last such
              breadcrumb wins),
           2. the ``test/<package>/<version>/...`` pattern in ``file_path``,
           3. the first element of ``path_parts``,
           4. the module's own ``name``.

           The version is only ever taken from the ``test/<package>/<version>/``
           pattern. The final path component is treated as the file name and is
           never reported as a version.

           Args:
               file_path: The full file path (a file, not a directory)
               path_parts: Parts of the path
               module_info: Extracted module information; the optional
                   "breadcrumbs" and "name" entries are read

           Returns:
               Tuple of (package_name, package_version); each is "unknown" when
               it cannot be determined.
           """
           package_name = "unknown"
           package_version = "unknown"

           # Try to extract from breadcrumbs if available. The regex only matches
           # crumbs containing "Library", so no separate substring test is needed.
           for crumb in module_info.get("breadcrumbs") or []:
               match = re.search(r'Library\s+(.+)', crumb)
               if match:
                   package_name = match.group(1).strip()

           # Look for test/package-name/version pattern in the path
           file_path_parts = Path(file_path).resolve().parts
           file_name_index = len(file_path_parts) - 1
           for i, part in enumerate(file_path_parts):
               if part == "test" and i + 2 <= file_name_index:
                   # We found a test directory, extract package name and version
                   package_name = file_path_parts[i + 1]
                   # Only a directory can be a version; when the component after
                   # the package is the file itself, leave the version unknown.
                   if i + 2 < file_name_index:
                       package_version = file_path_parts[i + 2]
                   break

           # If still unknown, fall back to using the first part of the path
           if package_name == "unknown" and path_parts:
               package_name = path_parts[0]

           # Last resort - use module name or "unknown"
           if package_name == "unknown":
               package_name = module_info.get("name") or "unknown"

           return package_name, package_version

     

       469
       469
       +
       

     

       470
       470
       +
       

     

       388
       471
        
       def sort_modules_hierarchically(modules):

     

       389
       472
        
           """Sort modules to ensure proper hierarchical presentation."""

     

       390
       473
        
           # First sort by breadcrumb length (shorter = higher in hierarchy)

     
···

       414
       497
        
       

     

       415
       498
        
       

     

       416
       499
        
       def main():

     

       500
       500
       +
           """

     

       501
       501
       +
           Main entry point for the script.

     

       502
       502
       +
           

     

       503
       503
       +
           Usage examples:

     

       504
       504
       +
           

     

       505
       505
       +
           # Process all packages in a directory

     

       506
       506
       +
           python odoc2llm.py /path/to/odoc/output

     

       507
       507
       +
           

     

       508
       508
       +
           # Process all packages and specify output file

     

       509
       509
       +
           python odoc2llm.py /path/to/odoc/output --output documentation.md

     

       510
       510
       +
           

     

       511
       511
       +
           # Process a specific package only

     

       512
       512
       +
           python odoc2llm.py /path/to/odoc/output --package package-name

     

       513
       513
       +
           

     

       514
       514
       +
           # Enable verbose output

     

       515
       515
       +
           python odoc2llm.py /path/to/odoc/output --verbose

     

       516
       516
       +
           """

     

       417
       517
        
           parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')

     

       418
       518
        
           parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')

     

       419
       519
        
           parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')