···
import argparse
import multiprocessing as mp
import os
import time
from functools import partial
from typing import Dict, List, Any, Optional, Tuple

from bs4 import BeautifulSoup
def extract_package_info(path: str, mode: str = 'full',
···

def worker_process_files(file_batch: List[str], package_name: str,
                         package_version: str) -> List[Dict[str, Any]]:
    """Worker function to process a batch of files in parallel.

    Args:
        file_batch: List of files to process
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of all extracted items from all files in the batch
    """
    batch_items = []
    for file_path in file_batch:
        items = process_json_file(file_path, package_name, package_version)
        batch_items.extend(items)
    return batch_items
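
# Note: worker_process_files is deliberately a module-level function rather
# than a closure or lambda, since multiprocessing must pickle the callable
# when dispatching batches to pool workers (see process_directory below).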

def collect_json_files(directory: str) -> List[str]:
    """Collect all JSON files in a directory recursively.

    Args:
        directory: Path to the directory to search

    Returns:
        List of paths to the .html.json files that were found
    """
    json_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))
    return json_files
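
# The '.html.json' suffix matches the per-page JSON files this script expects
# from odoc's JSON output; if your odoc version names these files differently,
# adjust the filter above accordingly.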

def process_directory(directory: str, mode: str = 'full',
                      override_package_name: Optional[str] = None,
                      override_package_version: Optional[str] = None,
                      num_workers: int = 1) -> List[Dict[str, Any]]:
    """Process all JSON files in a directory recursively using multiple processes.

    Args:
        directory: Path to the directory containing odoc JSON files
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version
        num_workers: Number of worker processes to use

    Returns:
        List of all extracted items from all files
    """
    package_name, package_version = extract_package_info(
···
        override_package_version=override_package_version
    )

    # Collect all JSON files
    json_files = collect_json_files(directory)
    total_files = len(json_files)
    if total_files == 0:
        print(f"No .html.json files found in {directory}")
        return []

    mode_str = "single package mode" if mode == 'single' else "full packages mode"
    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
    print(f"Using {num_workers} worker processes")

    # Split files into batches for workers
    batch_size = max(1, total_files // num_workers)
    batches = []
    for i in range(0, total_files, batch_size):
        batches.append(json_files[i:i + batch_size])
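
    # Floor division can yield more than num_workers batches (e.g. 10 files
    # across 4 workers -> batch_size 2 -> 5 batches); Pool.map simply queues
    # the extras, so every file is still processed exactly once.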

    # Create partial function with fixed package name and version
    process_batch = partial(worker_process_files,
                            package_name=package_name,
                            package_version=package_version)

    # Process batches in parallel
    start_time = time.time()
    all_items = []
    if num_workers > 1:
        # Use multiprocessing Pool
        with mp.Pool(processes=num_workers) as pool:
            # Submit all batches to the pool
            results = pool.map(process_batch, batches)
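        # pool.map blocks until every batch has finished and returns the
        # per-batch result lists in submission order, so the flattened
        # output below is deterministic.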
        for batch_result in results:
            all_items.extend(batch_result)
    else:
        # Single worker requested: process all files serially in this process
        all_items = process_batch(json_files)

    elapsed_time = time.time() - start_time
    print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds")
    print(f"Extracted {len(all_items)} items total")
    return all_items
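
# A minimal usage sketch (hypothetical paths), e.g. from a REPL:
#   items = process_directory('/path/to/odoc/output', mode='full', num_workers=4)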
···
# Process with explicit package name and version
python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0

# Process with multiple cores
python odoc2json.py /path/to/odoc/output output.json --workers 8
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
···
                        help='Run mode: "full" for complete list of packages, "single" for a single package')
    parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
    parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
    parser.add_argument('--workers', type=int, default=mp.cpu_count(),
                        help=f'Number of worker processes (default: {mp.cpu_count()})')
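    # The default uses every core reported by cpu_count(); pass --workers 1
    # to force the serial fallback path inside process_directory.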
    args = parser.parse_args()

    print(f"Starting extraction from {args.input_dir} in {args.mode} mode")

    # Process all files in the directory with multiple workers
    items = process_directory(
        args.input_dir,
        mode=args.mode,
        override_package_name=args.package_name,
        override_package_version=args.package_version,
        num_workers=args.workers
    )
···
if __name__ == "__main__":
    main()  # assumes the CLI block above is wrapped in a main() entry point