···
23
+
import multiprocessing as mp
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
28
+
from functools import partial
def extract_package_info(path: str, mode: str = 'full',
···
286
+
def worker_process_files(file_batch: List[str], package_name: str,
                         package_version: str) -> List[Dict[str, Any]]:
    """
    Worker function to process a batch of files in parallel.

    Runs in a worker process via multiprocessing.Pool; must stay
    picklable (top-level function, picklable arguments).

    Args:
        file_batch: List of file paths to process
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of all extracted items from all files in the batch
    """
    batch_items: List[Dict[str, Any]] = []
    for file_path in file_batch:
        items = process_json_file(file_path, package_name, package_version)
        batch_items.extend(items)
    return batch_items
def collect_json_files(directory: str) -> List[str]:
    """
    Collect all JSON files in a directory recursively.

    Only files produced by odoc's HTML-to-JSON step (suffix
    ``.html.json``) are collected; other files are ignored.

    Args:
        directory: Path to the directory to search

    Returns:
        List of full paths to every ``.html.json`` file found
    """
    json_files: List[str] = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))
    return json_files
def process_directory(directory: str, mode: str = 'full',
                      override_package_name: Optional[str] = None,
                      override_package_version: Optional[str] = None,
                      num_workers: int = 1) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively using multiple processes.

    Args:
        directory: Path to the directory containing odoc JSON files
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version
        num_workers: Number of worker processes to use (1 disables the pool)

    Returns:
        List of all extracted items from all files
    """
    # NOTE(review): positional args reconstructed from extract_package_info's
    # (path, mode, ...) signature — confirm against the original call site.
    package_name, package_version = extract_package_info(
        directory, mode,
        override_package_name=override_package_name,
        override_package_version=override_package_version
    )

    # Collect all JSON files up front so we know the total for batching.
    json_files = collect_json_files(directory)
    total_files = len(json_files)

    if total_files == 0:
        print(f"No .html.json files found in {directory}")
        return []

    mode_str = "single package mode" if mode == 'single' else "full packages mode"
    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
    print(f"Using {num_workers} worker processes")

    # Split files into batches for workers. Ceiling division guarantees at
    # most num_workers batches (floor division could produce one extra,
    # e.g. 10 files / 3 workers -> batch_size 3 -> 4 batches).
    batch_size = max(1, -(-total_files // num_workers))
    batches = [json_files[i:i + batch_size]
               for i in range(0, total_files, batch_size)]

    # Create partial function with fixed package name and version so only
    # the batch itself varies per task (and the callable stays picklable).
    process_batch = partial(worker_process_files, package_name=package_name,
                            package_version=package_version)

    # Process batches in parallel
    start_time = time.time()
    all_items: List[Dict[str, Any]] = []
    if num_workers > 1:
        # Use multiprocessing Pool; pool.map preserves batch order.
        with mp.Pool(processes=num_workers) as pool:
            results = pool.map(process_batch, batches)
        for batch_result in results:
            all_items.extend(batch_result)
    else:
        # Single process mode: run everything in this process, no pool.
        all_items = process_batch(json_files)

    elapsed_time = time.time() - start_time
    print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds")
    print(f"Extracted {len(all_items)} items total")
    return all_items
···
# Process with explicit package name and version
python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
405
+
# Process with multiple cores
406
+
python odoc2json.py /path/to/odoc/output output.json --workers 8
parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
parser.add_argument('input_dir', help='Directory containing odoc JSON output')
···
help='Run mode: "full" for complete list of packages, "single" for a single package')
parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
417
+
parser.add_argument('--workers', type=int, default=mp.cpu_count(),
418
+
help=f'Number of worker processes (default: {mp.cpu_count()})')
args = parser.parse_args()
print(f"Starting extraction from {args.input_dir} in {args.mode} mode")
375
-
# Process all files in the directory
424
+
# Process all files in the directory with multiple workers
items = process_directory(
override_package_name=args.package_name,
380
-
override_package_version=args.package_version
429
+
override_package_version=args.package_version,
430
+
num_workers=args.workers
···
if __name__ == "__main__":