···
23
+
import multiprocessing as mp
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
28
+
from functools import partial
def extract_package_info(path: str, mode: str = 'full',
···
286
+
def worker_process_files(file_batch: List[str], package_name: str,
                         package_version: str) -> List[Dict[str, Any]]:
    """
    Worker function to process a batch of files in parallel.

    Runs in a worker process via multiprocessing.Pool; must stay
    picklable (top-level function, picklable arguments).

    Args:
        file_batch: List of file paths to process
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of all extracted items from all files in the batch
    """
    batch_items: List[Dict[str, Any]] = []
    for file_path in file_batch:
        items = process_json_file(file_path, package_name, package_version)
        batch_items.extend(items)
    return batch_items
def collect_json_files(directory: str) -> List[str]:
    """
    Collect all JSON files in a directory recursively.

    Only files produced by odoc's HTML-to-JSON step (suffix
    ``.html.json``) are collected; other files are ignored.

    Args:
        directory: Path to the directory to search

    Returns:
        List of full paths to every ``.html.json`` file found
    """
    json_files: List[str] = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html.json'):
                json_files.append(os.path.join(root, file))
    return json_files
def process_directory(directory: str, mode: str = 'full',
                      override_package_name: Optional[str] = None,
                      override_package_version: Optional[str] = None,
                      num_workers: int = 1) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively using multiple processes.

    Args:
        directory: Path to the directory containing odoc JSON files
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version
        num_workers: Number of worker processes to use (1 disables the pool)

    Returns:
        List of all extracted items from all files
    """
    # NOTE(review): positional args reconstructed from extract_package_info's
    # (path, mode, ...) signature — confirm against the original call site.
    package_name, package_version = extract_package_info(
        directory, mode,
        override_package_name=override_package_name,
        override_package_version=override_package_version
    )

    # Collect all JSON files up front so we know the total for batching.
    json_files = collect_json_files(directory)
    total_files = len(json_files)

    if total_files == 0:
        print(f"No .html.json files found in {directory}")
        return []

    mode_str = "single package mode" if mode == 'single' else "full packages mode"
    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
    print(f"Using {num_workers} worker processes")

    # Split files into batches for workers. Ceiling division guarantees at
    # most num_workers batches (floor division could produce one extra,
    # e.g. 10 files / 3 workers -> batch_size 3 -> 4 batches).
    batch_size = max(1, -(-total_files // num_workers))
    batches = [json_files[i:i + batch_size]
               for i in range(0, total_files, batch_size)]

    # Create partial function with fixed package name and version so only
    # the batch itself varies per task (and the callable stays picklable).
    process_batch = partial(worker_process_files, package_name=package_name,
                            package_version=package_version)

    # Process batches in parallel
    start_time = time.time()
    all_items: List[Dict[str, Any]] = []
    if num_workers > 1:
        # Use multiprocessing Pool; pool.map preserves batch order.
        with mp.Pool(processes=num_workers) as pool:
            results = pool.map(process_batch, batches)
        for batch_result in results:
            all_items.extend(batch_result)
    else:
        # Single process mode: run everything in this process, no pool.
        all_items = process_batch(json_files)

    elapsed_time = time.time() - start_time
    print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds")
    print(f"Extracted {len(all_items)} items total")
    return all_items
···
# Process with explicit package name and version
python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
405
+
# Process with multiple cores
406
+
python odoc2json.py /path/to/odoc/output output.json --workers 8
parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
parser.add_argument('input_dir', help='Directory containing odoc JSON output')
···
help='Run mode: "full" for complete list of packages, "single" for a single package')
parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
417
+
parser.add_argument('--workers', type=int, default=mp.cpu_count(),
418
+
help=f'Number of worker processes (default: {mp.cpu_count()})')
args = parser.parse_args()
print(f"Starting extraction from {args.input_dir} in {args.mode} mode")
375
-
# Process all files in the directory
424
+
# Process all files in the directory with multiple workers
items = process_directory(
override_package_name=args.package_name,
380
-
override_package_version=args.package_version
429
+
override_package_version=args.package_version,
430
+
num_workers=args.workers
···
if __name__ == "__main__":