···
+import multiprocessing as mp
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
+from functools import partial

-def extract_package_info(path: str) -> Tuple[str, str]:
+def extract_package_info(path: str, mode: str = 'full',
+                         override_package_name: Optional[str] = None,
+                         override_package_version: Optional[str] = None) -> Tuple[str, str]:
    Extract package name and version from the path.

        path: Path to the odoc output directory
+        mode: Operating mode - 'full' for full packages list, 'single' for a single package
+        override_package_name: Optional override for package name
+        override_package_version: Optional override for package version

        Tuple of (package_name, package_version)
+    # Always prioritize explicit overrides if provided
+    if override_package_name:
+        package_name = override_package_name
+    else:
+        package_name = "unknown"
+
+    if override_package_version:
+        package_version = override_package_version
+    else:
+        package_version = "unknown"
+
+    # If we have both overrides, no need to analyze path
+    if override_package_name and override_package_version:
+        return package_name, package_version
    # Use Path for more reliable path parsing
-    # If the path is in the format ".../package_name/version/..."
-    if len(parts) >= 2:
-        # The package name is typically the second-to-last component
-        # The version is typically the last component
-        return parts[-2], parts[-1]
-    elif len(parts) == 1:
-        # If only one component, assume it's the package name
-        return parts[0], "unknown"
-
-    return "unknown", "unknown"
+    try:
+        if mode == 'single':
+            # In single package mode, the package name is typically the directory name
+            if not override_package_name and parts:
+                # Extract package name from the last part of the path
+                package_name = parts[-1]
+
+                # Check if there's a subdirectory in the path that seems like a package name
+                subdir = next((d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))), None)
+                if subdir:
+                    package_name = subdir
+
+        elif mode == 'full':
+            # In full mode, we need to look at the directory structure more carefully
+            # For test/ directory, the structure is test/package-name/package-version/
+
+            # First, check if the directory structure matches the expected pattern
+            # Look for subdirectories in the current path
+            subdirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
+
+            # If we have subdirectories that might be package names
+            if subdirs and not override_package_name:
+                # For each subdirectory (potential package name), check if it contains version subdirectories
+                for subdir in subdirs:
+                    version_dirs = [d for d in os.listdir(os.path.join(path, subdir))
+                                    if os.path.isdir(os.path.join(path, subdir, d))]
+
+                    # If this subdirectory contains potential version directories, it's likely a package
+                    if version_dirs:
+                        # We'll use the current file's path to determine which package and version it belongs to
+                        # We're processing files at the specific file level elsewhere, so here we just return
+                        # default values which will be overridden during actual file processing
+                        return subdir, "unknown"
+
+            # If we found no package structure or we're processing a file already in a package context
+            # In this case, we'll determine package/version from the path of the file being processed
+            if len(parts) >= 3:
+                # Path structure might be test/package-name/version/...
+                # Check if the first part is "test"
+                if parts[-3] == "test" or "test" in str(p):
+                    package_name = parts[-2] if not override_package_name else package_name
+                    package_version = parts[-1] if not override_package_version else package_version
+                else:
+                    # Standard structure: .../package-name/package-version/...
+                    package_name = parts[-2] if not override_package_name else package_name
+                    package_version = parts[-1] if not override_package_version else package_version
+    except (FileNotFoundError, PermissionError) as e:
+        # Handle cases where we can't access the directory
+        print(f"Error accessing directory {path}: {str(e)}")
+
+    return package_name, package_version
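
For orientation, a rough sketch of how the reworked extract_package_info is expected to behave; the path and version below are invented for illustration, and the un-overridden result depends on what the directory scan actually finds on disk:

    # Hypothetical call on a test/<package>/<version> layout; names are made up.
    name, version = extract_package_info("test/foo/1.2.3", mode='full')
    # Likely ("foo", "unknown") from the directory scan, or ("foo", "1.2.3")
    # from the path fallback, depending on the on-disk structure.

    # Explicit overrides short-circuit the path analysis entirely.
    name, version = extract_package_info("test/foo/1.2.3", mode='full',
                                          override_package_name="foo",
                                          override_package_version="1.2.3")
    # -> ("foo", "1.2.3")
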
def parse_html_content(content: str) -> List[Dict[str, Any]]:
···
        List of dictionaries containing extracted information
-        with open(file_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-    except json.JSONDecodeError:
-        print(f"Error decoding JSON from {file_path}")
+    # Extract package and version from file path if not already properly set
+    if package_version == "unknown" or package_name == "unknown":
+        # Check if this file is in a test directory structure
+        file_path_parts = Path(file_path).resolve().parts
+
+        # Look for test/package-name/version pattern in the path
+        for i, part in enumerate(file_path_parts):
+            if part == "test" and i + 2 < len(file_path_parts):
+                # We found a test directory, extract package name and version
+                package_name = file_path_parts[i + 1]
+                package_version = file_path_parts[i + 2]
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except json.JSONDecodeError:
+        print(f"Error decoding JSON from {file_path}")
+    except UnicodeDecodeError:
+        # Try opening with a different encoding or with errors='ignore'
+        try:
+            with open(file_path, 'r', encoding='latin-1') as f:
+                data = json.load(f)
+        except json.JSONDecodeError:
+            print(f"Error decoding JSON from {file_path} with latin-1 encoding")
+    except Exception as e:
+        print(f"Error reading {file_path}: {str(e)}")

    if 'content' not in data:
···
-def process_directory(directory: str) -> List[Dict[str, Any]]:
+def worker_process_files(file_batch, package_name, package_version):
+    """
+    Worker function to process a batch of files in parallel.
+
+        file_batch: List of files to process
+        package_name: Name of the package
+        package_version: Version of the package
+
+        List of all extracted items from all files in the batch
+    """
+    batch_items = []
+    for file_path in file_batch:
+        items = process_json_file(file_path, package_name, package_version)
+        batch_items.extend(items)
+    return batch_items
+
+
+def collect_json_files(directory):
+    """
+    Collect all JSON files in a directory recursively.
+
+        directory: Path to the directory to search
+    """
+    json_files = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.html.json'):
+                json_files.append(os.path.join(root, file))
+    return json_files
+
+
+def process_directory(directory: str, mode: str = 'full',
+                      override_package_name: Optional[str] = None,
+                      override_package_version: Optional[str] = None,
+                      num_workers: int = 1) -> List[Dict[str, Any]]:
    """
-    Process all JSON files in a directory recursively.
+    Process all JSON files in a directory recursively using multiple processes.

        directory: Path to the directory containing odoc JSON files
+        mode: Operating mode - 'full' for full packages list, 'single' for a single package
+        override_package_name: Optional override for package name
+        override_package_version: Optional override for package version
+        num_workers: Number of worker processes to use

        List of all extracted items from all files
    """
+    package_name, package_version = extract_package_info(
+        directory,
+        mode=mode,
+        override_package_name=override_package_name,
+        override_package_version=override_package_version
+    )
+
+    # Collect all JSON files
+    json_files = collect_json_files(directory)
+    total_files = len(json_files)
+
+    if total_files == 0:
+        print(f"No .html.json files found in {directory}")
+        return []
+
+    mode_str = "single package mode" if mode == 'single' else "full packages mode"
+    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
+    print(f"Using {num_workers} worker processes")
+
+    # Split files into batches for workers
+    batches = []
+    batch_size = max(1, total_files // num_workers)
+    for i in range(0, total_files, batch_size):
+        batches.append(json_files[i:i + batch_size])
+
+    # Create partial function with fixed package name and version
+    process_batch = partial(worker_process_files, package_name=package_name, package_version=package_version)
+
+    # Process batches in parallel
+    start_time = time.time()
-    package_name, package_version = extract_package_info(directory)
-    for root, _, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.html.json'):
-                file_path = os.path.join(root, file)
-                items = process_json_file(file_path, package_name, package_version)
-                all_items.extend(items)
+    all_items = []
+    if num_workers > 1:
+        # Use multiprocessing Pool
+        with mp.Pool(processes=num_workers) as pool:
+            # Submit all batches to the pool
+            results = pool.map(process_batch, batches)
+            # Collect all results
+            for batch_result in results:
+                all_items.extend(batch_result)
+    else:
+        # Single process mode
+        all_items = process_batch(json_files)
+
+    elapsed_time = time.time() - start_time
+    print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds")
+    print(f"Extracted {len(all_items)} items total")
+    """
+    Main entry point for the script.
+
+    # Process in full mode (multiple packages)
+    python odoc2json.py /path/to/odoc/output output.json
+
+    # Process a single package with automatic detection
+    python odoc2json.py /path/to/odoc/package output.json --mode single
+
+    # Process with explicit package name and version
+    python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
+
+    # Process with multiple cores
+    python odoc2json.py /path/to/odoc/output output.json --workers 8
+    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
+    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
+    parser.add_argument('--mode', choices=['full', 'single'], default='full',
+                        help='Run mode: "full" for complete list of packages, "single" for a single package')
+    parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
+    parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
+    parser.add_argument('--workers', type=int, default=mp.cpu_count(),
+                        help=f'Number of worker processes (default: {mp.cpu_count()})')
    args = parser.parse_args()
-    # Process all files in the directory
-    items = process_directory(args.input_dir)
+    start_time = time.time()
+    print(f"Starting extraction from {args.input_dir} in {args.mode} mode")
+
+    # Process all files in the directory with multiple workers
+    items = process_directory(
+        args.input_dir,
+        mode=args.mode,
+        override_package_name=args.package_name,
+        override_package_version=args.package_version,
+        num_workers=args.workers
+    )
+
+    print(f"Writing {len(items)} items to {args.output_file}...")
    with open(args.output_file, 'w', encoding='utf-8') as f:
        if args.pretty:
            json.dump(items, f, indent=2, ensure_ascii=False)
        else:
            json.dump(items, f, ensure_ascii=False)
-    print(f"Processed {len(items)} items and saved to {args.output_file}")
+    elapsed_time = time.time() - start_time
+    print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds")
+    print(f"Output saved to {args.output_file}")
if __name__ == "__main__":