···
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional, Tuple
28
-
def extract_package_info(path: str, mode: str = 'full',
                         override_package_name: Optional[str] = None,
                         override_package_version: Optional[str] = None) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    Explicit overrides always win. Anything that is not overridden is
    inferred from the directory layout and falls back to "unknown".

    Args:
        path: Path to the odoc output directory
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version

    Returns:
        Tuple of (package_name, package_version)
    """
    # Always prioritize explicit overrides if provided
    package_name = override_package_name or "unknown"
    package_version = override_package_version or "unknown"

    # If we have both overrides, no need to analyze path
    if override_package_name and override_package_version:
        return package_name, package_version

    # Use Path for more reliable path parsing. The filesystem anchor ("/" or a
    # drive root) is dropped so it can never be mistaken for a package name.
    resolved = Path(path).resolve()
    parts = [part for part in resolved.parts if part != resolved.anchor]

    try:
        if mode == 'single':
            # In single package mode a subdirectory of the given directory is
            # preferred as the package name; otherwise the directory's own
            # name is used. An explicit name override is never replaced.
            if not override_package_name:
                subdirs = _list_subdirs(path)
                if subdirs:
                    package_name = subdirs[0]
                elif parts:
                    package_name = parts[-1]

        elif mode == 'full':
            # In full mode the expected layout is <path>/package-name/package-version/.
            if not override_package_name:
                for subdir in _list_subdirs(path):
                    # A subdirectory that itself contains directories looks
                    # like a package holding version directories. The version
                    # is not known at this level, so only an explicit
                    # override (or "unknown") is reported for it.
                    if _list_subdirs(os.path.join(path, subdir)):
                        return subdir, package_version

            # No package structure below `path`: treat the path itself as
            # .../package-name/package-version.
            if len(parts) >= 2:
                if not override_package_name:
                    package_name = parts[-2]
                if not override_package_version:
                    package_version = parts[-1]
            elif len(parts) == 1 and not override_package_name:
                # If only one component, assume it's the package name
                package_name = parts[0]
    except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
        # Handle cases where we can't access the directory
        print(f"Error accessing directory {path}: {e}")

    return package_name, package_version


def _list_subdirs(directory: str) -> List[str]:
    """Return the names of the immediate subdirectories of *directory*, sorted."""
    return sorted(
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )
def parse_html_content(content: str) -> List[Dict[str, Any]]:
···
List of dictionaries containing extracted information
157
-
with open(file_path, 'r', encoding='utf-8') as f:
219
+
# Extract package and version from file path if not already properly set
220
+
if package_version == "unknown" or package_name == "unknown":
221
+
# Check if this file is in a test directory structure
222
+
file_path_parts = Path(file_path).resolve().parts
224
+
# Look for test/package-name/version pattern in the path
225
+
for i, part in enumerate(file_path_parts):
226
+
if part == "test" and i + 2 < len(file_path_parts):
227
+
# We found a test directory, extract package name and version
228
+
package_name = file_path_parts[i + 1]
229
+
package_version = file_path_parts[i + 2]
233
+
with open(file_path, 'r', encoding='utf-8') as f:
235
+
data = json.load(f)
236
+
except json.JSONDecodeError:
237
+
print(f"Error decoding JSON from {file_path}")
239
+
except UnicodeDecodeError:
240
+
# Try opening with a different encoding or with errors='ignore'
159
-
data = json.load(f)
160
-
except json.JSONDecodeError:
161
-
print(f"Error decoding JSON from {file_path}")
242
+
with open(file_path, 'r', encoding='latin-1') as f:
244
+
data = json.load(f)
245
+
except json.JSONDecodeError:
246
+
print(f"Error decoding JSON from {file_path} with latin-1 encoding")
248
+
except Exception as e:
249
+
print(f"Error reading {file_path}: {str(e)}")
if 'content' not in data:
···
196
-
def process_directory(directory: str, mode: str = 'full',
                      override_package_name: Optional[str] = None,
                      override_package_version: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively.

    Args:
        directory: Path to the directory containing odoc JSON files
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version

    Returns:
        List of all extracted items from all files
    """
    package_name, package_version = extract_package_info(
        directory,
        mode=mode,
        override_package_name=override_package_name,
        override_package_version=override_package_version
    )

    # Collect the matching files in a single walk so the total used for the
    # progress display is exactly the set of files that gets processed.
    json_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith('.html.json')
    ]
    total_files = len(json_files)

    if total_files == 0:
        print(f"No .html.json files found in {directory}")
        return []

    mode_str = "single package mode" if mode == 'single' else "full packages mode"
    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")

    # Process each file with progress indicator
    all_items: List[Dict[str, Any]] = []
    processed_files = 0
    for processed_files, file_path in enumerate(json_files, start=1):
        # NOTE(review): process_json_file is defined elsewhere in this file and
        # is assumed to return a list of item dicts - confirm.
        items = process_json_file(file_path, package_name, package_version)
        all_items.extend(items)

        # Print progress every 100 files or on the last file
        if processed_files % 100 == 0 or processed_files == total_files:
            percent = (processed_files / total_files) * 100
            print(f"Progress: {processed_files}/{total_files} files ({percent:.1f}%) - {len(all_items)} items extracted",
                  end="\r", flush=True)

    print(f"\nCompleted processing {processed_files} files - extracted {len(all_items)} items total.")

    return all_items
348
+
Main entry point for the script.
352
+
# Process in full mode (multiple packages)
353
+
python odoc2json.py /path/to/odoc/output output.json
355
+
# Process a single package with automatic detection
356
+
python odoc2json.py /path/to/odoc/package output.json --mode single
358
+
# Process with explicit package name and version
359
+
python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
parser.add_argument('input_dir', help='Directory containing odoc JSON output')
parser.add_argument('output_file', help='Output JSON file path')
parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
365
+
parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
366
+
parser.add_argument('--mode', choices=['full', 'single'], default='full',
367
+
help='Run mode: "full" for complete list of packages, "single" for a single package')
368
+
parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
369
+
parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
args = parser.parse_args()
372
+
start_time = time.time()
373
+
print(f"Starting extraction from {args.input_dir} in {args.mode} mode")
# Process all files in the directory
227
-
items = process_directory(args.input_dir)
376
+
items = process_directory(
379
+
override_package_name=args.package_name,
380
+
override_package_version=args.package_version
384
+
print(f"Writing {len(items)} items to {args.output_file}...")
with open(args.output_file, 'w', encoding='utf-8') as f:
json.dump(items, f, indent=2, ensure_ascii=False)
json.dump(items, f, ensure_ascii=False)
236
-
print(f"Processed {len(items)} items and saved to {args.output_file}")
391
+
elapsed_time = time.time() - start_time
392
+
print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds")
393
+
print(f"Output saved to {args.output_file}")
if __name__ == "__main__":