My agentic slop goes here. Not intended for anyone else!
at main 9.3 kB view raw
open Toru
open Cmdliner

(* Command line arguments *)

(** [-d]/[--dir]: directory that is searched for registry files. A leading
    [~] is expanded by {!expand_tilde} before use. *)
let registry_dir =
  let doc = "Directory containing tessera-manifests registry files" in
  Arg.(value & opt string "~/src/git/ucam-eo/tessera-manifests/registry" &
       info ["d"; "dir"] ~docv:"DIR" ~doc)

(** [-p]/[--pattern]: glob handed to [find -name]; ["*"] means "no extra
    filter". *)
let pattern =
  let doc = "File pattern to match (e.g., '*2024*' for 2024 files only)" in
  Arg.(value & opt string "*" & info ["p"; "pattern"] ~docv:"PATTERN" ~doc)

(** [-v]/[--verbose]: print per-file details instead of a progress bar. *)
let verbose =
  let doc = "Show verbose output including file details" in
  Arg.(value & flag & info ["v"; "verbose"] ~doc)

(** [-l]/[--limit]: maximum number of files to process; [0] disables the
    limit. *)
let limit =
  let doc = "Limit number of files to process (0 = no limit)" in
  Arg.(value & opt int 0 & info ["l"; "limit"] ~docv:"N" ~doc)

(* Helper functions *)

(** [take n lst] returns the first [n] elements of [lst] (or all of [lst] if
    it is shorter). Tail-recursive. Returns [[]] when [n <= 0]; every caller
    in this file passes a non-negative [n]. *)
let take n lst =
  let rec go remaining acc = function
    | [] -> List.rev acc
    | x :: xs ->
      if remaining <= 0 then List.rev acc
      else go (remaining - 1) (x :: acc) xs
  in
  go n [] lst

(** [expand_tilde path] replaces a leading ["~"] or ["~/"] with the value of
    the [HOME] environment variable. Any other path, and every path when
    [HOME] is unset, is returned unchanged (the original raised [Not_found]
    in that case). *)
let expand_tilde path =
  match Sys.getenv_opt "HOME" with
  | None -> path
  | Some home ->
    if String.equal path "~" then home
    else if String.starts_with ~prefix:"~/" path then
      Filename.concat home (String.sub path 2 (String.length path - 2))
    else path

(** [find_registry_files dir pattern limit] runs the external [find] command
    under [dir] (after tilde expansion) and returns the paths of regular
    files named [*.txt] that also match [pattern], one list element per
    output line.

    When [limit > 0] the output is cut to the first [limit] lines with
    [head]; when [limit <= 0] every result is returned (the original piped
    through [head -9999], silently dropping anything beyond that).

    Both [dir] and [pattern] are shell-quoted with [Filename.quote]. *)
let find_registry_files dir pattern limit =
  let quoted_dir = Filename.quote (expand_tilde dir) in
  let name_filter =
    if String.equal pattern "*" then "-name '*.txt'"
    else Printf.sprintf "-name '*.txt' -name %s" (Filename.quote pattern)
  in
  let find_cmd = Printf.sprintf "find %s -type f %s" quoted_dir name_filter in
  let cmd =
    if limit > 0 then Printf.sprintf "%s | head -n %d" find_cmd limit
    else find_cmd
  in
  let ic = Unix.open_process_in cmd in
  (* [match ... with exception] keeps the recursive call in tail position,
     unlike a recursive call made inside a [try] body. *)
  let rec read_lines acc =
    match input_line ic with
    | line -> read_lines (line :: acc)
    | exception End_of_file -> List.rev acc
  in
  Fun.protect
    ~finally:(fun () ->
      (* The exit status is deliberately ignored, as in the original. *)
      ignore (Unix.close_process_in ic : Unix.process_status))
    (fun () -> read_lines [])

(** [read_file path] returns the whole content of [path]. The channel is
    opened in binary mode so that [in_channel_length] agrees with the number
    of bytes actually read, and it is closed even if reading raises.
    @raise Sys_error if the file cannot be opened or read. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

(** [humanize_bytes bytes] renders a byte count using 1024-based units:
    ["%.1f GB"] / ["%.1f MB"] with one real decimal, whole ["KB"], or plain
    ["B"] for anything below 1024 (including negative values). The original
    integer-divided before formatting, so its decimal was always [.0]. *)
let humanize_bytes bytes =
  let kib = 1024L in
  let mib = Int64.mul kib 1024L in
  let gib = Int64.mul mib 1024L in
  let in_units unit_size = Int64.to_float bytes /. Int64.to_float unit_size in
  if Int64.compare bytes gib >= 0 then
    Printf.sprintf "%.1f GB" (in_units gib)
  else if Int64.compare bytes mib >= 0 then
    Printf.sprintf "%.1f MB" (in_units mib)
  else if Int64.compare bytes kib >= 0 then
    Printf.sprintf "%Ld KB" (Int64.div bytes kib)
  else Printf.sprintf "%Ld B" bytes

(** [format_thousands n] renders [n] in decimal with a comma between each
    group of three digits, counted from the right:
    [format_thousands 1234567 = "1,234,567"]. *)
let format_thousands n =
  let text = string_of_int n in
  let sign, digits =
    if n < 0 then ("-", String.sub text 1 (String.length text - 1))
    else ("", text)
  in
  let len = String.length digits in
  let buf = Buffer.create (len + (len / 3) + 1) in
  Buffer.add_string buf sign;
  String.iteri
    (fun i c ->
      (* A separator goes before every digit that starts a group of three
         when counting from the right-hand end. *)
      if i > 0 && (len - i) mod 3 = 0 then Buffer.add_char buf ',';
      Buffer.add_char buf c)
    digits;
  Buffer.contents buf

(* Statistics collection *)

(** Running totals accumulated while registries are processed. The three
    association lists map a label to the number of entries carrying it. *)
type registry_stats = {
  total_files: int;            (** registry files parsed successfully *)
  total_entries: int;          (** entries summed over those files *)
  total_size_estimate: int64;  (** rough guess: 1 MiB per entry *)
  years: (string * int) list;
  file_types: (string * int) list;
  hash_algorithms: (string * int) list;
}

(** Statistics before any file has been processed. *)
let empty_stats = {
  total_files = 0;
  total_entries = 0;
  total_size_estimate = 0L;
  years = [];
  file_types = [];
  hash_algorithms = [];
}

(** [analyze_filename filename] inspects the basename of [filename] and
    returns [(year, extension)] where
    - [year] is the second ['_']-separated field, if the basename contains
      an underscore (this file treats that field as the year);
    - [extension] is the text after the last ['.'], if there is one (it may
      be the empty string when the name ends with a dot). *)
let analyze_filename filename =
  let basename = Filename.basename filename in
  let year =
    match String.split_on_char '_' basename with
    | _ :: second :: _ -> Some second
    | [] | [ _ ] -> None
  in
  let extension =
    match String.rindex_opt basename '.' with
    | Some dot ->
      Some (String.sub basename (dot + 1) (String.length basename - dot - 1))
    | None -> None
  in
  (year, extension)

(** [count_map_incr key map] returns [map] with the counter for [key]
    increased by one (starting from zero when [key] is absent). The updated
    binding is moved to the front of the list. *)
let count_map_incr key map =
  let count = Option.value (List.assoc_opt key map) ~default:0 in
  (key, count + 1) :: List.remove_assoc key map

(** [record_entry stats entry] folds one registry entry into the per-year,
    per-extension and per-hash-algorithm counters of [stats], and adds the
    fixed 1 MiB guess to the size estimate. *)
let record_entry (stats : registry_stats) entry =
  let year, extension = analyze_filename (Registry.filename entry) in
  let bump label counts =
    match label with
    | Some l -> count_map_incr l counts
    | None -> counts
  in
  let algo_str =
    match Hash.algorithm (Registry.hash entry) with
    | Hash.SHA256 -> "SHA256"
    | Hash.SHA1 -> "SHA1"
    | Hash.MD5 -> "MD5"
  in
  { stats with
    years = bump year stats.years;
    file_types = bump extension stats.file_types;
    hash_algorithms = count_map_incr algo_str stats.hash_algorithms;
    (* Estimate file size (very rough - assume average 1MB per file) *)
    total_size_estimate = Int64.add stats.total_size_estimate 1048576L;
  }

(* Main processing function *)

(** [process_registries registry_dir pattern verbose limit] locates registry
    files with {!find_registry_files}, parses each one with
    [Registry.of_string], accumulates statistics over all entries and prints
    a summary to standard output.

    Returns the process exit code: [1] when no registry file was found,
    [0] otherwise. A file that fails to load or parse is reported and
    skipped; it does not change the exit code. *)
let process_registries registry_dir pattern verbose limit =
  Printf.printf "🔍 Searching for tessera-manifests registry files...\n";
  Printf.printf "Directory: %s\n" (expand_tilde registry_dir);
  Printf.printf "Pattern: %s\n" pattern;
  if limit > 0 then Printf.printf "Limit: %d files\n" limit;
  Printf.printf "\n";

  match find_registry_files registry_dir pattern limit with
  | [] ->
    Printf.printf "❌ No registry files found matching pattern '%s'\n" pattern;
    Printf.printf "Try checking the directory path or adjusting the pattern.\n";
    (* Returned rather than [exit 1]: the caller turns it into the exit
       status, so the observable result is the same. *)
    1
  | registry_files ->
    let total_files = List.length registry_files in
    Printf.printf "📋 Found %d registry files\n\n" total_files;

    (* Create a simple progress display *)
    let start_time = Unix.gettimeofday () in
    let print_progress i =
      let pct = (float_of_int (i + 1) /. float_of_int total_files) *. 100.0 in
      let elapsed = Unix.gettimeofday () -. start_time in
      let eta =
        if i > 0 then
          elapsed /. float_of_int (i + 1) *. float_of_int (total_files - i - 1)
        else 0.0
      in
      let bar_width = 30 in
      let filled = int_of_float (pct /. 100.0 *. float_of_int bar_width) in
      let bar = String.make filled '#' ^ String.make (bar_width - filled) '-' in
      Printf.printf "\r🚀 [%s] %.1f%% (%d/%d) ETA: %.0fs "
        bar pct (i + 1) total_files eta;
      flush stdout
    in

    let stats = ref empty_stats in

    (* Load, parse and account for a single registry file. *)
    let process_file i file_path =
      if not verbose then print_progress i;
      if verbose then Printf.printf "📄 Processing: %s\n" file_path;

      (* Load registry file *)
      let content = read_file file_path in

      (* Parse registry with progress feedback if verbose *)
      let registry =
        Registry.of_string
          ~progress:(fun current total ->
            if verbose && current mod 100 = 0 then
              Printf.printf "\r 📝 Parsing: %d/%d lines (%.0f%%)"
                current total
                ((float_of_int current /. float_of_int total) *. 100.0))
          content
      in
      if verbose then
        (* The trailing spaces overwrite what is left of the parsing line. *)
        Printf.printf "\r ✅ Parsed %d lines%s\n"
          (List.length (String.split_on_char '\n' content))
          (String.make 30 ' ');
      let entries = Registry.entries registry in
      let entry_count = List.length entries in

      if verbose then begin
        Printf.printf " └─ %d entries\n" entry_count;

        (* Show a few sample entries *)
        let sample_size = min 3 entry_count in
        List.iter
          (fun entry ->
            Printf.printf " • %s (%s)\n"
              (Registry.filename entry)
              (Hash.to_string (Registry.hash entry)))
          (take sample_size entries);
        if entry_count > sample_size then
          Printf.printf " ... and %d more\n" (entry_count - sample_size);
        Printf.printf "\n"
      end;

      (* Collect statistics *)
      stats := { !stats with
                 total_files = !stats.total_files + 1;
                 total_entries = !stats.total_entries + entry_count };

      (* Analyze entries for additional statistics *)
      List.iter (fun entry -> stats := record_entry !stats entry) entries
    in

    (* Process each registry file. Best effort: a failing file is reported
       and the run continues with the next one. *)
    List.iteri
      (fun i file_path ->
        try process_file i file_path
        with exn ->
          Printf.printf "❌ Error processing %s: %s\n" file_path
            (Printexc.to_string exn))
      registry_files;

    if not verbose then
      Printf.printf "\r%s\r✅ Processing complete!\n" (String.make 80 ' ');

    let summary = !stats in

    Printf.printf "\n📊 TESSERA REGISTRY ANALYSIS SUMMARY\n";
    Printf.printf "=====================================\n\n";

    Printf.printf "📁 Registry Files: %d\n" summary.total_files;
    Printf.printf "📄 Total Data Entries: %s\n"
      (format_thousands summary.total_entries);
    Printf.printf "💾 Estimated Data Size: %s\n"
      (humanize_bytes summary.total_size_estimate);
    Printf.printf "\n";

    (* Show top categories *)
    let show_top_list title items =
      match items with
      | [] -> ()
      | _ :: _ ->
        Printf.printf "🏆 %s:\n" title;
        (* Highest count first; at most five rows are shown. *)
        let sorted = List.sort (fun (_, a) (_, b) -> Int.compare b a) items in
        List.iter
          (fun (name, count) ->
            Printf.printf " • %s: %d entries\n" name count)
          (take 5 sorted);
        Printf.printf "\n"
    in

    show_top_list "Years by Entry Count" summary.years;
    show_top_list "File Types" summary.file_types;
    show_top_list "Hash Algorithms" summary.hash_algorithms;

    Printf.printf "✨ Analysis complete!\n";
    0

(* Command line interface *)

(** The [tessera-loader] command: evaluates the four arguments above and
    passes them to {!process_registries}. *)
let tessera_cmd =
  let doc = "Load and analyze tessera-manifests registry files" in
  let info = Cmd.info "tessera-loader" ~doc in
  Cmd.v info
    Term.(const process_registries $ registry_dir $ pattern $ verbose $ limit)

(* Entry point: exit with the code returned by [process_registries], 0 after
   printing help or version, and 1 on any evaluation error. *)
let () =
  match Cmd.eval_value tessera_cmd with
  | Ok (`Ok exit_code) -> exit exit_code
  | Ok (`Version | `Help) -> exit 0
  | Error _ -> exit 1