My agentic slop goes here. Not intended for anyone else!
1open Toru
2open Cmdliner
3
4(* Command line arguments *)
(** [-d] / [--dir] option: the directory searched for registry files.
    The default starts with [~/]; the tilde is not expanded here. *)
let registry_dir =
  let default_dir = "~/src/git/ucam-eo/tessera-manifests/registry" in
  let option_info =
    Arg.info [ "d"; "dir" ] ~docv:"DIR"
      ~doc:"Directory containing tessera-manifests registry files"
  in
  Arg.value (Arg.opt Arg.string default_dir option_info)
9
(** [-p] / [--pattern] option: a file-name pattern, defaulting to ["*"]. *)
let pattern =
  let open Arg in
  let doc = "File pattern to match (e.g., '*2024*' for 2024 files only)" in
  info [ "p"; "pattern" ] ~docv:"PATTERN" ~doc |> opt string "*" |> value
13
(** [-v] / [--verbose] flag: off unless given on the command line. *)
let verbose =
  Arg.info [ "v"; "verbose" ]
    ~doc:"Show verbose output including file details"
  |> Arg.flag
  |> Arg.value
17
(** [-l] / [--limit] option: maximum number of files to process.
    Defaults to [0], which the help text describes as "no limit". *)
let limit =
  let doc = "Limit number of files to process (0 = no limit)" in
  let spec = Arg.(opt int 0 (info [ "l"; "limit" ] ~docv:"N" ~doc)) in
  Arg.value spec
21
22(* Helper functions *)
(** [take n lst] returns the first [n] elements of [lst], or all of [lst]
    when it holds fewer than [n] elements.  A negative [n] never counts
    down to zero, so it also yields the whole list. *)
let rec take n = function
  | [] -> []
  | _ when n = 0 -> []
  | head :: tail -> head :: take (n - 1) tail
27
(** [expand_tilde path] replaces a leading [~] or [~/] in [path] with the
    value of the [HOME] environment variable.

    Paths that do not start that way (including the [~user] form) are
    returned unchanged.  When [HOME] is not set the path is also returned
    unchanged instead of raising [Not_found], so callers can report a
    missing directory rather than die on an uncaught exception. *)
let expand_tilde path =
  match Sys.getenv_opt "HOME" with
  | None -> path
  | Some home ->
    if String.equal path "~" then home
    else if String.starts_with ~prefix:"~/" path then
      (* Drop the two-character "~/" prefix and re-root the rest at HOME. *)
      Filename.concat home (String.sub path 2 (String.length path - 2))
    else path
36
(** [find_registry_files dir pattern limit] lists the [*.txt] regular files
    found under [dir] by running the external [find] command.

    - [dir] may start with [~] or [~/]; it is passed through [expand_tilde].
    - [pattern] is an additional [find -name] pattern; ["*"] means no extra
      filter.
    - [limit] caps the number of paths returned when positive.  Zero or a
      negative value means no cap at all (previously the result was silently
      truncated to 9999 paths).

    Paths are returned in the order [find] printed them.  The exit status of
    the command is ignored, so a failing [find] yields whatever lines it
    managed to print (possibly none). *)
let find_registry_files dir pattern limit =
  let expanded_dir = expand_tilde dir in
  (* "*" matches every name, so only a narrower pattern needs its own test. *)
  let pattern_filter =
    if String.equal pattern "*" then ""
    else Printf.sprintf " -name %s" (Filename.quote pattern)
  in
  (* Pipe through head only when a real limit was requested; "head -n N" is
     the portable spelling of the obsolete "head -N". *)
  let limit_filter =
    if limit > 0 then Printf.sprintf " | head -n %d" limit else ""
  in
  let cmd =
    Printf.sprintf "find %s -type f -name '*.txt'%s%s"
      (Filename.quote expanded_dir) pattern_filter limit_filter
  in
  let ic = Unix.open_process_in cmd in
  (* [match ... with exception] keeps the recursive call in tail position,
     unlike a call made inside a [try] body. *)
  let rec read_lines acc =
    match input_line ic with
    | line -> read_lines (line :: acc)
    | exception End_of_file -> List.rev acc
  in
  Fun.protect
    ~finally:(fun () ->
      ignore (Unix.close_process_in ic : Unix.process_status))
    (fun () -> read_lines [])
59
(** [humanize_bytes bytes] formats a byte count using 1024-based units.

    Values of at least 1024{^3} are shown as ["%.1f GB"], values of at least
    1024{^2} as ["%.1f MB"], values of at least 1024 as a whole number of
    ["KB"] (rounded down), and anything smaller (including negative counts)
    as plain ["B"].

    The GB and MB figures are computed in floating point so the decimal
    place is meaningful; dividing as integers first always printed [.0]. *)
let humanize_bytes bytes =
  let kib = 1024L in
  let mib = Int64.mul kib 1024L in
  let gib = Int64.mul mib 1024L in
  let scaled unit_size = Int64.to_float bytes /. Int64.to_float unit_size in
  if Int64.compare bytes gib >= 0 then Printf.sprintf "%.1f GB" (scaled gib)
  else if Int64.compare bytes mib >= 0 then
    Printf.sprintf "%.1f MB" (scaled mib)
  else if Int64.compare bytes kib >= 0 then
    Printf.sprintf "%Ld KB" (Int64.div bytes kib)
  else Printf.sprintf "%Ld B" bytes
68
69(* Statistics collection *)
(** Running totals gathered while registry files are processed.  The three
    association lists map a label to the number of entries seen with it. *)
type registry_stats = {
  total_files : int;  (** Registry files processed without an error. *)
  total_entries : int;  (** Entries summed over those files. *)
  total_size_estimate : int64;
      (** Rough byte total; the processing loop adds a fixed amount per
          entry rather than measuring anything. *)
  years : (string * int) list;
      (** Entry counts keyed by the first result of [analyze_filename]. *)
  file_types : (string * int) list;
      (** Entry counts keyed by the second result of [analyze_filename]. *)
  hash_algorithms : (string * int) list;
      (** Entry counts keyed by hash algorithm name. *)
}
78
(** [analyze_filename filename] inspects the base name of [filename] and
    returns [(year, extension)]:

    - [year] is the second underscore-separated token of the base name, or
      [None] when the base name contains no underscore.  The token is taken
      verbatim; nothing checks that it looks like a year.
    - [extension] is the text after the last dot of the base name (possibly
      empty), or [None] when the base name contains no dot. *)
let analyze_filename filename =
  let base = Filename.basename filename in
  let year =
    match String.split_on_char '_' base with
    | _ :: second :: _ -> Some second
    | [] | [ _ ] -> None
  in
  let extension =
    match String.rindex_opt base '.' with
    | Some dot ->
      Some (String.sub base (dot + 1) (String.length base - dot - 1))
    | None -> None
  in
  (year, extension)
90
(** [count_map_incr key map] returns [map] with the count for [key] raised
    by one (starting from zero when [key] is absent).  The updated binding
    is moved to the front of the association list. *)
let count_map_incr key map =
  let previous = Option.value (List.assoc_opt key map) ~default:0 in
  let others = List.remove_assoc key map in
  (key, previous + 1) :: others
94
95(* Main processing function *)
(* Render [n] in decimal with a comma between each group of three digits,
   grouping from the right: 1234567 becomes "1,234,567". *)
let format_with_thousands n =
  let s = string_of_int n in
  let sign, digits =
    if n < 0 then ("-", String.sub s 1 (String.length s - 1)) else ("", s)
  in
  let len = String.length digits in
  let buf = Buffer.create (len + (len / 3) + 1) in
  Buffer.add_string buf sign;
  String.iteri
    (fun i c ->
      (* A comma precedes every digit that starts a complete group of three
         when counting from the right-hand end. *)
      if i > 0 && (len - i) mod 3 = 0 then Buffer.add_char buf ',';
      Buffer.add_char buf c)
    digits;
  Buffer.contents buf

(* Read the whole of [path] into a string; the channel is closed even when
   reading raises. *)
let read_whole_file path =
  let ic = open_in path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

(** [process_registries registry_dir pattern verbose limit] finds registry
    files, parses each one and prints a summary to standard output.

    - [registry_dir] and [pattern] select the files (see
      [find_registry_files]); [limit] caps how many are processed when
      positive.
    - [verbose] prints per-file details and a few sample entries instead of
      the single-line progress bar.

    Returns [0] once the summary has been printed.  If no file matches, a
    message is printed and the process exits with status 1.  An exception
    raised while handling one file is reported and that file is skipped; it
    does not stop the run or change the return value. *)
let process_registries registry_dir pattern verbose limit =
  Printf.printf "🔍 Searching for tessera-manifests registry files...\n";
  Printf.printf "Directory: %s\n" (expand_tilde registry_dir);
  Printf.printf "Pattern: %s\n" pattern;
  if limit > 0 then Printf.printf "Limit: %d files\n" limit;
  Printf.printf "\n";

  let registry_files = find_registry_files registry_dir pattern limit in
  let total_files = List.length registry_files in

  if total_files = 0 then (
    Printf.printf "❌ No registry files found matching pattern '%s'\n" pattern;
    Printf.printf "Try checking the directory path or adjusting the pattern.\n";
    exit 1
  );

  Printf.printf "📋 Found %d registry files\n\n" total_files;

  (* Single-line progress bar, redrawn in place with a carriage return. *)
  let start_time = Unix.gettimeofday () in
  let print_progress i =
    let pct = (float_of_int (i + 1) /. float_of_int total_files) *. 100.0 in
    let elapsed = Unix.gettimeofday () -. start_time in
    (* No estimate for the very first file: nothing has been timed yet. *)
    let eta =
      if i > 0 then
        elapsed /. float_of_int (i + 1) *. float_of_int (total_files - i - 1)
      else 0.0
    in
    let bar_width = 30 in
    let filled = int_of_float (pct /. 100.0 *. float_of_int bar_width) in
    let bar = String.make filled '#' ^ String.make (bar_width - filled) '-' in
    Printf.printf "\r🚀 [%s] %.1f%% (%d/%d) ETA: %.0fs "
      bar pct (i + 1) total_files eta;
    flush stdout
  in

  let stats = ref {
    total_files = 0;
    total_entries = 0;
    total_size_estimate = 0L;
    years = [];
    file_types = [];
    hash_algorithms = [];
  } in

  (* Increment the counter for [key] when there is one. *)
  let bump key counts =
    match key with
    | Some k -> count_map_incr k counts
    | None -> counts
  in

  (* Process each registry file *)
  List.iteri (fun i file_path ->
    try
      if verbose then Printf.printf "📄 Processing: %s\n" file_path
      else print_progress i;

      (* Load registry file *)
      let content = read_whole_file file_path in

      (* Parse registry with progress feedback if verbose *)
      let registry = Registry.of_string ~progress:(fun current total ->
        if verbose && current mod 100 = 0 then
          Printf.printf "\r 📝 Parsing: %d/%d lines (%.0f%%)"
            current total ((float_of_int current /. float_of_int total) *. 100.0)
      ) content in
      (* The trailing spaces overwrite what is left of the parsing line. *)
      if verbose then Printf.printf "\r ✅ Parsed %d lines%s\n"
        (List.length (String.split_on_char '\n' content)) (String.make 30 ' ');
      let entries = Registry.entries registry in
      let entry_count = List.length entries in

      if verbose then (
        Printf.printf " └─ %d entries\n" entry_count;

        (* Show a few sample entries *)
        let sample_size = min 3 entry_count in
        List.iter (fun entry ->
          Printf.printf " • %s (%s)\n"
            (Registry.filename entry)
            (Hash.to_string (Registry.hash entry))
        ) (take sample_size entries);
        if entry_count > sample_size then
          Printf.printf " ... and %d more\n" (entry_count - sample_size);
        Printf.printf "\n"
      );

      (* Collect statistics *)
      stats := { !stats with
        total_files = !stats.total_files + 1;
        total_entries = !stats.total_entries + entry_count;
      };

      (* Analyze entries for additional statistics *)
      List.iter (fun entry ->
        let (year, extension) = analyze_filename (Registry.filename entry) in
        let algo_str = match Hash.algorithm (Registry.hash entry) with
          | Hash.SHA256 -> "SHA256"
          | Hash.SHA1 -> "SHA1"
          | Hash.MD5 -> "MD5" in
        let s = !stats in
        stats := { s with
          years = bump year s.years;
          file_types = bump extension s.file_types;
          hash_algorithms = count_map_incr algo_str s.hash_algorithms;
          (* Estimate file size (very rough - assume average 1MB per file) *)
          total_size_estimate = Int64.add s.total_size_estimate 1048576L;
        }
      ) entries
    with exn ->
      Printf.printf "❌ Error processing %s: %s\n" file_path (Printexc.to_string exn)
  ) registry_files;

  (* Blank out the progress bar before printing the completion line. *)
  if not verbose then Printf.printf "\r%s\r✅ Processing complete!\n" (String.make 80 ' ');

  Printf.printf "\n📊 TESSERA REGISTRY ANALYSIS SUMMARY\n";
  Printf.printf "=====================================\n\n";

  Printf.printf "📁 Registry Files: %d\n" !stats.total_files;
  Printf.printf "📄 Total Data Entries: %s\n"
    (format_with_thousands !stats.total_entries);
  Printf.printf "💾 Estimated Data Size: %s\n" (humanize_bytes !stats.total_size_estimate);
  Printf.printf "\n";

  (* Show top categories *)
  let show_top_list title items =
    if items <> [] then (
      Printf.printf "🏆 %s:\n" title;
      (* Highest count first. *)
      let sorted = List.sort (fun (_, a) (_, b) -> Int.compare b a) items in
      List.iter (fun (name, count) ->
        Printf.printf " • %s: %d entries\n" name count
      ) (take 5 sorted);
      Printf.printf "\n"
    )
  in

  show_top_list "Years by Entry Count" !stats.years;
  show_top_list "File Types" !stats.file_types;
  show_top_list "Hash Algorithms" !stats.hash_algorithms;

  Printf.printf "✨ Analysis complete!\n";
  0
250
251(* Command line interface *)
(** The [tessera-loader] command: wires the four command-line arguments
    into [process_registries]. *)
let tessera_cmd =
  let run =
    Term.(const process_registries $ registry_dir $ pattern $ verbose $ limit)
  in
  Cmd.v
    (Cmd.info "tessera-loader"
       ~doc:"Load and analyze tessera-manifests registry files")
    run
256
(* Program entry point: evaluate the command and exit with its result.
   Help and version requests exit with 0; an evaluation error exits with 1. *)
let () =
  let code =
    match Cmd.eval_value tessera_cmd with
    | Ok (`Ok exit_code) -> exit_code
    | Ok (`Help | `Version) -> 0
    | Error _ -> 1
  in
  exit code
261 | Error _ -> exit 1