My agentic slop goes here. Not intended for anyone else!
1(* publicsuffix.ml - Public Suffix List implementation for OCaml
2
3 This implements the PSL algorithm as specified at:
4 https://publicsuffix.org/list/
5
6 Algorithm summary:
7 1. Match domain against all rules
8 2. If no rules match, the prevailing rule is "*" (implicit wildcard)
9 3. If more than one rule matches, exception rules take priority
10 4. If no exception rule, the rule with the most labels wins
11 5. If the prevailing rule is an exception, remove its leftmost label
12 6. The public suffix is the labels matching the prevailing rule
13 7. The registrable domain is the public suffix plus one additional label
14*)
15
16(* Use types from generated data *)
17type section = Publicsuffix_data.section = ICANN | Private
18
19(* Bring the trie_node type and its fields into scope *)
20open Publicsuffix_data
21
22type t = {
23 root : trie_node;
24}
25
26type error =
27 | Empty_domain
28 | Invalid_domain of string
29 | Leading_dot
30 | Punycode_error of string
31 | No_public_suffix
32 | Domain_is_public_suffix
33
34let pp_error fmt = function
35 | Empty_domain -> Format.fprintf fmt "Empty domain"
36 | Invalid_domain s -> Format.fprintf fmt "Invalid domain: %s" s
37 | Leading_dot -> Format.fprintf fmt "Domain has a leading dot"
38 | Punycode_error s -> Format.fprintf fmt "Punycode conversion error: %s" s
39 | No_public_suffix -> Format.fprintf fmt "No public suffix found"
40 | Domain_is_public_suffix -> Format.fprintf fmt "Domain is itself a public suffix"
41
42let error_to_string err =
43 Format.asprintf "%a" pp_error err
44
45let create () =
46 { root = Publicsuffix_data.get_root () }
47
48(* Find a child node by label (case-insensitive) *)
49let find_child (node : trie_node) label =
50 let label_lower = String.lowercase_ascii label in
51 List.find_opt (fun (l, _) -> String.lowercase_ascii l = label_lower) node.children
52 |> Option.map snd
53
54(** Result of matching a domain against the trie *)
55type match_result = {
56 matched_labels : int; (* Number of labels matched *)
57 rule_type : rule_type; (* Type of the matching rule *)
58 section : section; (* Section of the rule *)
59 is_exception : bool; (* Whether this is an exception rule *)
60}
61
62(** Find all matching rules for a domain.
63 Labels should be in reverse order (TLD first). *)
64let find_matches (root : trie_node) labels =
65 let matches = ref [] in
66
67 (* Track whether we matched the implicit * rule *)
68 let implicit_match = {
69 matched_labels = 1;
70 rule_type = Wildcard;
71 section = ICANN; (* Implicit rule is considered ICANN *)
72 is_exception = false;
73 } in
74
75 let rec traverse (node : trie_node) depth remaining_labels =
76 (* Check if current node has a rule *)
77 (match node.rule with
78 | Some (rt, sec) ->
79 let m = {
80 matched_labels = depth;
81 rule_type = rt;
82 section = sec;
83 is_exception = (rt = Exception);
84 } in
85 matches := m :: !matches
86 | None -> ());
87
88 (* Continue traversing if we have more labels *)
89 match remaining_labels with
90 | [] -> ()
91 | label :: rest ->
92 (* Check for wildcard match *)
93 (match node.wildcard_child with
94 | Some wc ->
95 (match wc.rule with
96 | Some (rt, sec) ->
97 let m = {
98 matched_labels = depth + 1;
99 rule_type = rt;
100 section = sec;
101 is_exception = (rt = Exception);
102 } in
103 matches := m :: !matches
104 | None -> ())
105 | None -> ());
106
107 (* Check for exact label match *)
108 match find_child node label with
109 | Some child -> traverse child (depth + 1) rest
110 | None -> ()
111 in
112
113 traverse root 0 labels;
114
115 (* If no matches, return the implicit * rule *)
116 if !matches = [] then [implicit_match]
117 else !matches
118
119(** Select the prevailing rule from a list of matches.
120 Per the algorithm:
121 1. Exception rules take priority
122 2. Otherwise, the rule with the most labels wins
123*)
124let select_prevailing_rule matches =
125 (* First, check for exception rules *)
126 let exceptions = List.filter (fun m -> m.is_exception) matches in
127 match exceptions with
128 | ex :: _ -> ex (* Exception rules take priority *)
129 | [] ->
130 (* Find the rule with the most labels *)
131 List.fold_left (fun best m ->
132 if m.matched_labels > best.matched_labels then m else best
133 ) (List.hd matches) matches
134
135(** Normalize a domain for lookup:
136 - Convert to lowercase
137 - Convert IDN to Punycode
138 - Split into labels
139 - Handle trailing dots
140*)
141let normalize_domain domain =
142 if domain = "" then Error Empty_domain
143 else if String.length domain > 0 && domain.[0] = '.' then Error Leading_dot
144 else begin
145 (* Check for and preserve trailing dot *)
146 let has_trailing_dot =
147 String.length domain > 0 && domain.[String.length domain - 1] = '.'
148 in
149 let domain =
150 if has_trailing_dot then
151 String.sub domain 0 (String.length domain - 1)
152 else domain
153 in
154 if domain = "" then Error Empty_domain
155 else
156 (* Convert IDN to ASCII (Punycode) *)
157 match Punycode_idna.to_ascii domain with
158 | Error e ->
159 let msg = Format.asprintf "%a" Punycode_idna.pp_error e in
160 Error (Punycode_error msg)
161 | Ok ascii_domain ->
162 (* Convert to lowercase *)
163 let ascii_lower = String.lowercase_ascii ascii_domain in
164 (* Split into labels *)
165 let labels = String.split_on_char '.' ascii_lower in
166 (* Filter empty labels (shouldn't happen after normalization) *)
167 let labels = List.filter (fun s -> s <> "") labels in
168 if labels = [] then Error Empty_domain
169 else Ok (labels, has_trailing_dot)
170 end
171
172(** Convert labels back to a domain string *)
173let labels_to_domain labels has_trailing_dot =
174 let domain = String.concat "." labels in
175 if has_trailing_dot then domain ^ "." else domain
176
177let public_suffix_with_section t domain =
178 match normalize_domain domain with
179 | Error e -> Error e
180 | Ok (labels, has_trailing_dot) ->
181 (* Reverse labels for trie traversal (TLD first) *)
182 let rev_labels = List.rev labels in
183 (* Find all matching rules *)
184 let matches = find_matches t.root rev_labels in
185 (* Select the prevailing rule *)
186 let prevailing = select_prevailing_rule matches in
187 (* Determine the number of suffix labels *)
188 let suffix_label_count =
189 if prevailing.is_exception then
190 (* Exception rules: remove leftmost label from the rule *)
191 prevailing.matched_labels - 1
192 else
193 prevailing.matched_labels
194 in
195 (* Extract the suffix labels from the domain *)
196 let n = List.length labels in
197 if suffix_label_count > n then
198 Error No_public_suffix
199 else begin
200 let suffix_labels =
201 (* Take the rightmost suffix_label_count labels *)
202 let rec take_last n lst =
203 if List.length lst <= n then lst
204 else take_last n (List.tl lst)
205 in
206 take_last suffix_label_count labels
207 in
208 let suffix = labels_to_domain suffix_labels has_trailing_dot in
209 Ok (suffix, prevailing.section)
210 end
211
212let public_suffix t domain =
213 match public_suffix_with_section t domain with
214 | Ok (suffix, _) -> Ok suffix
215 | Error e -> Error e
216
217let registrable_domain_with_section t domain =
218 match normalize_domain domain with
219 | Error e -> Error e
220 | Ok (labels, has_trailing_dot) ->
221 (* Reverse labels for trie traversal (TLD first) *)
222 let rev_labels = List.rev labels in
223 (* Find all matching rules *)
224 let matches = find_matches t.root rev_labels in
225 (* Select the prevailing rule *)
226 let prevailing = select_prevailing_rule matches in
227 (* Determine the number of suffix labels *)
228 let suffix_label_count =
229 if prevailing.is_exception then
230 prevailing.matched_labels - 1
231 else
232 prevailing.matched_labels
233 in
234 let n = List.length labels in
235 (* Registrable domain = suffix + 1 label *)
236 let reg_label_count = suffix_label_count + 1 in
237 if reg_label_count > n then
238 (* Domain is a public suffix or shorter *)
239 Error Domain_is_public_suffix
240 else begin
241 let reg_labels =
242 let rec take_last n lst =
243 if List.length lst <= n then lst
244 else take_last n (List.tl lst)
245 in
246 take_last reg_label_count labels
247 in
248 let reg_domain = labels_to_domain reg_labels has_trailing_dot in
249 Ok (reg_domain, prevailing.section)
250 end
251
252let registrable_domain t domain =
253 match registrable_domain_with_section t domain with
254 | Ok (domain, _) -> Ok domain
255 | Error e -> Error e
256
257let is_public_suffix t domain =
258 match normalize_domain domain with
259 | Error e -> Error e
260 | Ok (labels, _) ->
261 let rev_labels = List.rev labels in
262 let matches = find_matches t.root rev_labels in
263 let prevailing = select_prevailing_rule matches in
264 let suffix_label_count =
265 if prevailing.is_exception then
266 prevailing.matched_labels - 1
267 else
268 prevailing.matched_labels
269 in
270 (* Domain is a public suffix if it has exactly suffix_label_count labels *)
271 Ok (List.length labels = suffix_label_count)
272
273let is_registrable_domain t domain =
274 match normalize_domain domain with
275 | Error e -> Error e
276 | Ok (labels, _) ->
277 let rev_labels = List.rev labels in
278 let matches = find_matches t.root rev_labels in
279 let prevailing = select_prevailing_rule matches in
280 let suffix_label_count =
281 if prevailing.is_exception then
282 prevailing.matched_labels - 1
283 else
284 prevailing.matched_labels
285 in
286 let reg_label_count = suffix_label_count + 1 in
287 (* Domain is registrable if it has exactly reg_label_count labels *)
288 Ok (List.length labels = reg_label_count)
289
290let rule_count _t = Publicsuffix_data.rule_count
291let icann_rule_count _t = Publicsuffix_data.icann_rule_count
292let private_rule_count _t = Publicsuffix_data.private_rule_count