My agentic slop goes here. Not intended for anyone else!
at main 10 kB view raw
1(* publicsuffix.ml - Public Suffix List implementation for OCaml 2 3 This implements the PSL algorithm as specified at: 4 https://publicsuffix.org/list/ 5 6 Algorithm summary: 7 1. Match domain against all rules 8 2. If no rules match, the prevailing rule is "*" (implicit wildcard) 9 3. If more than one rule matches, exception rules take priority 10 4. If no exception rule, the rule with the most labels wins 11 5. If the prevailing rule is an exception, remove its leftmost label 12 6. The public suffix is the labels matching the prevailing rule 13 7. The registrable domain is the public suffix plus one additional label 14*) 15 16(* Use types from generated data *) 17type section = Publicsuffix_data.section = ICANN | Private 18 19(* Bring the trie_node type and its fields into scope *) 20open Publicsuffix_data 21 22type t = { 23 root : trie_node; 24} 25 26type error = 27 | Empty_domain 28 | Invalid_domain of string 29 | Leading_dot 30 | Punycode_error of string 31 | No_public_suffix 32 | Domain_is_public_suffix 33 34let pp_error fmt = function 35 | Empty_domain -> Format.fprintf fmt "Empty domain" 36 | Invalid_domain s -> Format.fprintf fmt "Invalid domain: %s" s 37 | Leading_dot -> Format.fprintf fmt "Domain has a leading dot" 38 | Punycode_error s -> Format.fprintf fmt "Punycode conversion error: %s" s 39 | No_public_suffix -> Format.fprintf fmt "No public suffix found" 40 | Domain_is_public_suffix -> Format.fprintf fmt "Domain is itself a public suffix" 41 42let error_to_string err = 43 Format.asprintf "%a" pp_error err 44 45let create () = 46 { root = Publicsuffix_data.get_root () } 47 48(* Find a child node by label (case-insensitive) *) 49let find_child (node : trie_node) label = 50 let label_lower = String.lowercase_ascii label in 51 List.find_opt (fun (l, _) -> String.lowercase_ascii l = label_lower) node.children 52 |> Option.map snd 53 54(** Result of matching a domain against the trie *) 55type match_result = { 56 matched_labels : int; (* Number of labels matched *) 57 rule_type : rule_type; (* Type of the matching rule *) 58 section : section; (* Section of the rule *) 59 is_exception : bool; (* Whether this is an exception rule *) 60} 61 62(** Find all matching rules for a domain. 63 Labels should be in reverse order (TLD first). *) 64let find_matches (root : trie_node) labels = 65 let matches = ref [] in 66 67 (* Track whether we matched the implicit * rule *) 68 let implicit_match = { 69 matched_labels = 1; 70 rule_type = Wildcard; 71 section = ICANN; (* Implicit rule is considered ICANN *) 72 is_exception = false; 73 } in 74 75 let rec traverse (node : trie_node) depth remaining_labels = 76 (* Check if current node has a rule *) 77 (match node.rule with 78 | Some (rt, sec) -> 79 let m = { 80 matched_labels = depth; 81 rule_type = rt; 82 section = sec; 83 is_exception = (rt = Exception); 84 } in 85 matches := m :: !matches 86 | None -> ()); 87 88 (* Continue traversing if we have more labels *) 89 match remaining_labels with 90 | [] -> () 91 | label :: rest -> 92 (* Check for wildcard match *) 93 (match node.wildcard_child with 94 | Some wc -> 95 (match wc.rule with 96 | Some (rt, sec) -> 97 let m = { 98 matched_labels = depth + 1; 99 rule_type = rt; 100 section = sec; 101 is_exception = (rt = Exception); 102 } in 103 matches := m :: !matches 104 | None -> ()) 105 | None -> ()); 106 107 (* Check for exact label match *) 108 match find_child node label with 109 | Some child -> traverse child (depth + 1) rest 110 | None -> () 111 in 112 113 traverse root 0 labels; 114 115 (* If no matches, return the implicit * rule *) 116 if !matches = [] then [implicit_match] 117 else !matches 118 119(** Select the prevailing rule from a list of matches. 120 Per the algorithm: 121 1. Exception rules take priority 122 2. Otherwise, the rule with the most labels wins 123*) 124let select_prevailing_rule matches = 125 (* First, check for exception rules *) 126 let exceptions = List.filter (fun m -> m.is_exception) matches in 127 match exceptions with 128 | ex :: _ -> ex (* Exception rules take priority *) 129 | [] -> 130 (* Find the rule with the most labels *) 131 List.fold_left (fun best m -> 132 if m.matched_labels > best.matched_labels then m else best 133 ) (List.hd matches) matches 134 135(** Normalize a domain for lookup: 136 - Convert to lowercase 137 - Convert IDN to Punycode 138 - Split into labels 139 - Handle trailing dots 140*) 141let normalize_domain domain = 142 if domain = "" then Error Empty_domain 143 else if String.length domain > 0 && domain.[0] = '.' then Error Leading_dot 144 else begin 145 (* Check for and preserve trailing dot *) 146 let has_trailing_dot = 147 String.length domain > 0 && domain.[String.length domain - 1] = '.' 148 in 149 let domain = 150 if has_trailing_dot then 151 String.sub domain 0 (String.length domain - 1) 152 else domain 153 in 154 if domain = "" then Error Empty_domain 155 else 156 (* Convert IDN to ASCII (Punycode) *) 157 match Punycode_idna.to_ascii domain with 158 | Error e -> 159 let msg = Format.asprintf "%a" Punycode_idna.pp_error e in 160 Error (Punycode_error msg) 161 | Ok ascii_domain -> 162 (* Convert to lowercase *) 163 let ascii_lower = String.lowercase_ascii ascii_domain in 164 (* Split into labels *) 165 let labels = String.split_on_char '.' ascii_lower in 166 (* Filter empty labels (shouldn't happen after normalization) *) 167 let labels = List.filter (fun s -> s <> "") labels in 168 if labels = [] then Error Empty_domain 169 else Ok (labels, has_trailing_dot) 170 end 171 172(** Convert labels back to a domain string *) 173let labels_to_domain labels has_trailing_dot = 174 let domain = String.concat "." labels in 175 if has_trailing_dot then domain ^ "." else domain 176 177let public_suffix_with_section t domain = 178 match normalize_domain domain with 179 | Error e -> Error e 180 | Ok (labels, has_trailing_dot) -> 181 (* Reverse labels for trie traversal (TLD first) *) 182 let rev_labels = List.rev labels in 183 (* Find all matching rules *) 184 let matches = find_matches t.root rev_labels in 185 (* Select the prevailing rule *) 186 let prevailing = select_prevailing_rule matches in 187 (* Determine the number of suffix labels *) 188 let suffix_label_count = 189 if prevailing.is_exception then 190 (* Exception rules: remove leftmost label from the rule *) 191 prevailing.matched_labels - 1 192 else 193 prevailing.matched_labels 194 in 195 (* Extract the suffix labels from the domain *) 196 let n = List.length labels in 197 if suffix_label_count > n then 198 Error No_public_suffix 199 else begin 200 let suffix_labels = 201 (* Take the rightmost suffix_label_count labels *) 202 let rec take_last n lst = 203 if List.length lst <= n then lst 204 else take_last n (List.tl lst) 205 in 206 take_last suffix_label_count labels 207 in 208 let suffix = labels_to_domain suffix_labels has_trailing_dot in 209 Ok (suffix, prevailing.section) 210 end 211 212let public_suffix t domain = 213 match public_suffix_with_section t domain with 214 | Ok (suffix, _) -> Ok suffix 215 | Error e -> Error e 216 217let registrable_domain_with_section t domain = 218 match normalize_domain domain with 219 | Error e -> Error e 220 | Ok (labels, has_trailing_dot) -> 221 (* Reverse labels for trie traversal (TLD first) *) 222 let rev_labels = List.rev labels in 223 (* Find all matching rules *) 224 let matches = find_matches t.root rev_labels in 225 (* Select the prevailing rule *) 226 let prevailing = select_prevailing_rule matches in 227 (* Determine the number of suffix labels *) 228 let suffix_label_count = 229 if prevailing.is_exception then 230 prevailing.matched_labels - 1 231 else 232 prevailing.matched_labels 233 in 234 let n = List.length labels in 235 (* Registrable domain = suffix + 1 label *) 236 let reg_label_count = suffix_label_count + 1 in 237 if reg_label_count > n then 238 (* Domain is a public suffix or shorter *) 239 Error Domain_is_public_suffix 240 else begin 241 let reg_labels = 242 let rec take_last n lst = 243 if List.length lst <= n then lst 244 else take_last n (List.tl lst) 245 in 246 take_last reg_label_count labels 247 in 248 let reg_domain = labels_to_domain reg_labels has_trailing_dot in 249 Ok (reg_domain, prevailing.section) 250 end 251 252let registrable_domain t domain = 253 match registrable_domain_with_section t domain with 254 | Ok (domain, _) -> Ok domain 255 | Error e -> Error e 256 257let is_public_suffix t domain = 258 match normalize_domain domain with 259 | Error e -> Error e 260 | Ok (labels, _) -> 261 let rev_labels = List.rev labels in 262 let matches = find_matches t.root rev_labels in 263 let prevailing = select_prevailing_rule matches in 264 let suffix_label_count = 265 if prevailing.is_exception then 266 prevailing.matched_labels - 1 267 else 268 prevailing.matched_labels 269 in 270 (* Domain is a public suffix if it has exactly suffix_label_count labels *) 271 Ok (List.length labels = suffix_label_count) 272 273let is_registrable_domain t domain = 274 match normalize_domain domain with 275 | Error e -> Error e 276 | Ok (labels, _) -> 277 let rev_labels = List.rev labels in 278 let matches = find_matches t.root rev_labels in 279 let prevailing = select_prevailing_rule matches in 280 let suffix_label_count = 281 if prevailing.is_exception then 282 prevailing.matched_labels - 1 283 else 284 prevailing.matched_labels 285 in 286 let reg_label_count = suffix_label_count + 1 in 287 (* Domain is registrable if it has exactly reg_label_count labels *) 288 Ok (List.length labels = reg_label_count) 289 290let rule_count _t = Publicsuffix_data.rule_count 291let icann_rule_count _t = Publicsuffix_data.icann_rule_count 292let private_rule_count _t = Publicsuffix_data.private_rule_count