(* OCaml library for Crockford's Base32 (v1.0.0) *)
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
  SPDX-License-Identifier: MIT
 ---------------------------------------------------------------------------*)

(* NOTE(review): the example outputs in this interface were recomputed by hand
   from the algorithm documented below: base-32 digits are taken from
   [encoding_chars], and the checksum is assumed to be
   (98 - (100 * n mod 97)) mod 97, appended as two decimal digits. This
   formula is consistent with the documented range 0-96 and with
   [generate_checksum 0L = 1L]. The implementation is not visible from this
   file, so confirm the checksum examples against the test suite. *)

(** Crockford Base32 encoding for OCaml

    {1 Overview}

    Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
    for human-readable identifiers. It is particularly well-suited for use in
    URLs, user-facing identifiers, and systems where humans need to transcribe
    or communicate encoded values. It features:

    {ul
    {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
       letters that are easily confused: I, L, O, and U. This prevents common
       transcription errors.}
    {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
       during decoding, making it forgiving of human input.}
    {- {b Confusable character mapping}: When decoding, the letters I and L are
       automatically mapped to 1, and O is mapped to 0, further reducing
       transcription errors.}
    {- {b Hyphenation support}: Hyphens can be included for readability and are
       automatically ignored during decoding.}
    {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
       transcription errors. The checksum is appended as two additional
       decimal digits.}
    {- {b URL-safe}: All characters in the encoding are safe for use in URLs
       without escaping.}
    }

    {2 The Encoding Alphabet}

    The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
    (the encoding functions of this library emit it in lowercase, see
    {!encoding_chars}).

    Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V
    respectively.

    {2 Comparison with Other Encodings}

    {ul
    {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
       character set and case-insensitivity, though it produces slightly longer
       strings (base32 uses 5 bits per character vs base64's 6 bits).}
    {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
       only 4 bits per character) and includes more letters for better
       distribution.}
    {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
       optimized for human readability with its character mappings and
       exclusions.}
    }

    {1 Examples}

    {[
      (* Basic encoding *)
      let id = encode 123456789L;;
      (* Result: "3nqk8n" *)

      (* Encoding with hyphenation for readability *)
      let id = encode ~split_every:4 123456789L;;
      (* Result: "3nqk-8n" *)

      (* Encoding with checksum for error detection *)
      let id = encode ~checksum:true 123456789L;;
      (* Result: "3nqk8n78" (last two digits are checksum) *)

      (* Generate a random 8-character identifier *)
      Random.self_init ();;
      let random_id = generate ~length:8 ();;
      (* Result: something like "n4g9k2c7" *)

      (* Generate with checksum and hyphenation: 10 characters in total,
         8 random ones followed by the 2-digit checksum *)
      let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
      (* Result: something like "a3k2x-9m482" *)

      (* Decoding is case-insensitive and ignores hyphens *)
      let n = decode "3NQK-8N";;
      (* Result: 123456789L *)

      (* Decode with checksum validation *)
      let n = decode ~checksum:true "3nqk8n78";;
      (* Result: 123456789L (or raises Decode_error if checksum invalid) *)
    ]}

    {1 API Documentation} *)

(** {1 ID Generation}

    Generate random identifiers in Crockford base32 format. This is useful for
    creating unique, human-readable IDs for databases, URLs, or user-facing
    reference numbers. *)

val generate :
  length:int ->
  ?split_every:int ->
  ?checksum:bool ->
  ?rng:(float -> float) ->
  unit -> string
(** [generate ~length ?split_every ?checksum ?rng ()] generates a random
    Crockford base32 string.

    This function creates a random identifier by generating a random integer
    and encoding it using the Crockford base32 alphabet. The generated IDs are
    suitable for use as database keys, URL-safe identifiers, or user-visible
    reference numbers.

    When using the default [Random.float] generator, you must initialize the
    random number generator with {!Random.self_init} before calling this
    function.

    @param length The target length of the generated string. When
                  [checksum:false], this is the exact output length. When
                  [checksum:true], this is the total length including the
                  2-character checksum, so the random portion will be
                  [length - 2] characters.
    @param split_every Insert hyphens every N characters for improved
                       readability. For example, [~split_every:4] might
                       produce [3a7k-m9n2]. Default: no splitting.
    @param checksum Append a 2-character ISO 7064 checksum for error detection.
                    Useful when IDs will be manually transcribed. When [true],
                    the total output length (including checksum) will be
                    [length]. Default: [false].
    @param rng Custom random number generator function that takes a float
               [bound] and returns a random float between [0.] and [bound]
               (inclusive). This allows for deterministic testing or custom
               entropy sources. Defaults to using {!Random.float}.
    @raise Decode_error with [Invalid_length] if [checksum] is [true] and
           [length < 3], as at least 1 character is needed for the ID and 2
           for the checksum. *)


(** {1 Error Types} *)

type invalid_length = { length: int; message: string }
(** Error for invalid length parameters *)

type invalid_character = { char: char; message: string }
(** Error for invalid characters during decoding *)

type invalid_checksum = { checksum: string; message: string }
(** Error for invalid checksum format *)

type checksum_mismatch = { expected: int64; got: int64; identifier: string }
(** Error for checksum validation failures *)

type decode_error =
  | Invalid_length of invalid_length
  | Invalid_character of invalid_character
  | Invalid_checksum of invalid_checksum
  | Checksum_mismatch of checksum_mismatch
(** Union of all possible decode errors *)

exception Decode_error of decode_error
(** Main exception raised for all decoding errors *)

val pp_invalid_length : Format.formatter -> invalid_length -> unit
(** Pretty-print an invalid_length error *)

val pp_invalid_character : Format.formatter -> invalid_character -> unit
(** Pretty-print an invalid_character error *)

val pp_invalid_checksum : Format.formatter -> invalid_checksum -> unit
(** Pretty-print an invalid_checksum error *)

val pp_checksum_mismatch : Format.formatter -> checksum_mismatch -> unit
(** Pretty-print a checksum_mismatch error *)

val pp_decode_error : Format.formatter -> decode_error -> unit
(** Pretty-print a decode_error *)

(** {1 Constants} *)

val encoding_chars : string
(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]

    This 32-character alphabet excludes I, L, O, and U to prevent confusion with
    visually similar characters (1, 1, 0, and V). The alphabet is
    case-insensitive for decoding but returned in lowercase by encoding
    functions. *)

(** {1 Encoding and Decoding}

    The core encoding and decoding functions convert between 64-bit integers and
    their Crockford base32 string representations. *)

val encode :
  ?split_every:int ->
  ?min_length:int ->
  ?checksum:bool ->
  int64 -> string
(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a
    Crockford base32 string.

    The function converts a 64-bit integer into a base32 representation using
    the Crockford alphabet. The encoding process divides the number by 32
    repeatedly, using the remainder as an index into the alphabet.

    @param split_every Insert hyphens every N characters for readability. For
                       example, [split_every:4] converts ["abcd1234"] to
                       ["abcd-1234"]. Hyphens are ignored during decoding.
                       Default: no splitting.
    @param min_length Pad the output with leading zeros to reach this minimum
                      length. When [checksum:true], the minimum length includes
                      the 2-character checksum. Default: no padding.
    @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect
                    transcription errors. The checksum is computed on the
                    original number and appended as two decimal digits.
                    Default: [false].

    {b Examples:}
    {[
      encode 0L;;                                          (* "0" *)
      encode 1234L;;                                       (* "16j" *)
      encode ~min_length:6 1234L;;                         (* "00016j" *)
      encode ~split_every:3 123456L;;                      (* "3rj-0" *)
      encode ~checksum:true 1234L;;                        (* "16j82" *)
      encode ~min_length:8 ~checksum:true ~split_every:4 1234L;;
                                                           (* "0001-6j82" *)
    ]} *)

val decode : ?checksum:bool -> string -> int64
(** [decode ?checksum str] decodes a Crockford base32 string to int64.

    The function is designed to be forgiving of human input:
    - Case-insensitive: accepts both uppercase and lowercase letters
    - Strips hyphens automatically
    - Maps confusable characters: I/i and L/l → 1, O/o → 0

    @param checksum Expect and validate the last 2 characters as an ISO 7064
                    checksum. If [true], the function verifies that the
                    checksum matches the decoded value. Default: [false].

    @raise Decode_error with one of the following variants:
    - [Invalid_character] if an unrecognized character is encountered
    - [Invalid_checksum] if [checksum:true] but the string is too short or
      checksum has invalid format
    - [Checksum_mismatch] if the checksum doesn't match the decoded value

    {b Examples:}
    {[
      decode "16j";;                   (* 1234L *)
      decode "16J";;                   (* 1234L - case insensitive *)
      decode "1-6-j";;                 (* 1234L - hyphens ignored *)
      decode "I6j";;                   (* 1234L - 'I' mapped to '1' *)
      decode ~checksum:true "16j82";;  (* 1234L - with checksum validation *)
    ]} *)

(** {1 Utility Functions}

    Low-level functions for working with Crockford base32 strings and
    checksums. *)

val normalize : string -> string
(** [normalize str] normalizes a string for decoding.

    This function prepares a potentially messy human input string for decoding
    by:
    - Converting all characters to lowercase
    - Removing all hyphens ([-])
    - Mapping confusable characters: [I] and [L] → [1], [O] → [0]

    This is automatically called by {!decode}, but is exposed for cases where
    you want to normalize strings before storage or comparison.

    {b Examples:}
    {[
      normalize "ABC-123";;  (* "abc123" *)
      normalize "IlO";;      (* "110" - confusables mapped *)
      normalize "A-B-C";;    (* "abc" - hyphens removed *)
      normalize "HELLO";;    (* "he110" - 'L's and 'O' mapped *)
    ]} *)

val validate : int64 -> checksum:int64 -> bool
(** [validate n ~checksum] validates that a checksum matches the expected value
    for a number.

    This function computes the ISO 7064 (mod 97-10) checksum for the given
    number and compares it with the provided checksum value.

    @param n The integer value to validate
    @param checksum The expected checksum value (0-96)
    @return [true] if the checksum is valid, [false] otherwise

    {b Examples:}
    {[
      let cs = generate_checksum 1234L in
      validate 1234L ~checksum:cs;;   (* true *)
      validate 1234L ~checksum:99L;;  (* false *)
    ]} *)

val generate_checksum : int64 -> int64
(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a
    number.

    The ISO 7064 algorithm provides a checksum that can detect:
    - All single-digit errors
    - Most adjacent transposition errors
    - Most twin errors (where two identical digits are replaced by two other
      identical digits)

    The checksum value is in the range 0-96 (or 00-96 when formatted with 2
    digits).

    {b Examples:}
    {[
      generate_checksum 0L;;       (* 1L *)
      generate_checksum 1234L;;    (* 82L *)
      generate_checksum 123456L;;  (* 76L *)
    ]} *)