(* OCaml library for Crockford's Base32
   at main 12 kB view raw *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Crockford Base32 encoding for OCaml

    {1 Overview}

    Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
    for human-readable identifiers. It is particularly well-suited for use in URLs,
    user-facing identifiers, and systems where humans need to transcribe or
    communicate encoded values.

    See the {{:https://www.crockford.com/base32.html}Crockford Base32 Specification}
    for complete details of the encoding scheme.

    It features:

    {ul
    {- {b Human-optimized alphabet}: Uses 32 symbols: the digits 0-9 and 22
       letters. I, L and O are left out because they are easily confused with
       1 and 0, and U is left out to avoid accidental obscenities. This prevents
       common transcription errors.}
    {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
       during decoding, making it forgiving of human input.}
    {- {b Confusable character mapping}: When decoding, the letters I and L are
       automatically mapped to 1, and O is mapped to 0, further reducing
       transcription errors.}
    {- {b Hyphenation support}: Hyphens can be included for readability and are
       automatically ignored during decoding.}
    {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
       transcription errors. The checksum is encoded as two additional characters.}
    {- {b URL-safe}: All characters in the encoding are safe for use in URLs
       without escaping.}
    }

    {2 The Encoding Alphabet}

    The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]

    Note the absence of I, L and O (easily confused with 1, 1 and 0
    respectively) and of U (excluded to avoid accidental obscenities).

    {2 Comparison with Other Encodings}

    {ul
    {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
       character set and case-insensitivity, though it produces slightly longer
       strings (base32 uses 5 bits per character vs base64's 6 bits).}
    {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
       only 4 bits per character) and includes more letters for better distribution.}
    {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
       optimized for human readability with its character mappings and exclusions.}
    }

    {1 Examples}

    {[
      (* Basic encoding *)
      let id = encode 123456789L;;
      (* Result: "3nqk8n" *)

      (* Encoding with hyphenation for readability *)
      let id = encode ~split_every:4 123456789L;;
      (* Result: "3nqk-8n" *)

      (* Encoding with checksum for error detection *)
      let id = encode ~checksum:true 123456789L;;
      (* Result: "3nqk8n" followed by the two checksum characters *)

      (* Generate a random 8-character identifier *)
      Random.self_init ();;
      let random_id = generate ~length:8 ();;
      (* Result: something like "n4g9k2c7" *)

      (* Generate with checksum and hyphenation *)
      let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
      (* Result: something like "a3k2x-9m482" *)

      (* Decoding is case-insensitive and ignores hyphens *)
      let n = decode "3NQK-8N";;
      (* Result: 123456789L *)

      (* Decode with checksum validation *)
      let n = decode ~checksum:true (encode ~checksum:true 123456789L);;
      (* Result: 123456789L (or raises Decode_error if checksum invalid) *)
    ]}

    {1 API Documentation} *)

(** {1 ID Generation}

    Generate random identifiers in Crockford base32 format. This is useful for
    creating unique, human-readable IDs for databases, URLs, or user-facing
    reference numbers.
*)

val generate :
  length:int ->
  ?split_every:int ->
  ?checksum:bool ->
  ?rng:(float -> float) ->
  unit -> string
(** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string.

    This function creates a random identifier by generating a random integer and
    encoding it using the Crockford base32 alphabet. The generated IDs are suitable
    for use as database keys, URL-safe identifiers, or user-visible reference numbers.

    When using the default [Random.float] generator, you must initialize the
    random number generator with [Random.self_init] before calling this function.

    @param length The target length of the generated string. When [checksum:false],
                  this is the exact output length. When [checksum:true], this is the
                  total length including the 2-character checksum, so the random
                  portion will be [length - 2] characters.
    @param split_every Insert hyphens every N characters for improved readability.
                       For example, [split_every]=[4] might produce [3a7k-m9n2].
                       Default: no splitting.
    @param checksum Append a 2-character ISO 7064 checksum for error detection.
                    Useful when IDs will be manually transcribed. When [true],
                    the total output length (including checksum) will be [length].
                    Default: [false].
    @param rng Custom random number generator function that takes a float bound and
               returns a random float between [0.] and that bound (inclusive). This
               allows for deterministic testing or custom entropy sources.
               Defaults to using [Random.float].
    @raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3]
           as at least 1 character is needed for the ID and 2 for the checksum.
*)


(** {1 Error Types} *)

type invalid_length = { length: int; message: string }
(** Error for invalid length parameters *)

type invalid_character = { char: char; message: string }
(** Error for invalid characters during decoding *)

type invalid_checksum = { checksum: string; message: string }
(** Error for invalid checksum format *)

type checksum_mismatch = { expected: int64; got: int64; identifier: string }
(** Error for checksum validation failures *)

(** Union of all possible decode errors *)
type decode_error =
  | Invalid_length of invalid_length
      (** A length parameter was invalid *)
  | Invalid_character of invalid_character
      (** An unrecognized character was encountered during decoding *)
  | Invalid_checksum of invalid_checksum
      (** The checksum is missing (string too short) or has an invalid format *)
  | Checksum_mismatch of checksum_mismatch
      (** The checksum does not match the decoded value *)

exception Decode_error of decode_error
(** Main exception raised for all decoding errors *)

val pp_invalid_length : Format.formatter -> invalid_length -> unit
(** Pretty-print an invalid_length error *)

val pp_invalid_character : Format.formatter -> invalid_character -> unit
(** Pretty-print an invalid_character error *)

val pp_invalid_checksum : Format.formatter -> invalid_checksum -> unit
(** Pretty-print an invalid_checksum error *)

val pp_checksum_mismatch : Format.formatter -> checksum_mismatch -> unit
(** Pretty-print a checksum_mismatch error *)

val pp_decode_error : Format.formatter -> decode_error -> unit
(** Pretty-print a decode_error *)

(** {1 Constants} *)

val encoding_chars : string
(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]

    This 32-character alphabet excludes I, L and O to prevent confusion with
    visually similar digits (1, 1 and 0), and U to avoid accidental obscenities.
    The alphabet is case-insensitive for decoding but returned in lowercase by
    encoding functions.
*)

(** {1 Encoding and Decoding}

    The core encoding and decoding functions convert between 64-bit integers and
    their Crockford base32 string representations. *)

val encode :
  ?split_every:int ->
  ?min_length:int ->
  ?checksum:bool ->
  int64 -> string
(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.

    The function converts a 64-bit integer into a base32 representation using the
    Crockford alphabet. The encoding process divides the number by 32 repeatedly,
    using the remainder as an index into the alphabet.

    @param split_every Insert hyphens every N characters for readability. For example,
                       [split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
                       are ignored during decoding. Default: no splitting.
    @param min_length Pad the output with leading zeros to reach this minimum length.
                      When [checksum:true], the minimum length includes the 2-character
                      checksum. Default: no padding.
    @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
                    errors. The checksum is computed on the original number and encoded
                    as two additional base32 characters. Default: [false].

    {b Examples:}
    {[
      encode 0L;;                                                  (* "0" *)
      encode 1234L;;                                               (* "16j" *)
      encode ~min_length:6 1234L;;                                 (* "00016j" *)
      encode ~split_every:3 123456L;;                              (* "3rj-0" *)
      encode ~checksum:true 1234L;;                                (* "16j48" *)
      encode ~min_length:8 ~checksum:true ~split_every:4 1234L;;   (* "0001-6j48" *)
    ]} *)

val decode : ?checksum:bool -> string -> int64
(** [decode ?checksum str] decodes a Crockford base32 string to int64.

    The function is designed to be forgiving of human input:
    - Case-insensitive: accepts both uppercase and lowercase letters
    - Strips hyphens automatically
    - Maps confusable characters: I/i and L/l → 1, O/o → 0

    @param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
                    If [true], the function verifies that the checksum matches the
                    decoded value. Default: [false].

    @raise Decode_error with one of the following variants:
           - [Invalid_character] if an unrecognized character is encountered
           - [Invalid_checksum] if [checksum:true] but the string is too short or checksum has invalid format
           - [Checksum_mismatch] if the checksum doesn't match the decoded value

    {b Examples:}
    {[
      decode "16j";;                    (* 1234L *)
      decode "16J";;                    (* 1234L - case insensitive *)
      decode "1-6-j";;                  (* 1234L - hyphens ignored *)
      decode "I6j";;                    (* 1234L - 'I' mapped to '1' *)
      decode ~checksum:true "16j48";;   (* 1234L - with checksum validation *)
    ]} *)

(** {1 Utility Functions}

    Low-level functions for working with Crockford base32 strings and checksums. *)

val normalize : string -> string
(** [normalize str] normalizes a string for decoding.

    This function prepares a potentially messy human input string for decoding by:
    - Converting all characters to lowercase
    - Removing all hyphens ([-])
    - Mapping confusable characters: [I] and [L] → [1], [O] → [0]

    This is automatically called by {!decode}, but is exposed for cases where
    you want to normalize strings before storage or comparison.

    {b Examples:}
    {[
      normalize "ABC-123";;   (* "abc123" *)
      normalize "IlO";;       (* "110" - confusables mapped *)
      normalize "A-B-C";;     (* "abc" - hyphens removed *)
      normalize "HELLO";;     (* "he110" - 'L's and 'O' mapped *)
    ]} *)

val validate : int64 -> checksum:int64 -> bool
(** [validate n ~checksum] validates that a checksum matches the expected value for a number.

    This function computes the ISO 7064 (mod 97-10) checksum for the given number
    and compares it with the provided checksum value.

    @param n The integer value to validate
    @param checksum The expected checksum value (0-96)
    @return [true] if the checksum is valid, [false] otherwise

    {b Examples:}
    {[
      let cs = generate_checksum 1234L in
      validate 1234L ~checksum:cs;;    (* true *)
      validate 1234L ~checksum:99L;;   (* false *)
    ]} *)

val generate_checksum : int64 -> int64
(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number.

    The ISO 7064 algorithm provides a checksum that can detect:
    - All single-digit errors
    - Most adjacent transposition errors
    - Most twin errors (where two identical digits are replaced by two other identical digits)

    The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits).

    {b Examples:}
    {[
      generate_checksum 0L;;        (* 1L *)
      generate_checksum 1234L;;     (* 48L *)
      generate_checksum 123456L;;   (* 87L *)
    ]} *)

(* NOTE(review): the checksum values quoted in the examples in this file
   (48 for 1234L, 87 for 123456L) could not be re-derived from this interface
   alone -- confirm them against the implementation of [generate_checksum]. *)