OCaml library for Crockford's Base32
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Crockford Base32 encoding for OCaml
7
8 {1 Overview}
9
10 Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
11 for human-readable identifiers. It is particularly well-suited for use in URLs,
12 user-facing identifiers, and systems where humans need to transcribe or
13 communicate encoded values. It features:
14
15 {ul
16 {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
17 letters that are easily confused: I, L, O, and U. This prevents common
18 transcription errors.}
19 {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
20 during decoding, making it forgiving of human input.}
21 {- {b Confusable character mapping}: When decoding, the letters I and L are
22 automatically mapped to 1, and O is mapped to 0, further reducing
23 transcription errors.}
24 {- {b Hyphenation support}: Hyphens can be included for readability and are
25 automatically ignored during decoding.}
26 {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
27 transcription errors. The checksum is encoded as two additional characters.}
28 {- {b URL-safe}: All characters in the encoding are safe for use in URLs
29 without escaping.}
30 }
31
32 {2 The Encoding Alphabet}
33
34 The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
35
36 Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V
37 respectively.
38
39 {2 Comparison with Other Encodings}
40
41 {ul
42 {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
43 character set and case-insensitivity, though it produces slightly longer
44 strings (base32 uses 5 bits per character vs base64's 6 bits).}
45 {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
46 only 4 bits per character) and includes more letters for better distribution.}
47 {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
48 optimized for human readability with its character mappings and exclusions.}
49 }
50
51 {1 Examples}
52
53 {[
54 (* Basic encoding *)
55 let id = encode 123456789L;;
56 (* Result: "3nqk8n" *)
57
58 (* Encoding with hyphenation for readability *)
59 let id = encode ~split_every:4 123456789L;;
60 (* Result: "3nqk-8n" *)
61
62 (* Encoding with checksum for error detection *)
63 let id = encode ~checksum:true 123456789L;;
64 (* Result: "3nqk8n" followed by the two checksum characters *)
65
66 (* Generate a random 8-character identifier *)
67 Random.self_init ();;
68 let random_id = generate ~length:8 ();;
69 (* Result: something like "n4g9k2c7" *)
70
71 (* Generate with checksum and hyphenation *)
72 let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
73 (* Result: something like "a3k2x-9m482" (10 characters including the checksum) *)
74
75 (* Decoding is case-insensitive and ignores hyphens *)
76 let n = decode "3NQK-8N";;
77 (* Result: 123456789L *)
78
79 (* Decode with checksum validation *)
80 let n = decode ~checksum:true (encode ~checksum:true 123456789L);;
81 (* Result: 123456789L; a string whose checksum does not match raises Decode_error *)
82 ]}
83
84 {1 API Documentation} *)
85
86(** {1 ID Generation}
87
88 Generate random identifiers in Crockford base32 format. This is useful for
89 creating unique, human-readable IDs for databases, URLs, or user-facing
90 reference numbers. *)
91
92val generate :
93 length:int ->
94 ?split_every:int ->
95 ?checksum:bool ->
96 ?rng:(float -> float) ->
97 unit -> string
98(** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string.
99
100 This function creates a random identifier by generating a random integer and
101 encoding it using the Crockford base32 alphabet. The generated IDs are suitable
102 for use as database keys, URL-safe identifiers, or user-visible reference numbers.
103
104 When using the default {!Random.float} generator, seed it first (for example with
105 {!Random.self_init}); an unseeded generator repeats the same sequence on every run.
106
107 @param length The target length of the generated string. When [~checksum:false],
108 this is the exact output length. When [~checksum:true], this is the
109 total length including the 2-character checksum, so the random
110 portion will be [length - 2] characters.
111 @param split_every Insert hyphens every N characters for improved readability.
112 For example, [~split_every:4] might produce [3a7k-m9n2].
113 Default: no splitting.
114 @param checksum Append a 2-character ISO 7064 checksum for error detection.
115 Useful when IDs will be manually transcribed. When [true],
116 the total output length (including checksum) will be [length].
117 Default: [false].
118 @param rng Custom random number generator function that takes a float bound and
119 returns a random float between [0.] and that bound (inclusive). This allows for
120 deterministic testing or custom entropy sources. Defaults to using {!Random.float}.
121 @raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3]
122 as at least 1 character is needed for the ID and 2 for the checksum. *)
123
124
125(** {1 Error Types} *)
126
127type invalid_length = { length: int; message: string }
128(** Error for invalid length parameters; carried by the [Invalid_length] constructor *)
129
130type invalid_character = { char: char; message: string }
131(** Error for invalid characters during decoding; carried by the [Invalid_character] constructor *)
132
133type invalid_checksum = { checksum: string; message: string }
134(** Error for invalid checksum format; carried by the [Invalid_checksum] constructor *)
135
136type checksum_mismatch = { expected: int64; got: int64; identifier: string }
137(** Error for checksum validation failures; carried by the [Checksum_mismatch] constructor *)
138
139type decode_error =
140 | Invalid_length of invalid_length
141 | Invalid_character of invalid_character
142 | Invalid_checksum of invalid_checksum
143 | Checksum_mismatch of checksum_mismatch
144(** Union of all possible decode errors; this is the payload of {!Decode_error} *)
145
146exception Decode_error of decode_error
147(** Main exception raised for all decoding errors, and by {!generate} for an invalid [length] *)
148
149val pp_invalid_length : Format.formatter -> invalid_length -> unit
150(** [pp_invalid_length ppf e] pretty-prints the invalid_length error [e] on [ppf] (usable with [%a]) *)
151
152val pp_invalid_character : Format.formatter -> invalid_character -> unit
153(** [pp_invalid_character ppf e] pretty-prints the invalid_character error [e] on [ppf] (usable with [%a]) *)
154
155val pp_invalid_checksum : Format.formatter -> invalid_checksum -> unit
156(** [pp_invalid_checksum ppf e] pretty-prints the invalid_checksum error [e] on [ppf] (usable with [%a]) *)
157
158val pp_checksum_mismatch : Format.formatter -> checksum_mismatch -> unit
159(** [pp_checksum_mismatch ppf e] pretty-prints the checksum_mismatch error [e] on [ppf] (usable with [%a]) *)
160
161val pp_decode_error : Format.formatter -> decode_error -> unit
162(** [pp_decode_error ppf e] pretty-prints any decode_error [e] on [ppf] (usable with [%a]) *)
163
164(** {1 Constants} *)
165
166val encoding_chars : string
167(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]
168
169 This 32-character alphabet excludes I, L, O, and U to prevent confusion with
170 visually similar characters (1, 1, 0, and V). The character at index [i] stands for
171 digit value [i]. Decoding accepts either case; encoding functions produce lowercase. *)
172
173(** {1 Encoding and Decoding}
174
175 The core encoding and decoding functions convert between 64-bit integers and
176 their Crockford base32 string representations. *)
177
178val encode :
179 ?split_every:int ->
180 ?min_length:int ->
181 ?checksum:bool ->
182 int64 -> string
183(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.
184
185 The function converts a 64-bit integer into a base32 representation using the
186 Crockford alphabet. The encoding process divides the number by 32 repeatedly,
187 using the remainder as an index into the alphabet.
188
189 @param split_every Insert hyphens every N characters for readability. For example,
190 [~split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
191 are ignored during decoding. Default: no splitting.
192 @param min_length Pad the output with leading zeros to reach this minimum length.
193 When [~checksum:true], the minimum length includes the 2-character
194 checksum. Default: no padding.
195 @param checksum Append a 2-character ISO 7064 (mod 97-10) checksum to detect transcription
196 errors. The checksum is computed on the original number and appended
197 as two additional characters. Default: [false].
198
199 {b Examples:}
200 {[
201 encode 0L;; (* "0" *)
202 encode 1234L;; (* "16j" *)
203 encode ~min_length:6 1234L;; (* "00016j" *)
204 encode ~split_every:3 123456L;; (* "3rj-0" *)
205 encode ~checksum:true 1234L;; (* "16j48" *)
206 encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0001-6j48" *)
207 ]} *)
(* NOTE(review): the checksum digits "48" in the examples above are carried over
   from the existing documentation and were not re-derived; confirm them against
   the implementation of generate_checksum. *)
208
209val decode : ?checksum:bool -> string -> int64
210(** [decode ?checksum str] decodes a Crockford base32 string to int64.
211
212 The input is normalized first (see {!normalize}), so decoding is forgiving of human input:
213 - Case-insensitive: accepts both uppercase and lowercase letters
214 - Strips hyphens automatically
215 - Maps confusable characters: I/i and L/l → 1, O/o → 0
216
217 @param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
218 If [true], the function verifies that the checksum matches the
219 decoded value. Default: [false].
220
221 @raise Decode_error with one of the following variants:
222 - [Invalid_character] if an unrecognized character is encountered
223 - [Invalid_checksum] if [~checksum:true] but the string is too short or checksum has invalid format
224 - [Checksum_mismatch] if [~checksum:true] and the checksum doesn't match the decoded value
225
226 {b Examples:}
227 {[
228 decode "16j";; (* 1234L *)
229 decode "16J";; (* 1234L - case insensitive *)
230 decode "1-6-j";; (* 1234L - hyphens ignored *)
231 decode "I6j";; (* 1234L - 'I' mapped to '1' *)
232 decode ~checksum:true "16j48";; (* 1234L - with checksum validation *)
233 ]} *)
234
235(** {1 Utility Functions}
236
237 Low-level functions for working with Crockford base32 strings and checksums. *)
238
239val normalize : string -> string
240(** [normalize str] normalizes a string for decoding.
241
242 This function prepares a potentially messy human input string for decoding by:
243 - Converting all characters to lowercase
244 - Removing all hyphens ([-])
245 - Mapping confusable characters in either case: [I]/[i] and [L]/[l] → [1], [O]/[o] → [0]
246
247 This is automatically called by {!decode}, but is exposed for cases where
248 you want to normalize strings before storage or comparison.
249
250 {b Examples:}
251 {[
252 normalize "ABC-123";; (* "abc123" *)
253 normalize "IlO";; (* "110" - confusables mapped *)
254 normalize "A-B-C";; (* "abc" - hyphens removed *)
255 normalize "HELLO";; (* "he110" - 'L's and 'O' mapped *)
256 ]} *)
257
258val validate : int64 -> checksum:int64 -> bool
259(** [validate n ~checksum] checks whether [checksum] is the correct checksum for the number [n].
260
261 This function computes the ISO 7064 (mod 97-10) checksum for the given number
262 and compares it with the provided checksum value.
263
264 @param n The integer value to validate
265 @param checksum The expected checksum value (0-96), e.g. as returned by {!generate_checksum}
266 @return [true] if [checksum] matches the checksum computed for [n], [false] otherwise
267
268 {b Examples:}
269 {[
270 let cs = generate_checksum 1234L in
271 validate 1234L ~checksum:cs;; (* true *)
272 validate 1234L ~checksum:99L;; (* false *)
273 ]} *)
274
275val generate_checksum : int64 -> int64
276(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for the number [n].
277
278 The ISO 7064 algorithm provides a checksum that can detect:
279 - All single-digit errors
280 - Most adjacent transposition errors
281 - Most twin errors (where two identical digits are replaced by two other identical digits)
282
283 The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits).
284
285 {b Examples:}
286 {[
287 generate_checksum 0L;; (* 1L *)
288 generate_checksum 1234L;; (* 48L *)
289 generate_checksum 123456L;; (* 87L *)
290 ]} *)
(* NOTE(review): the example values above could not be re-derived from this
   interface. Under the usual ISO 7064 MOD 97-10 rule, (98 - (100 * n mod 97))
   reduced mod 97, the three results would be 1L, 82L and 76L respectively, so
   the second and third examples look suspect - confirm against the
   implementation before relying on them. *)