OCaml library for Crockford's Base32
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Crockford Base32 encoding for OCaml
7
8 {1 Overview}
9
10 Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
11 for human-readable identifiers. It is particularly well-suited for use in URLs,
12 user-facing identifiers, and systems where humans need to transcribe or
13 communicate encoded values.
14
15 See the {{:https://www.crockford.com/base32.html}Crockford Base32 Specification}
16 for complete details of the encoding scheme.
17
18 It features:
19
20 {ul
21 {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
22 letters that are easily confused: I, L, O, and U. This prevents common
23 transcription errors.}
24 {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
25 during decoding, making it forgiving of human input.}
26 {- {b Confusable character mapping}: When decoding, the letters I and L are
27 automatically mapped to 1, and O is mapped to 0, further reducing
28 transcription errors.}
29 {- {b Hyphenation support}: Hyphens can be included for readability and are
30 automatically ignored during decoding.}
31 {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
32 transcription errors. The checksum is encoded as two additional characters.}
33 {- {b URL-safe}: All characters in the encoding are safe for use in URLs
34 without escaping.}
35 }
36
37 {2 The Encoding Alphabet}
38
39 The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
40
41 Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V
42 respectively.
43
44 {2 Comparison with Other Encodings}
45
46 {ul
47 {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
48 character set and case-insensitivity, though it produces slightly longer
49 strings (base32 uses 5 bits per character vs base64's 6 bits).}
50 {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
51 only 4 bits per character) and includes more letters for better distribution.}
52 {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
53 optimized for human readability with its character mappings and exclusions.}
54 }
55
56 {1 Examples}
57
58 {[
59 (* Basic encoding *)
60 let id = encode 123456789L;;
61 (* Result: "3nqk8n" *)
62
63 (* Encoding with hyphenation for readability *)
64 let id = encode ~split_every:4 123456789L;;
65 (* Result: "3nqk-8n" *)
66
67 (* Encoding with checksum for error detection *)
68 let id = encode ~checksum:true 123456789L;;
69 (* Result: "3nqk8n" followed by the two checksum characters *)
70
71 (* Generate a random 8-character identifier *)
72 Random.self_init ();;
73 let random_id = generate ~length:8 ();;
74 (* Result: something like "n4g9k2c7" *)
75
76 (* Generate with checksum and hyphenation *)
77 let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
78 (* Result: something like "a3k2x-9m482" *)
79
80 (* Decoding is case-insensitive and ignores hyphens *)
81 let n = decode "3NQK-8N";;
82 (* Result: 123456789L *)
83
84 (* Decode with checksum validation *)
85 let n = decode ~checksum:true (encode ~checksum:true 123456789L);;
86 (* Result: 123456789L (or raises Decode_error if checksum invalid) *)
87 ]}
88
89 {1 API Documentation} *)
90
91(** {1 ID Generation}
92
93 Generate random identifiers in Crockford base32 format. This is useful for
94 creating unique, human-readable IDs for databases, URLs, or user-facing
95 reference numbers. *)
96
97val generate :
98 length:int ->
99 ?split_every:int ->
100 ?checksum:bool ->
101 ?rng:(float -> float) ->
102 unit -> string
103(** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string.
104
105 This function creates a random identifier by generating a random integer and
106 encoding it using the Crockford base32 alphabet. The generated IDs are suitable
107 for use as database keys, URL-safe identifiers, or user-visible reference numbers.
108
109 When using the default [Random.float] generator, you must initialize the
110 random number generator with [Random.self_init] before calling this function.
111
112 @param length The target length of the generated string. When [checksum] is [false],
113 this is the exact output length. When [checksum] is [true], this is the
114 total length including the 2-character checksum, so the random
115 portion will be [length - 2] characters.
116 @param split_every Insert hyphens every N characters for improved readability.
117 For example, [~split_every:4] might produce [3a7k-m9n2].
118 Default: no splitting.
119 @param checksum Append a 2-character ISO 7064 checksum for error detection.
120 Useful when IDs will be manually transcribed. When [true],
121 the total output length (including checksum) will be [length].
122 Default: [false].
123 @param rng Custom random number generator function that takes a float bound and
124 returns a random float in the range [0,bound]. This allows for
125 deterministic testing or custom entropy sources. Defaults to using [Random.float].
126 @raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3],
127 as at least 1 character is needed for the ID and 2 for the checksum. See {!Decode_error}. *)
128
129
130(** {1 Error Types} *)
131
132type invalid_length = { length: int; message: string }
133(** Payload of the [Invalid_length] error: the [length] that was rejected and an explanatory [message]. *)
134
135type invalid_character = { char: char; message: string }
136(** Payload of the [Invalid_character] error raised during decoding: the offending [char] and an explanatory [message]. *)
137
138type invalid_checksum = { checksum: string; message: string }
139(** Payload of the [Invalid_checksum] error (badly formatted checksum): the [checksum] text and an explanatory [message]. *)
140
141type checksum_mismatch = { expected: int64; got: int64; identifier: string }
142(** Payload of the [Checksum_mismatch] error (checksum validation failure): the [expected] and [got] values that disagreed, and the [identifier] concerned. *)
143
144type decode_error =
145 | Invalid_length of invalid_length
146 | Invalid_character of invalid_character
147 | Invalid_checksum of invalid_checksum
148 | Checksum_mismatch of checksum_mismatch
149(** Union of all possible decode errors; each constructor wraps the record describing that failure. Carried by {!Decode_error}. *)
150
151exception Decode_error of decode_error
152(** Main exception raised for all decoding errors; its argument is a {!decode_error} saying which failure occurred. *)
153
154val pp_invalid_length : Format.formatter -> invalid_length -> unit
155(** [pp_invalid_length ppf e] pretty-prints the {!invalid_length} error [e] on the formatter [ppf]. *)
156
157val pp_invalid_character : Format.formatter -> invalid_character -> unit
158(** [pp_invalid_character ppf e] pretty-prints the {!invalid_character} error [e] on the formatter [ppf]. *)
159
160val pp_invalid_checksum : Format.formatter -> invalid_checksum -> unit
161(** [pp_invalid_checksum ppf e] pretty-prints the {!invalid_checksum} error [e] on the formatter [ppf]. *)
162
163val pp_checksum_mismatch : Format.formatter -> checksum_mismatch -> unit
164(** [pp_checksum_mismatch ppf e] pretty-prints the {!checksum_mismatch} error [e] on the formatter [ppf]. *)
165
166val pp_decode_error : Format.formatter -> decode_error -> unit
167(** [pp_decode_error ppf e] pretty-prints the {!decode_error} [e] on the formatter [ppf]. *)
168
169(** {1 Constants} *)
170
171val encoding_chars : string
172(** The Crockford base32 encoding alphabet, in lowercase: ["0123456789abcdefghjkmnpqrstvwxyz"]
173
174 This 32-character alphabet excludes I, L, O, and U to prevent confusion with
175 visually similar characters (1, 1, 0, and V). Decoding accepts either case,
176 but the encoding functions return lowercase characters. *)
177
178(** {1 Encoding and Decoding}
179
180 The core encoding and decoding functions convert between 64-bit integers and
181 their Crockford base32 string representations. *)
182
183val encode :
184 ?split_every:int ->
185 ?min_length:int ->
186 ?checksum:bool ->
187 int64 -> string
188(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.
189
190 The function converts a 64-bit integer into a base32 representation using the
191 Crockford alphabet. The encoding process divides the number by 32 repeatedly,
192 using the remainder as an index into the alphabet (see {!encoding_chars}).
193
194 @param split_every Insert hyphens every N characters for readability. For example,
195 [~split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
196 are ignored during decoding. Default: no splitting.
197 @param min_length Pad the output with leading zeros to reach this minimum length.
198 When [checksum] is [true], the minimum length includes the 2-character
199 checksum. Default: no padding.
200 @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
201 errors. The checksum is computed on the original number and appended
202 as two additional characters. Default: [false].
203
204 {b Examples:}
205 {[
206 encode 0L;; (* "0" *)
207 encode 1234L;; (* "16j" *)
208 encode ~min_length:6 1234L;; (* "00016j" *)
209 encode ~split_every:3 123456L;; (* "3rj-0" *)
210 encode ~checksum:true 1234L;; (* "16j48" *)
211 encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0001-6j48" *)
212 ]} *)
213
214val decode : ?checksum:bool -> string -> int64
215(** [decode ?checksum str] decodes a Crockford base32 string to int64.
216
217 The function is designed to be forgiving of human input (see {!normalize}):
218 - Case-insensitive: accepts both uppercase and lowercase letters
219 - Strips hyphens automatically
220 - Maps confusable characters: I/i and L/l → 1, O/o → 0
221
222 @param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
223 If [true], the function verifies that the checksum matches the
224 decoded value. Default: [false].
225
226 @raise Decode_error with one of the following {!decode_error} variants:
227 - [Invalid_character] if an unrecognized character is encountered
228 - [Invalid_checksum] if [checksum] is [true] but the string is too short or the checksum has an invalid format
229 - [Checksum_mismatch] if the checksum does not match the decoded value
230
231 {b Examples:}
232 {[
233 decode "16j";; (* 1234L *)
234 decode "16J";; (* 1234L - case insensitive *)
235 decode "1-6-j";; (* 1234L - hyphens ignored *)
236 decode "I6j";; (* 1234L - 'I' mapped to '1' *)
237 decode ~checksum:true "16j48";; (* 1234L - with checksum validation *)
238 ]} *)
239
240(** {1 Utility Functions}
241
242 Low-level functions for working with Crockford base32 strings and checksums. *)
243
244val normalize : string -> string
245(** [normalize str] normalizes a string for decoding.
246
247 This function prepares a potentially messy human input string for decoding by:
248 - Converting all characters to lowercase
249 - Removing all hyphens ([-])
250 - Mapping confusable characters in either case: [I]/[i] and [L]/[l] → [1], [O]/[o] → [0]
251
252 This is automatically called by {!decode}, but is exposed for cases where
253 you want to normalize strings before storage or comparison.
254
255 {b Examples:}
256 {[
257 normalize "ABC-123";; (* "abc123" *)
258 normalize "IlO";; (* "110" - confusables mapped *)
259 normalize "A-B-C";; (* "abc" - hyphens removed *)
260 normalize "HELLO";; (* "he110" - 'L's and 'O' mapped *)
261 ]} *)
262
263val validate : int64 -> checksum:int64 -> bool
264(** [validate n ~checksum] checks whether [checksum] is the correct checksum for the number [n].
265
266 This function computes the ISO 7064 (mod 97-10) checksum for the given number
267 (as {!generate_checksum} does) and compares it with the provided checksum value.
268
269 @param n The integer value to validate
270 @param checksum The expected checksum value (0-96)
271 @return [true] if the checksum is valid, [false] otherwise
272
273 {b Examples:}
274 {[
275 let cs = generate_checksum 1234L in
276 validate 1234L ~checksum:cs;; (* true *)
277 validate 1234L ~checksum:99L;; (* false *)
278 ]} *)
279
280val generate_checksum : int64 -> int64
281(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number.
282
283 The ISO 7064 algorithm provides a checksum that can detect:
284 - All single-digit errors
285 - Most adjacent transposition errors
286 - Most twin errors (where two identical digits are replaced by two other identical digits)
287
288 The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits).
289
290 {b Examples:}
291 {[
292 generate_checksum 0L;; (* 1L *)
293 generate_checksum 1234L;; (* 48L - NOTE(review): value not verified, confirm against the implementation *)
294 generate_checksum 123456L;; (* 87L - NOTE(review): value not verified, confirm against the implementation *)
295 ]} *)