···
SPDX-License-Identifier: MIT
---------------------------------------------------------------------------*)
6
-
(** Crockford Base32 encoding for OCaml *)
6
+
(** Crockford Base32 encoding for OCaml
10
+
Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
11
+
for human-readable identifiers. It is particularly well-suited for use in URLs,
12
+
user-facing identifiers, and systems where humans need to transcribe or
13
+
communicate encoded values. It features:
16
+
{- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
17
+
letters that are easily confused: I, L, O, and U. This prevents common
18
+
transcription errors.}
19
+
{- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
20
+
during decoding, making it forgiving of human input.}
21
+
{- {b Confusable character mapping}: When decoding, the letters I and L are
22
+
automatically mapped to 1, and O is mapped to 0, further reducing
23
+
transcription errors.}
24
+
{- {b Hyphenation support}: Hyphens can be included for readability and are
25
+
automatically ignored during decoding.}
26
+
{- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
27
+
transcription errors. The checksum is encoded as two additional characters.}
28
+
{- {b URL-safe}: All characters in the encoding are safe for use in URLs
32
+
{2 The Encoding Alphabet}
34
+
The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
36
+
Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V
39
+
{2 Comparison with Other Encodings}
42
+
{- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
43
+
character set and case-insensitivity, though it produces slightly longer
44
+
strings (base32 uses 5 bits per character vs base64's 6 bits).}
45
+
{- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
46
+
only 4 bits per character) and includes more letters for better distribution.}
47
+
{- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
48
+
optimized for human readability with its character mappings and exclusions.}
54
+
(* Basic encoding *)
55
+
let id = encode 123456789L;;
56
+
(* Result: "3nqk8n" *)
58
+
(* Encoding with hyphenation for readability *)
59
+
let id = encode ~split_every:4 123456789L;;
60
+
(* Result: "3nqk-8n" *)
62
+
(* Encoding with checksum for error detection *)
63
+
let id = encode ~checksum:true 123456789L;;
64
+
(* Result: "3nqk8n87" (last two digits are checksum) *)
66
+
(* Generate a random 8-character identifier *)
67
+
Random.self_init ();;
68
+
let random_id = generate ~length:8 ();;
69
+
(* Result: something like "n4g9k2c7" *)
71
+
(* Generate with checksum and hyphenation *)
72
+
let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
73
+
(* Result: something like "a3k2x-9m4c82" *)
75
+
(* Decoding is case-insensitive and ignores hyphens *)
76
+
let n = decode "3NQK-8N";;
77
+
(* Result: 123456789L *)
79
+
(* Decode with checksum validation *)
80
+
let n = decode ~checksum:true "3nqk8n87";;
81
+
(* Result: 123456789L (or raises Decode_error if checksum invalid) *)
84
+
{1 API Documentation} *)
86
+
(** {1 ID Generation}
88
+
Generate random identifiers in Crockford base32 format. This is useful for
89
+
creating unique, human-readable IDs for databases, URLs, or user-facing
90
+
reference numbers. *)
96
+
?rng:(float -> float) ->
98
+
(** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string.
100
+
This function creates a random identifier by generating a random integer and
101
+
encoding it using the Crockford base32 alphabet. The generated IDs are suitable
102
+
for use as database keys, URL-safe identifiers, or user-visible reference numbers.
104
+
When using the default [Random.float] generator, you must initialize the
105
+
random number generator with {!Random.self_init} before calling this function.
107
+
@param length The target length of the generated string. When [checksum:false],
108
+
this is the exact output length. When [checksum:true], this is the
109
+
total length including the 2-character checksum, so the random
110
+
portion will be [length - 2] characters.
111
+
@param split_every Insert hyphens every N characters for improved readability.
112
+
For example, [~split_every:4] might produce [3a7k-m9n2].
113
+
Default: no splitting.
114
+
@param checksum Append a 2-character ISO 7064 checksum for error detection.
115
+
Useful when IDs will be manually transcribed. When [true],
116
+
the total output length (including checksum) will be [length].
118
+
@param rng Custom random number generator function that takes a float bound and
119
+
returns a random float between [0.] and [bound] (inclusive). This allows for
120
+
deterministic testing or custom entropy sources. Defaults to using {!Random.float}.
121
+
@raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3]
122
+
as at least 1 character is needed for the ID and 2 for the checksum. *)
···
val encoding_chars : string
50
-
(** The Crockford base32 encoding alphabet (excludes i, l, o, u) *)
167
+
(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]
52
-
(** {1 Encoding and Decoding} *)
169
+
This 32-character alphabet excludes I, L, O, and U to prevent confusion with
170
+
visually similar characters (1, 1, 0, and V). The alphabet is case-insensitive
171
+
for decoding but returned in lowercase by encoding functions. *)
173
+
(** {1 Encoding and Decoding}
175
+
The core encoding and decoding functions convert between 64-bit integers and
176
+
their Crockford base32 string representations. *)
···
(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.
60
-
@param split_every Split the output with '-' every n characters (default: no splitting)
61
-
@param min_length Pad with zeros to this minimum length (default: no padding)
62
-
@param checksum Append ISO 7064 checksum as 2 digits (default: false) *)
185
+
The function converts a 64-bit integer into a base32 representation using the
186
+
Crockford alphabet. The encoding process divides the number by 32 repeatedly,
187
+
using the remainder as an index into the alphabet.
189
+
@param split_every Insert hyphens every N characters for readability. For example,
190
+
[split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
191
+
are ignored during decoding. Default: no splitting.
192
+
@param min_length Pad the output with leading zeros to reach this minimum length.
193
+
When [checksum:true], the minimum length includes the 2-character
194
+
checksum. Default: no padding.
195
+
@param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
196
+
errors. The checksum is computed on the original number and encoded
197
+
as two additional base32 characters. Default: [false].
201
+
encode 0L;; (* "0" *)
202
+
encode 1234L;; (* "16j" *)
203
+
encode ~min_length:6 1234L;; (* "00016j" *)
204
+
encode ~split_every:3 123456L;; (* "3rj-0" *)
205
+
encode ~checksum:true 1234L;; (* "16j48" *)
206
+
encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0016-j448" *)
val decode : ?checksum:bool -> string -> int64
(** [decode ?checksum str] decodes a Crockford base32 string to int64.
66
-
@param checksum Expect and validate ISO 7064 checksum (default: false)
67
-
@raise Decode_error if decoding fails (invalid characters, invalid checksum format, or checksum mismatch) *)
69
-
(** {1 ID Generation} *)
212
+
The function is designed to be forgiving of human input:
213
+
- Case-insensitive: accepts both uppercase and lowercase letters
214
+
- Strips hyphens automatically
215
+
- Maps confusable characters: I/i and L/l → 1, O/o → 0
217
+
@param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
218
+
If [true], the function verifies that the checksum matches the
219
+
decoded value. Default: [false].
221
+
@raise Decode_error with one of the following variants:
222
+
- [Invalid_character] if an unrecognized character is encountered
223
+
- [Invalid_checksum] if [checksum:true] but the string is too short or checksum has invalid format
224
+
- [Checksum_mismatch] if the checksum doesn't match the decoded value
76
-
(** [generate ~length ?split_every ?checksum ()] generates a random Crockford base32 string.
77
-
@param length The length of the generated string (excluding checksum)
78
-
@param split_every Split the output with '-' every n characters (default: no splitting)
79
-
@param checksum Append ISO 7064 checksum as 2 digits (default: false)
80
-
@raise Decode_error if checksum is true and length < 3
228
+
decode "16j";; (* 1234L *)
229
+
decode "16J";; (* 1234L - case insensitive *)
230
+
decode "1-6-j";; (* 1234L - hyphens ignored *)
231
+
decode "I6j";; (* 1234L - 'I' mapped to '1' *)
232
+
decode ~checksum:true "16j48";; (* 1234L - with checksum validation *)
82
-
Note: Caller must initialize Random module with {!Random.self_init} before use *)
235
+
(** {1 Utility Functions}
84
-
(** {1 Utility Functions} *)
237
+
Low-level functions for working with Crockford base32 strings and checksums. *)
val normalize : string -> string
87
-
(** [normalize str] normalizes a string for decoding by converting to lowercase,
88
-
removing dashes, and mapping confusable characters (i→1, l→1, o→0) *)
240
+
(** [normalize str] normalizes a string for decoding.
242
+
This function prepares a potentially messy human input string for decoding by:
243
+
- Converting all characters to lowercase
244
+
- Removing all hyphens ([-])
245
+
- Mapping confusable characters: [I] and [L] → [1], [O] → [0]
247
+
This is automatically called by {!decode}, but is exposed for cases where
248
+
you want to normalize strings before storage or comparison.
252
+
normalize "ABC-123";; (* "abc123" *)
253
+
normalize "IlO";; (* "110" - confusables mapped *)
254
+
normalize "A-B-C";; (* "abc" - hyphens removed *)
255
+
normalize "HELLO";; (* "he110" - 'L's and 'O' mapped *)
val validate : int64 -> checksum:int64 -> bool
91
-
(** [validate n ~checksum] validates that a checksum matches the number *)
259
+
(** [validate n ~checksum] validates that a checksum matches the expected value for a number.
261
+
This function computes the ISO 7064 (mod 97-10) checksum for the given number
262
+
and compares it with the provided checksum value.
264
+
@param n The integer value to validate
265
+
@param checksum The expected checksum value (0-96)
266
+
@return [true] if the checksum is valid, [false] otherwise
270
+
let cs = generate_checksum 1234L in
271
+
validate 1234L ~checksum:cs;; (* true *)
272
+
validate 1234L ~checksum:99L;; (* false *)
val generate_checksum : int64 -> int64
94
-
(** [generate_checksum n] generates an ISO 7064 (mod 97-10) checksum for a number *)
276
+
(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number.
278
+
The ISO 7064 algorithm provides a checksum that can detect:
279
+
- All single-digit errors
280
+
- Most adjacent transposition errors
281
+
- Most twin errors (where two identical digits are replaced by two other identical digits)
283
+
The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits).
287
+
generate_checksum 0L;; (* 1L *)
288
+
generate_checksum 1234L;; (* 48L *)
289
+
generate_checksum 123456L;; (* 87L *)