···
SPDX-License-Identifier: MIT
---------------------------------------------------------------------------*)
6
-
(** Crockford Base32 encoding for OCaml *)
6
+
(** Crockford Base32 encoding for OCaml
10
+
Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
11
+
for human-readable identifiers. It is particularly well-suited for use in URLs,
12
+
user-facing identifiers, and systems where humans need to transcribe or
13
+
communicate encoded values.
15
+
See the {{:https://www.crockford.com/base32.html}Crockford Base32 Specification}
16
+
for complete details of the encoding scheme.
21
+
{- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
22
+
letters that are easily confused: I, L, O, and U. This prevents common
23
+
transcription errors.}
24
+
{- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
25
+
during decoding, making it forgiving of human input.}
26
+
{- {b Confusable character mapping}: When decoding, the letters I and L are
27
+
automatically mapped to 1, and O is mapped to 0, further reducing
28
+
transcription errors.}
29
+
{- {b Hyphenation support}: Hyphens can be included for readability and are
30
+
automatically ignored during decoding.}
31
+
{- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
32
+
transcription errors. The checksum is encoded as two additional characters.}
33
+
{- {b URL-safe}: All characters in the encoding are safe for use in URLs
37
+
{2 The Encoding Alphabet}
39
+
The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
41
+
Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V
44
+
{2 Comparison with Other Encodings}
47
+
{- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
48
+
character set and case-insensitivity, though it produces slightly longer
49
+
strings (base32 uses 5 bits per character vs base64's 6 bits).}
50
+
{- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
51
+
only 4 bits per character) and includes more letters for better distribution.}
52
+
{- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
53
+
optimized for human readability with its character mappings and exclusions.}
59
+
(* Basic encoding *)
60
+
let id = encode 123456789L;;
61
+
(* Result: "3nqk8n" *)
63
+
(* Encoding with hyphenation for readability *)
64
+
let id = encode ~split_every:4 123456789L;;
65
+
(* Result: "3nqk-8n" *)
67
+
(* Encoding with checksum for error detection *)
68
+
let id = encode ~checksum:true 123456789L;;
69
+
(* Result: "3rv5k187" (last two digits are checksum) *)
71
+
(* Generate a random 8-character identifier *)
72
+
Random.self_init ();;
73
+
let random_id = generate ~length:8 ();;
74
+
(* Result: something like "n4g9k2c7" *)
76
+
(* Generate with checksum and hyphenation *)
77
+
let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
78
+
(* Result: something like "a3k2x-9m4c82" *)
80
+
(* Decoding is case-insensitive and ignores hyphens *)
81
+
let n = decode "3NQK-8N";;
82
+
(* Result: 123456789L *)
84
+
(* Decode with checksum validation *)
85
+
let n = decode ~checksum:true "3rv5k187";;
86
+
(* Result: 123456789L (or raises Decode_error if checksum invalid) *)
89
+
{1 API Documentation} *)
91
+
(** {1 ID Generation}
93
+
Generate random identifiers in Crockford base32 format. This is useful for
94
+
creating unique, human-readable IDs for databases, URLs, or user-facing
95
+
reference numbers. *)
101
+
?rng:(float -> float) ->
103
+
(** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string.
105
+
This function creates a random identifier by generating a random integer and
106
+
encoding it using the Crockford base32 alphabet. The generated IDs are suitable
107
+
for use as database keys, URL-safe identifiers, or user-visible reference numbers.
109
+
When using the default [Random.float] generator, you must initialize the
110
+
random number generator with [Random.self_init] before calling this function.
112
+
@param length The target length of the generated string. When [checksum:false],
113
+
this is the exact output length. When [checksum:true], this is the
114
+
total length including the 2-character checksum, so the random
115
+
portion will be [length - 2] characters.
116
+
@param split_every Insert hyphens every N characters for improved readability.
117
+
For example, [~split_every:4] might produce ["3a7k-m9n2"].
118
+
Default: no splitting.
119
+
@param checksum Append a 2-character ISO 7064 checksum for error detection.
120
+
Useful when IDs will be manually transcribed. When [true],
121
+
the total output length (including checksum) will be [length].
123
+
@param rng Custom random number generator function that takes a float bound and
124
+
returns a random float between [0.] and that bound (inclusive). This allows for
125
+
deterministic testing or custom entropy sources. Defaults to using [Random.float].
126
+
@raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3]
127
+
as at least 1 character is needed for the ID and 2 for the checksum. *)
···
val encoding_chars : string
50
-
(** The Crockford base32 encoding alphabet (excludes i, l, o, u) *)
172
+
(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]
174
+
This 32-character alphabet excludes I, L, O, and U to prevent confusion with
175
+
visually similar characters (1, 1, 0, and V). The alphabet is case-insensitive
176
+
for decoding but returned in lowercase by encoding functions. *)
52
-
(** {1 Encoding and Decoding} *)
178
+
(** {1 Encoding and Decoding}
180
+
The core encoding and decoding functions convert between 64-bit integers and
181
+
their Crockford base32 string representations. *)
···
(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.
60
-
@param split_every Split the output with '-' every n characters (default: no splitting)
61
-
@param min_length Pad with zeros to this minimum length (default: no padding)
62
-
@param checksum Append ISO 7064 checksum as 2 digits (default: false) *)
190
+
The function converts a 64-bit integer into a base32 representation using the
191
+
Crockford alphabet. The encoding process divides the number by 32 repeatedly,
192
+
using the remainder as an index into the alphabet.
194
+
@param split_every Insert hyphens every N characters for readability. For example,
195
+
[split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
196
+
are ignored during decoding. Default: no splitting.
197
+
@param min_length Pad the output with leading zeros to reach this minimum length.
198
+
When [checksum:true], the minimum length includes the 2-character
199
+
checksum. Default: no padding.
200
+
@param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
201
+
errors. The checksum is computed on the original number and encoded
202
+
as two additional base32 characters. Default: [false].
206
+
encode 0L;; (* "0" *)
207
+
encode 1234L;; (* "16j" *)
208
+
encode ~min_length:6 1234L;; (* "00016j" *)
209
+
encode ~split_every:3 123456L;; (* "3rj-0" *)
210
+
encode ~checksum:true 1234L;; (* "16j48" *)
211
+
encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0001-6j48" *)
val decode : ?checksum:bool -> string -> int64
(** [decode ?checksum str] decodes a Crockford base32 string to int64.
66
-
@param checksum Expect and validate ISO 7064 checksum (default: false)
67
-
@raise Decode_error if decoding fails (invalid characters, invalid checksum format, or checksum mismatch) *)
69
-
(** {1 ID Generation} *)
217
+
The function is designed to be forgiving of human input:
218
+
- Case-insensitive: accepts both uppercase and lowercase letters
219
+
- Strips hyphens automatically
220
+
- Maps confusable characters: I/i and L/l → 1, O/o → 0
222
+
@param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
223
+
If [true], the function verifies that the checksum matches the
224
+
decoded value. Default: [false].
226
+
@raise Decode_error with one of the following variants:
227
+
- [Invalid_character] if an unrecognized character is encountered
228
+
- [Invalid_checksum] if [checksum:true] but the string is too short or checksum has invalid format
229
+
- [Checksum_mismatch] if the checksum doesn't match the decoded value
76
-
(** [generate ~length ?split_every ?checksum ()] generates a random Crockford base32 string.
77
-
@param length The length of the generated string (excluding checksum)
78
-
@param split_every Split the output with '-' every n characters (default: no splitting)
79
-
@param checksum Append ISO 7064 checksum as 2 digits (default: false)
80
-
@raise Decode_error if checksum is true and length < 3
233
+
decode "16j";; (* 1234L *)
234
+
decode "16J";; (* 1234L - case insensitive *)
235
+
decode "1-6-j";; (* 1234L - hyphens ignored *)
236
+
decode "I6j";; (* 1234L - 'I' mapped to '1' *)
237
+
decode ~checksum:true "16j48";; (* 1234L - with checksum validation *)
82
-
Note: Caller must initialize Random module with {!Random.self_init} before use *)
240
+
(** {1 Utility Functions}
84
-
(** {1 Utility Functions} *)
242
+
Low-level functions for working with Crockford base32 strings and checksums. *)
val normalize : string -> string
87
-
(** [normalize str] normalizes a string for decoding by converting to lowercase,
88
-
removing dashes, and mapping confusable characters (i→1, l→1, o→0) *)
245
+
(** [normalize str] normalizes a string for decoding.
247
+
This function prepares a potentially messy human input string for decoding by:
248
+
- Converting all characters to lowercase
249
+
- Removing all hyphens ([-])
250
+
- Mapping confusable characters: [I] and [L] → [1], [O] → [0]
252
+
This is automatically called by {!decode}, but is exposed for cases where
253
+
you want to normalize strings before storage or comparison.
257
+
normalize "ABC-123";; (* "abc123" *)
258
+
normalize "IlO";; (* "110" - confusables mapped *)
259
+
normalize "A-B-C";; (* "abc" - hyphens removed *)
260
+
normalize "HELLO";; (* "he110" - 'L's and 'O' mapped *)
val validate : int64 -> checksum:int64 -> bool
91
-
(** [validate n ~checksum] validates that a checksum matches the number *)
264
+
(** [validate n ~checksum] validates that a checksum matches the expected value for a number.
266
+
This function computes the ISO 7064 (mod 97-10) checksum for the given number
267
+
and compares it with the provided checksum value.
269
+
@param n The integer value to validate
270
+
@param checksum The expected checksum value (0-96)
271
+
@return [true] if the checksum is valid, [false] otherwise
275
+
let cs = generate_checksum 1234L in
276
+
validate 1234L ~checksum:cs;; (* true *)
277
+
validate 1234L ~checksum:99L;; (* false *)
val generate_checksum : int64 -> int64
94
-
(** [generate_checksum n] generates an ISO 7064 (mod 97-10) checksum for a number *)
281
+
(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number.
283
+
The ISO 7064 algorithm provides a checksum that can detect:
284
+
- All single-digit errors
285
+
- Most adjacent transposition errors
286
+
- Most twin errors (where two identical digits are replaced by two other identical digits)
288
+
The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits).
292
+
generate_checksum 0L;; (* 1L *)
293
+
generate_checksum 1234L;; (* 48L *)
294
+
generate_checksum 123456L;; (* 87L *)