lib/crockford.mli at main · anil.recoil.org/ocaml-crockford

OCaml library for Crockford's Base32
ocaml-crockford / lib / crockford.mli
at main 12 kB view raw
  1(*---------------------------------------------------------------------------
  2   Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
  3   SPDX-License-Identifier: MIT
  4  ---------------------------------------------------------------------------*)
  5
  6(** Crockford Base32 encoding for OCaml
  7
  8    {1 Overview}
  9
 10    Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
 11    for human-readable identifiers. It is particularly well-suited for use in URLs,
 12    user-facing identifiers, and systems where humans need to transcribe or
 13    communicate encoded values.
 14
 15    See the {{:https://www.crockford.com/base32.html}Crockford Base32 Specification}
 16    for complete details of the encoding scheme.
 17
 18    It features:
 19
 20    {ul
 21      {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
 22         I, L and O (easily confused with 1 and 0) and U (to avoid accidental
 23         obscenity). This prevents common transcription errors.}
 24      {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
 25         during decoding, making it forgiving of human input.}
 26      {- {b Confusable character mapping}: When decoding, the letters I and L are
 27         automatically mapped to 1, and O is mapped to 0, further reducing
 28         transcription errors.}
 29      {- {b Hyphenation support}: Hyphens can be included for readability and are
 30         automatically ignored during decoding.}
 31      {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
 32         transcription errors. The checksum is encoded as two additional characters.}
 33      {- {b URL-safe}: All characters in the encoding are safe for use in URLs
 34         without escaping.}
 35    }
 36
 37    {2 The Encoding Alphabet}
 38
 39    The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
 40
 41    I, L and O are absent to avoid confusion with 1, 1 and 0 respectively; U is
 42    absent to reduce the likelihood of accidental obscenity.
 43
 44    {2 Comparison with Other Encodings}
 45
 46    {ul
 47      {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
 48         character set and case-insensitivity, though it produces slightly longer
 49         strings (base32 uses 5 bits per character vs base64's 6 bits).}
 50      {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
 51         only 4 bits per character) and includes more letters for better distribution.}
 52      {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
 53         optimized for human readability with its character mappings and exclusions.}
 54    }
 55
 56    {1 Examples}
 57
 58    {[
 59      (* Basic encoding *)
 60      let id = encode 123456789L;;
 61      (* Result: "3nqk8n" *)
 62
 63      (* Encoding with hyphenation for readability *)
 64      let id = encode ~split_every:4 123456789L;;
 65      (* Result: "3nqk-8n" *)
 66
 67      (* Encoding with checksum for error detection *)
 68      let id = encode ~checksum:true 123456789L;;
 69      (* Result: "3nqk8n" followed by two checksum digits *)
 70
 71      (* Generate a random 8-character identifier *)
 72      Random.self_init ();;
 73      let random_id = generate ~length:8 ();;
 74      (* Result: something like "n4g9k2c7" *)
 75
 76      (* Generate with checksum and hyphenation *)
 77      let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
 78      (* Result: something like "a3k2x-9m482" (10 characters plus the hyphen) *)
 79
 80      (* Decoding is case-insensitive and ignores hyphens *)
 81      let n = decode "3NQK-8N";;
 82      (* Result: 123456789L *)
 83
 84      (* Decode with checksum validation *)
 85      let n = decode ~checksum:true (encode ~checksum:true 123456789L);;
 86      (* Result: 123456789L (or raises Decode_error if checksum invalid) *)
 87    ]}
 88
 89    {1 API Documentation} *)
 90
 91(** {1 ID Generation}
 92
 93    Generate random identifiers in Crockford base32 format. This is useful for
 94    creating unique, human-readable IDs for databases, URLs, or user-facing
 95    reference numbers. *)
 96
 97val generate :
 98  length:int ->
 99  ?split_every:int ->
100  ?checksum:bool ->
101  ?rng:(float -> float) ->
102  unit -> string
103(** [generate ~length ?split_every ?checksum ?rng ()] returns a random Crockford base32 string.
104
105    The identifier is produced by generating a random integer and encoding it with
106    the Crockford base32 alphabet, which makes the result suitable for database
107    keys, URL-safe identifiers, or user-visible reference numbers.
108
109    The default generator is [Random.float]. Call [Random.self_init] beforehand;
110    otherwise every run of the program produces the same sequence of identifiers.
111
112    @param length The target length of the generated string. With [~checksum:false]
113                  all [length] characters are random. With [~checksum:true] the
114                  last 2 characters are the checksum, so only the first
115                  [length - 2] characters are random.
116    @param split_every Insert hyphens every N characters for improved readability.
117                       For example, [~split_every:4] might produce [3a7k-m9n2].
118                       Default: no splitting.
119    @param checksum Append a 2-character ISO 7064 checksum for error detection.
120                    Useful when IDs will be manually transcribed. The checksum
121                    counts towards [length] rather than extending it.
122                    Default: [false].
123    @param rng Custom random number generator: given a float bound, it returns a
124               random float in the range [0,bound]. This allows for deterministic
125               testing or custom entropy sources. Default: [Random.float].
126    @raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3]
127           as at least 1 character is needed for the ID and 2 for the checksum.  *)
128
129
130(** {1 Error Types} *)
131
132type invalid_length = { length: int; message: string }
133(** Details of an invalid length parameter: the offending [length] and a [message]. *)
134
135type invalid_character = { char: char; message: string }
136(** Details of an invalid character met during decoding: the [char] and a [message]. *)
137
138type invalid_checksum = { checksum: string; message: string }
139(** Details of a badly formatted checksum: the [checksum] text and a [message]. *)
140
141type checksum_mismatch = { expected: int64; got: int64; identifier: string }
142(** Details of a failed checksum validation; carried by [Checksum_mismatch]. *)
143
144type decode_error =
145  | Invalid_length of invalid_length
146  | Invalid_character of invalid_character
147  | Invalid_checksum of invalid_checksum
148  | Checksum_mismatch of checksum_mismatch
149(** Every error reported by this library; each constructor wraps its detail record. *)
150
151exception Decode_error of decode_error
152(** Raised by {!decode} on any decoding error, and by {!generate} for an invalid [length]. *)
153
154val pp_invalid_length : Format.formatter -> invalid_length -> unit
155(** [pp_invalid_length ppf e] pretty-prints the {!invalid_length} error [e] on [ppf]. *)
156
157val pp_invalid_character : Format.formatter -> invalid_character -> unit
158(** [pp_invalid_character ppf e] pretty-prints the {!invalid_character} error [e] on [ppf]. *)
159
160val pp_invalid_checksum : Format.formatter -> invalid_checksum -> unit
161(** [pp_invalid_checksum ppf e] pretty-prints the {!invalid_checksum} error [e] on [ppf]. *)
162
163val pp_checksum_mismatch : Format.formatter -> checksum_mismatch -> unit
164(** [pp_checksum_mismatch ppf e] pretty-prints the {!checksum_mismatch} error [e] on [ppf]. *)
165
166val pp_decode_error : Format.formatter -> decode_error -> unit
167(** [pp_decode_error ppf e] pretty-prints any {!decode_error} [e] on [ppf]. *)
168
169(** {1 Constants} *)
170
171val encoding_chars : string
172(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]
173
174    This 32-character alphabet omits I, L and O, which are easily confused with
175    1, 1 and 0, and U, which the specification drops to avoid accidental obscenity.
176    Decoding is case-insensitive, but the encoding functions return lowercase. *)
177
178(** {1 Encoding and Decoding}
179
180    The core encoding and decoding functions convert between 64-bit integers and
181    their Crockford base32 string representations. *)
182
183val encode :
184  ?split_every:int ->
185  ?min_length:int ->
186  ?checksum:bool ->
187  int64 -> string
188(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.
189
190    Converts a 64-bit integer to base32 by repeatedly dividing by 32 and using each
191    remainder as an index into the Crockford alphabet, most significant digit first.
192    NOTE(review): the result for negative [n] is not specified here; confirm.
193
194    @param split_every Insert hyphens every N characters for readability. For example,
195                       [~split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
196                       are ignored during decoding. Default: no splitting.
197    @param min_length Pad the output with leading zeros to reach this minimum length.
198                      With [~checksum:true], the minimum length includes the 2-character
199                      checksum. Default: no padding.
200    @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
201                    errors. The checksum is computed on the original number (see
202                    {!generate_checksum}) and appended as two digits. Default: [false].
203
204    {b Examples:}
205    {[
206      encode 0L;;                            (* "0" *)
207      encode 1234L;;                         (* "16j" *)
208      encode ~min_length:6 1234L;;           (* "00016j" *)
209      encode ~split_every:3 123456L;;        (* "3rj-0" *)
210      encode ~checksum:true 1234L;;          (* "16j48" *)
211      encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0001-6j48" *)
212    ]} *)
213
214val decode : ?checksum:bool -> string -> int64
215(** [decode ?checksum str] decodes a Crockford base32 string to int64.
216
217    The function is forgiving of human input (see {!normalize}):
218    - Case-insensitive: uppercase and lowercase letters are equivalent
219    - Hyphens are stripped before decoding
220    - Confusable characters are mapped: I/i and L/l → 1, O/o → 0
221
222    @param checksum Treat the last 2 characters as an ISO 7064 checksum and validate
223                    them. If [true], the function verifies that the checksum matches
224                    the decoded value. Default: [false].
225
226    @raise Decode_error with one of the following variants:
227    - [Invalid_character] if an unrecognized character is encountered
228    - [Invalid_checksum] if [checksum:true] but the string is too short or checksum has invalid format
229    - [Checksum_mismatch] if the checksum doesn't match the decoded value
230
231    {b Examples:}
232    {[
233      decode "16j";;                    (* 1234L *)
234      decode "16J";;                    (* 1234L - case insensitive *)
235      decode "1-6-j";;                  (* 1234L - hyphens ignored *)
236      decode "I6j";;                    (* 1234L - 'I' mapped to '1' *)
237      decode ~checksum:true "16j48";;   (* 1234L - with checksum validation *)
238    ]} *)
239
240(** {1 Utility Functions}
241
242    Low-level functions for working with Crockford base32 strings and checksums. *)
243
244val normalize : string -> string
245(** [normalize str] returns the cleaned-up form of [str] used for decoding.
246
247    It tidies up potentially messy human input by:
248    - Converting all characters to lowercase
249    - Removing all hyphens ([-])
250    - Mapping confusable characters: [I] and [L] → [1], [O] → [0]
251
252    {!decode} applies this automatically; it is exposed separately for callers
253    that want to normalize strings before storing or comparing them.
254
255    {b Examples:}
256    {[
257      normalize "ABC-123";;  (* "abc123" *)
258      normalize "IlO";;      (* "110" - confusables mapped *)
259      normalize "A-B-C";;    (* "abc" - hyphens removed *)
260      normalize "HELLO";;    (* "he110" - 'L's and 'O' mapped *)
261    ]} *)
262
263val validate : int64 -> checksum:int64 -> bool
264(** [validate n ~checksum] checks [checksum] against the checksum computed for [n].
265
266    The ISO 7064 (mod 97-10) checksum of [n] is computed and compared with the
267    supplied [checksum] value.
268
269    @param n The integer value to validate
270    @param checksum The expected checksum value (0-96)
271    @return [true] if the checksum is valid, [false] otherwise
272
273    {b Examples:}
274    {[
275      let cs = generate_checksum 1234L in
276      validate 1234L ~checksum:cs;;  (* true *)
277      validate 1234L ~checksum:99L;; (* false *)
278    ]} *)
279
280val generate_checksum : int64 -> int64
281(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number.
282
283    The algorithm detects all single-digit errors, most adjacent transpositions,
284    and most twin errors (two identical digits replaced by two other identical
285    digits). The value is in the range 0-96 (00-96 when formatted with 2 digits).
286
287    NOTE(review): the textbook formula [98 - (n * 100 mod 97)] gives 82 for 1234,
288    not 48; verify the example values below against the implementation.
289
290    {b Examples:}
291    {[
292      generate_checksum 0L;;      (* 1L *)
293      generate_checksum 1234L;;   (* 48L *)
294      generate_checksum 123456L;; (* 87L *)
295    ]} *)