lib/crockford.mli at v1.0.0 · anil.recoil.org/ocaml-crockford

OCaml library for Crockford's Base32
ocaml-crockford / lib / crockford.mli
at v1.0.0 12 kB view raw
  1(*---------------------------------------------------------------------------
  2   Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
  3   SPDX-License-Identifier: MIT
  4  ---------------------------------------------------------------------------*)
  5
  6(** Crockford Base32 encoding for OCaml
  7
  8    {1 Overview}
  9
 10    Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
 11    for human-readable identifiers. It is particularly well-suited for use in URLs,
 12    user-facing identifiers, and systems where humans need to transcribe or
 13    communicate encoded values. It features:
 14
 15    {ul
 16      {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
 17         letters that are easily confused: I, L, O, and U. This prevents common
 18         transcription errors.}
 19      {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
 20         during decoding, making it forgiving of human input.}
 21      {- {b Confusable character mapping}: When decoding, the letters I and L are
 22         automatically mapped to 1, and O is mapped to 0, further reducing
 23         transcription errors.}
 24      {- {b Hyphenation support}: Hyphens can be included for readability and are
 25         automatically ignored during decoding.}
 26      {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
 27         transcription errors. The checksum is encoded as two additional characters.}
 28      {- {b URL-safe}: All characters in the encoding are safe for use in URLs
 29         without escaping.}
 30    }
 31
 32    {2 The Encoding Alphabet}
 33
 34    The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
 35
 36    Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V
 37    respectively. The encoding functions in this library emit the lowercase form.
 38
 39    {2 Comparison with Other Encodings}
 40
 41    {ul
 42      {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
 43         character set and case-insensitivity, though it produces slightly longer
 44         strings (base32 uses 5 bits per character vs base64's 6 bits).}
 45      {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
 46         only 4 bits per character) and includes more letters for better distribution.}
 47      {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
 48         optimized for human readability with its character mappings and exclusions.}
 49    }
 50
 51    {1 Examples}
 52
 53    {[
 54      (* Basic encoding *)
 55      let id = encode 123456789L;;
 56      (* Result: "3nqk8n" *)
 57
 58      (* Encoding with hyphenation for readability *)
 59      let id = encode ~split_every:4 123456789L;;
 60      (* Result: "3nqk-8n" *)
 61
 62      (* Encoding with checksum for error detection *)
 63      let id = encode ~checksum:true 123456789L;;
 64      (* Result: "3nqk8n" followed by the two checksum characters *)
 65
 66      (* Generate a random 8-character identifier *)
 67      Random.self_init ();;
 68      let random_id = generate ~length:8 ();;
 69      (* Result: something like "n4g9k2c7" *)
 70
 71      (* Generate with checksum and hyphenation *)
 72      let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
 73      (* Result: something like "a3k2x-9m482" *)
 74
 75      (* Decoding is case-insensitive and ignores hyphens *)
 76      let n = decode "3NQK-8N";;
 77      (* Result: 123456789L *)
 78
 79      (* Decode with checksum validation *)
 80      let n = decode ~checksum:true (encode ~checksum:true 123456789L);;
 81      (* Result: 123456789L (or raises Decode_error if checksum invalid) *)
 82    ]}
 83
 84    {1 API Documentation} *)
 85
 86(** {1 ID Generation}
 87
 88    Generate random identifiers in Crockford base32 format. This is useful for
 89    creating unique, human-readable IDs for databases, URLs, or user-facing
 90    reference numbers. *)
 91
 92val generate :
 93  length:int ->
 94  ?split_every:int ->
 95  ?checksum:bool ->
 96  ?rng:(float -> float) ->
 97  unit -> string
 98(** [generate ~length ?split_every ?checksum ?rng ()] is a fresh random Crockford base32 identifier.
 99
100    The identifier is produced by drawing a random integer and encoding it with the
101    Crockford base32 alphabet. The generated IDs are suitable for use as database
102    keys, URL-safe identifiers, or user-visible reference numbers.
103
104    When relying on the default {!Random.float} generator, call {!Random.self_init} first:
105    otherwise the generator starts from a fixed seed and every run yields the same IDs.
106
107    @param length The target length of the generated string, not counting any hyphens
108                  added by [split_every]. Without a checksum this is the number of random
109                  characters; with [~checksum:true] it includes the 2-character checksum,
110                  so the random portion is [length - 2] characters.
111    @param split_every Insert hyphens every N characters for improved readability.
112                       For example, [~split_every:4] might produce ["3a7k-m9n2"].
113                       Default: no splitting.
114    @param checksum Append a 2-character ISO 7064 checksum for error detection.
115                    Useful when IDs will be manually transcribed. When [true],
116                    the total output length (including checksum) will be [length].
117                    Default: [false].
118    @param rng Custom random number generator function that takes a float bound and
119               returns a random float between 0 and that bound (inclusive). This allows for
120               deterministic testing or custom entropy sources. Defaults to using {!Random.float}.
121    @raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3],
122           as at least 1 character is needed for the ID and 2 for the checksum. *)
123
124
125(** {1 Error Types} *)
126
127type invalid_length = { length: int; message: string }
128(** Payload of [Invalid_length] in {!decode_error}: an invalid length parameter ([length]) and a [message]. *)
129
130type invalid_character = { char: char; message: string }
131(** Payload of [Invalid_character] in {!decode_error}: an invalid character ([char]) met during decoding and a [message]. *)
132
133type invalid_checksum = { checksum: string; message: string }
134(** Payload of [Invalid_checksum] in {!decode_error}: a checksum with an invalid format ([checksum]) and a [message]. *)
135
136type checksum_mismatch = { expected: int64; got: int64; identifier: string }
137(** Payload of [Checksum_mismatch] in {!decode_error}: a failed checksum validation ([expected] vs. [got]) for [identifier]. *)
138
139type decode_error =
140  | Invalid_length of invalid_length
141  | Invalid_character of invalid_character
142  | Invalid_checksum of invalid_checksum
143  | Checksum_mismatch of checksum_mismatch
144(** Union of all possible decode errors; this is the argument carried by {!Decode_error}. *)
145
146exception Decode_error of decode_error
147(** Raised by {!decode} for all decoding errors, and by {!generate} when given an invalid [length]. *)
148
149val pp_invalid_length : Format.formatter -> invalid_length -> unit
150(** [pp_invalid_length ppf e] pretty-prints the {!invalid_length} error [e] on the formatter [ppf]. *)
151
152val pp_invalid_character : Format.formatter -> invalid_character -> unit
153(** [pp_invalid_character ppf e] pretty-prints the {!invalid_character} error [e] on the formatter [ppf]. *)
154
155val pp_invalid_checksum : Format.formatter -> invalid_checksum -> unit
156(** [pp_invalid_checksum ppf e] pretty-prints the {!invalid_checksum} error [e] on the formatter [ppf]. *)
157
158val pp_checksum_mismatch : Format.formatter -> checksum_mismatch -> unit
159(** [pp_checksum_mismatch ppf e] pretty-prints the {!checksum_mismatch} error [e] on the formatter [ppf]. *)
160
161val pp_decode_error : Format.formatter -> decode_error -> unit
162(** [pp_decode_error ppf e] pretty-prints the {!decode_error} [e], whichever variant it is, on the formatter [ppf]. *)
163
164(** {1 Constants} *)
165
166val encoding_chars : string
167(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"].
168
169    These 32 symbols leave out the letters i, l, o and u so that they cannot be
170    mistaken for the visually similar 1, 1, 0 and v. Decoding accepts either case,
171    while the encoding functions return the lowercase symbols listed here. *)
172
173(** {1 Encoding and Decoding}
174
175    The core encoding and decoding functions convert between 64-bit integers and
176    their Crockford base32 string representations. *)
177
178val encode :
179  ?split_every:int ->
180  ?min_length:int ->
181  ?checksum:bool ->
182  int64 -> string
183(** [encode ?split_every ?min_length ?checksum n] is the Crockford base32 representation of [n].
184
185    The function converts a 64-bit integer into a base32 representation using the
186    Crockford alphabet. The encoding process divides the number by 32 repeatedly,
187    using each remainder as an index into the alphabet (see {!encoding_chars}).
188
189    @param split_every Insert hyphens every N characters for readability. For example,
190                       [~split_every:4] turns ["abcd1234"] into ["abcd-1234"]. Hyphens
191                       are ignored during decoding. Default: no splitting.
192    @param min_length Pad the output with leading zeros to reach this minimum length.
193                      With [~checksum:true], the minimum length includes the 2-character
194                      checksum. Default: no padding.
195    @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
196                    errors. The checksum is computed on the original number and encoded
197                    as two additional base32 characters. Default: [false].
198
199    {b Examples:}
200    {[
201      encode 0L;;                            (* "0" *)
202      encode 1234L;;                         (* "16j" *)
203      encode ~min_length:6 1234L;;           (* "00016j" *)
204      encode ~split_every:3 123456L;;        (* "3rj-0" *)
205      encode ~checksum:true 1234L;;          (* "16j48" *)
206      encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0001-6j48" *)
207    ]} *)
208
209val decode : ?checksum:bool -> string -> int64
210(** [decode ?checksum str] is the [int64] denoted by the Crockford base32 string [str].
211
212    The input is first cleaned up as by {!normalize}, so decoding is forgiving of human input:
213    - Case-insensitive: accepts both uppercase and lowercase letters
214    - Strips hyphens automatically
215    - Maps confusable characters: I/i and L/l → 1, O/o → 0
216
217    @param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
218                    If [true], the function verifies that the checksum matches the
219                    decoded value. Default: [false].
220
221    @raise Decode_error with one of the following variants:
222    - [Invalid_character] if an unrecognized character is encountered
223    - [Invalid_checksum] if [~checksum:true] but the string is too short or the checksum has an invalid format
224    - [Checksum_mismatch] if the checksum doesn't match the decoded value
225
226    {b Examples:}
227    {[
228      decode "16j";;                    (* 1234L *)
229      decode "16J";;                    (* 1234L - case insensitive *)
230      decode "1-6-j";;                  (* 1234L - hyphens ignored *)
231      decode "I6j";;                    (* 1234L - 'I' mapped to '1' *)
232      decode ~checksum:true "16j48";;   (* 1234L - with checksum validation *)
233    ]} *)
234
235(** {1 Utility Functions}
236
237    Low-level functions for working with Crockford base32 strings and checksums. *)
238
239val normalize : string -> string
240(** [normalize str] is [str] rewritten into the canonical form expected by the decoder.
241
242    This function prepares a potentially messy human input string for decoding by:
243    - Converting all characters to lowercase
244    - Removing all hyphens ([-])
245    - Mapping confusable characters, in either case: [I] and [L] → [1], [O] → [0]
246
247    {!decode} applies this automatically; it is exposed for callers that want to
248    normalize strings themselves, e.g. before storing or comparing them.
249
250    {b Examples:}
251    {[
252      normalize "ABC-123";;  (* "abc123" *)
253      normalize "IlO";;      (* "110" - confusables mapped *)
254      normalize "A-B-C";;    (* "abc" - hyphens removed *)
255      normalize "HELLO";;    (* "he110" - 'L's and 'O' mapped *)
256    ]} *)
257
258val validate : int64 -> checksum:int64 -> bool
259(** [validate n ~checksum] is [true] if [checksum] is the correct checksum for [n], [false] otherwise.
260
261    This function computes the ISO 7064 (mod 97-10) checksum for the given number
262    (see {!generate_checksum}) and compares it with the provided checksum value.
263
264    @param n The integer value to validate
265    @param checksum The expected checksum value (0-96)
266    @return [true] if the checksum is valid, [false] otherwise
267
268    {b Examples:}
269    {[
270      let cs = generate_checksum 1234L in
271      validate 1234L ~checksum:cs;;  (* true *)
272      validate 1234L ~checksum:99L;; (* false *)
273    ]} *)
274
275val generate_checksum : int64 -> int64
276(** [generate_checksum n] is the ISO 7064 (mod 97-10) checksum of [n].
277
278    The ISO 7064 algorithm provides a checksum that can detect:
279    - All single-digit errors
280    - Most adjacent transposition errors
281    - Most twin errors (where two identical digits are replaced by two other identical digits)
282
283    The result is in the range 0-96 (or 00-96 when formatted with 2 digits); see also {!validate}.
284
285    {b Examples:}
286    {[
287      generate_checksum 0L;;      (* 1L *)
288      generate_checksum 1234L;;   (* 48L; NOTE(review): textbook mod 97-10 gives 82L - confirm *)
289      generate_checksum 123456L;; (* 87L; NOTE(review): textbook mod 97-10 gives 76L - confirm *)
290    ]} *)