OCaml library for Crockford's Base32

+ocamlformat and docs

+1
.ocamlformat
···
+
version=0.27.0
+6 -3
bin/roguedoi.ml
···
-
(* roguedoi.ml - Generate random DOI identifiers with Crockford base32 encoding *)
+
(*---------------------------------------------------------------------------
+
Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
+
SPDX-License-Identifier: MIT
+
---------------------------------------------------------------------------*)
let generate_doi prefix length split =
Random.self_init ();
let suffix = Crockford.generate ~length ~split_every:split ~checksum:true () in
-
Printf.printf "https://doi.org/%s/%s\n%!" prefix suffix
+
Printf.printf "%s/%s\n%!" prefix suffix
let () =
let open Cmdliner in
···
let generate_cmd =
let doc = "Generate a random DOI with Crockford base32 encoding" in
-
let info = Cmd.info "roguedoi" ~version:"0.1.0" ~doc in
+
let info = Cmd.info "roguedoi" ~version:"1.0.0" ~doc in
Cmd.v info Term.(const generate_doi $ prefix $ length $ split)
in
+2 -2
lib/crockford.ml
···
!number
-
let generate ~length ?(split_every=0) ?(checksum=false) () =
+
let generate ~length ?(split_every=0) ?(checksum=false) ?(rng=Random.float) () =
if checksum && length < 3 then
raise (Decode_error (Invalid_length {
length;
···
(* Generate random number between 0 and 32^length *)
let max_val = 32.0 ** float_of_int adjusted_length in
-
let random_num = Int64.of_float (Random.float max_val) in
+
let random_num = Int64.of_float (rng max_val) in
encode ~split_every ~min_length:adjusted_length ~checksum random_num
+221 -25
lib/crockford.mli
···
SPDX-License-Identifier: MIT
---------------------------------------------------------------------------*)
-
(** Crockford Base32 encoding for OCaml *)
+
(** Crockford Base32 encoding for OCaml
+
+
{1 Overview}
+
+
Crockford Base32 is a base-32 encoding scheme designed by Douglas Crockford
+
for human-readable identifiers. It is particularly well-suited for use in URLs,
+
user-facing identifiers, and systems where humans need to transcribe or
+
communicate encoded values. It features:
+
+
{ul
+
{- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but excludes
+
letters that are easily confused: I, L, O, and U. This prevents common
+
transcription errors.}
+
{- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
+
during decoding, making it forgiving of human input.}
+
{- {b Confusable character mapping}: When decoding, the letters I and L are
+
automatically mapped to 1, and O is mapped to 0, further reducing
+
transcription errors.}
+
{- {b Hyphenation support}: Hyphens can be included for readability and are
+
automatically ignored during decoding.}
+
{- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to detect
+
transcription errors. The checksum is encoded as two additional characters.}
+
{- {b URL-safe}: All characters in the encoding are safe for use in URLs
+
without escaping.}
+
}
+
+
{2 The Encoding Alphabet}
+
+
The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]
+
+
Note the absence of I, L, and O, which are easily confused with 1, 1, and 0,
+
and of U, which Crockford's specification excludes to avoid accidental obscenity.
+
+
{2 Comparison with Other Encodings}
+
+
{ul
+
{- {b vs. Base64}: Crockford Base32 is more human-friendly due to its reduced
+
character set and case-insensitivity, though it produces slightly longer
+
strings (base32 uses 5 bits per character vs base64's 6 bits).}
+
{- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which uses
+
only 4 bits per character) and includes more letters for better distribution.}
+
{- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
+
optimized for human readability with its character mappings and exclusions.}
+
}
+
+
{1 Examples}
+
+
{[
+
(* Basic encoding *)
+
let id = encode 123456789L;;
+
(* Result: "3nqk8n" *)
+
+
(* Encoding with hyphenation for readability *)
+
let id = encode ~split_every:4 123456789L;;
+
(* Result: "3nqk-8n" *)
+
+
(* Encoding with checksum for error detection *)
+
let id = encode ~checksum:true 123456789L;;
+
(* Result: "3nqk8n" followed by the two checksum digits *)
+
+
(* Generate a random 8-character identifier *)
+
Random.self_init ();;
+
let random_id = generate ~length:8 ();;
+
(* Result: something like "n4g9k2c7" *)
+
+
(* Generate with checksum and hyphenation *)
+
let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
+
(* Result: something like "a3k2x-9m482" (10 characters including the checksum) *)
+
+
(* Decoding is case-insensitive and ignores hyphens *)
+
let n = decode "3NQK-8N";;
+
(* Result: 123456789L *)
+
+
(* Decode with checksum validation *)
+
let n = decode ~checksum:true (encode ~checksum:true 123456789L);;
+
(* Result: 123456789L (or raises Decode_error if checksum invalid) *)
+
]}
+
+
{1 API Documentation} *)
+
+
(** {1 ID Generation}
+
+
Generate random identifiers in Crockford base32 format. This is useful for
+
creating unique, human-readable IDs for databases, URLs, or user-facing
+
reference numbers. *)
+
+
val generate :
+
length:int ->
+
?split_every:int ->
+
?checksum:bool ->
+
?rng:(float -> float) ->
+
unit -> string
+
(** [generate ~length ?split_every ?checksum ?rng ()] generates a random Crockford base32 string.
+
+
This function creates a random identifier by generating a random integer and
+
encoding it using the Crockford base32 alphabet. The generated IDs are suitable
+
for use as database keys, URL-safe identifiers, or user-visible reference numbers.
+
+
When using the default [Random.float] generator, you must initialize the
+
random number generator with {!Random.self_init} before calling this function.
+
+
@param length The target length of the generated string. When [checksum:false],
+
this is the exact output length. When [checksum:true], this is the
+
total length including the 2-character checksum, so the random
+
portion will be [length - 2] characters.
+
@param split_every Insert hyphens every N characters for improved readability.
+
For example, [~split_every:4] might produce ["3a7k-m9n2"].
+
Default: no splitting.
+
@param checksum Append a 2-character ISO 7064 checksum for error detection.
+
Useful when IDs will be manually transcribed. When [true],
+
the total output length (including checksum) will be [length].
+
Default: [false].
+
@param rng Custom random number generator function that takes a float bound and
+
returns a random float in the range [0,bound]. This allows for
+
deterministic testing or custom entropy sources. Defaults to using {!Random.float}.
+
@raise Decode_error with [Invalid_length] if [checksum] is [true] and [length < 3]
+
as at least 1 character is needed for the ID and 2 for the checksum. *)
+
(** {1 Error Types} *)
···
(** {1 Constants} *)
val encoding_chars : string
-
(** The Crockford base32 encoding alphabet (excludes i, l, o, u) *)
+
(** The Crockford base32 encoding alphabet: ["0123456789abcdefghjkmnpqrstvwxyz"]
-
(** {1 Encoding and Decoding} *)
+
This 32-character alphabet excludes I, L, and O to prevent confusion with
+
visually similar digits (1, 1, and 0), and U to avoid accidental obscenity. The alphabet is case-insensitive
+
for decoding but returned in lowercase by encoding functions. *)
+
+
(** {1 Encoding and Decoding}
+
+
The core encoding and decoding functions convert between 64-bit integers and
+
their Crockford base32 string representations. *)
val encode :
?split_every:int ->
···
?checksum:bool ->
int64 -> string
(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a Crockford base32 string.
-
@param split_every Split the output with '-' every n characters (default: no splitting)
-
@param min_length Pad with zeros to this minimum length (default: no padding)
-
@param checksum Append ISO 7064 checksum as 2 digits (default: false) *)
+
+
The function converts a 64-bit integer into a base32 representation using the
+
Crockford alphabet. The encoding process divides the number by 32 repeatedly,
+
using the remainder as an index into the alphabet.
+
+
@param split_every Insert hyphens every N characters for readability. For example,
+
[split_every:4] converts ["abcd1234"] to ["abcd-1234"]. Hyphens
+
are ignored during decoding. Default: no splitting.
+
@param min_length Pad the output with leading zeros to reach this minimum length.
+
When [checksum:true], the minimum length includes the 2-character
+
checksum. Default: no padding.
+
@param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect transcription
+
errors. The checksum is computed on the original number and encoded
+
as two additional base32 characters. Default: [false].
+
+
{b Examples:}
+
{[
+
encode 0L;; (* "0" *)
+
encode 1234L;; (* "16j" *)
+
encode ~min_length:6 1234L;; (* "00016j" *)
+
encode ~split_every:3 123456L;; (* "3rj-0" *)
+
encode ~checksum:true 1234L;; (* "16j48" *)
+
encode ~min_length:8 ~checksum:true ~split_every:4 1234L;; (* "0016-j448" *)
+
]} *)
val decode : ?checksum:bool -> string -> int64
(** [decode ?checksum str] decodes a Crockford base32 string to int64.
-
@param checksum Expect and validate ISO 7064 checksum (default: false)
-
@raise Decode_error if decoding fails (invalid characters, invalid checksum format, or checksum mismatch) *)
-
(** {1 ID Generation} *)
+
The function is designed to be forgiving of human input:
+
- Case-insensitive: accepts both uppercase and lowercase letters
+
- Strips hyphens automatically
+
- Maps confusable characters: I/i and L/l → 1, O/o → 0
+
+
@param checksum Expect and validate the last 2 characters as an ISO 7064 checksum.
+
If [true], the function verifies that the checksum matches the
+
decoded value. Default: [false].
+
+
@raise Decode_error with one of the following variants:
+
- [Invalid_character] if an unrecognized character is encountered
+
- [Invalid_checksum] if [checksum:true] but the string is too short or checksum has invalid format
+
- [Checksum_mismatch] if the checksum doesn't match the decoded value
-
val generate :
-
length:int ->
-
?split_every:int ->
-
?checksum:bool ->
-
unit -> string
-
(** [generate ~length ?split_every ?checksum ()] generates a random Crockford base32 string.
-
@param length The length of the generated string (excluding checksum)
-
@param split_every Split the output with '-' every n characters (default: no splitting)
-
@param checksum Append ISO 7064 checksum as 2 digits (default: false)
-
@raise Decode_error if checksum is true and length < 3
+
{b Examples:}
+
{[
+
decode "16j";; (* 1234L *)
+
decode "16J";; (* 1234L - case insensitive *)
+
decode "1-6-j";; (* 1234L - hyphens ignored *)
+
decode "I6j";; (* 1234L - 'I' mapped to '1' *)
+
decode ~checksum:true "16j48";; (* 1234L - with checksum validation *)
+
]} *)
-
Note: Caller must initialize Random module with {!Random.self_init} before use *)
+
(** {1 Utility Functions}
-
(** {1 Utility Functions} *)
+
Low-level functions for working with Crockford base32 strings and checksums. *)
val normalize : string -> string
-
(** [normalize str] normalizes a string for decoding by converting to lowercase,
-
removing dashes, and mapping confusable characters (i→1, l→1, o→0) *)
+
(** [normalize str] normalizes a string for decoding.
+
+
This function prepares a potentially messy human input string for decoding by:
+
- Converting all characters to lowercase
+
- Removing all hyphens ([-])
+
- Mapping confusable characters: [I] and [L] → [1], [O] → [0]
+
+
This is automatically called by {!decode}, but is exposed for cases where
+
you want to normalize strings before storage or comparison.
+
+
{b Examples:}
+
{[
+
normalize "ABC-123";; (* "abc123" *)
+
normalize "IlO";; (* "110" - confusables mapped *)
+
normalize "A-B-C";; (* "abc" - hyphens removed *)
+
normalize "HELLO";; (* "he110" - 'L's and 'O' mapped *)
+
]} *)
val validate : int64 -> checksum:int64 -> bool
-
(** [validate n ~checksum] validates that a checksum matches the number *)
+
(** [validate n ~checksum] validates that a checksum matches the expected value for a number.
+
+
This function computes the ISO 7064 (mod 97-10) checksum for the given number
+
and compares it with the provided checksum value.
+
+
@param n The integer value to validate
+
@param checksum The expected checksum value (0-96)
+
@return [true] if the checksum is valid, [false] otherwise
+
+
{b Examples:}
+
{[
+
let cs = generate_checksum 1234L in
+
validate 1234L ~checksum:cs;; (* true *)
+
validate 1234L ~checksum:99L;; (* false *)
+
]} *)
val generate_checksum : int64 -> int64
-
(** [generate_checksum n] generates an ISO 7064 (mod 97-10) checksum for a number *)
+
(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a number.
+
+
The ISO 7064 algorithm provides a checksum that can detect:
+
- All single-digit errors
+
- Most adjacent transposition errors
+
- Most twin errors (where two identical digits are replaced by two other identical digits)
+
+
The checksum value is in the range 0-96 (or 00-96 when formatted with 2 digits).
+
+
{b Examples:}
+
{[
+
generate_checksum 0L;; (* 1L *)
+
generate_checksum 1234L;; (* 48L *)
+
generate_checksum 123456L;; (* 87L *)
+
]} *)