(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Crockford Base32 encoding for OCaml

    {1 Overview}

    Crockford Base32 is a base-32 encoding scheme designed by Douglas
    Crockford for human-readable identifiers. It is particularly well-suited
    for use in URLs, user-facing identifiers, and systems where humans need to
    transcribe or communicate encoded values.

    It features:

    {ul
    {- {b Human-optimized alphabet}: Uses 32 characters (0-9, A-Z) but
       excludes letters that are easily confused: I, L, O, and U. This
       prevents common transcription errors.}
    {- {b Case-insensitive}: Both uppercase and lowercase letters are accepted
       during decoding, making it forgiving of human input.}
    {- {b Confusable character mapping}: When decoding, the letters I and L
       are automatically mapped to 1, and O is mapped to 0, further reducing
       transcription errors.}
    {- {b Hyphenation support}: Hyphens can be included for readability and
       are automatically ignored during decoding.}
    {- {b Optional checksums}: Supports ISO 7064 (mod 97-10) checksums to
       detect transcription errors. The checksum is encoded as two additional
       characters.}
    {- {b URL-safe}: All characters in the encoding are safe for use in URLs
       without escaping.}
    }

    {2 The Encoding Alphabet}

    The 32-character alphabet is: [0123456789ABCDEFGHJKMNPQRSTVWXYZ]

    It is shown here in upper case; the encoding functions of this module
    return lower case (see {!encoding_chars}) and decoding accepts either.

    Note the absence of I, L, O, and U to avoid confusion with 1, 1, 0, and V
    respectively.

    {2 Comparison with Other Encodings}

    {ul
    {- {b vs. Base64}: Crockford Base32 is more human-friendly due to its
       reduced character set and case-insensitivity, though it produces
       slightly longer strings (base32 uses 5 bits per character vs base64's
       6 bits).}
    {- {b vs. Hexadecimal}: Base32 produces shorter strings than hex (which
       uses only 4 bits per character) and includes more letters for better
       distribution.}
    {- {b vs. Standard Base32 (RFC 4648)}: Crockford's variant is specifically
       optimized for human readability with its character mappings and
       exclusions.}
    }

    {1 Examples}

    {[
      (* Basic encoding *)
      let id = encode 123456789L;;
      (* Result: "3nqk8n" *)

      (* Encoding with hyphenation for readability *)
      let id = encode ~split_every:4 123456789L;;
      (* Result: "3nqk-8n" *)

      (* Encoding with checksum for error detection *)
      let id = encode ~checksum:true 1234L;;
      (* Result: "16j48" (last two digits are checksum) *)

      (* Generate a random 8-character identifier *)
      Random.self_init ();;
      let random_id = generate ~length:8 ();;
      (* Result: something like "n4g9k2c7" *)

      (* Generate with checksum and hyphenation *)
      let safe_id = generate ~length:10 ~split_every:5 ~checksum:true ();;
      (* Result: something like "a3k2x-9m482" *)

      (* Decoding is case-insensitive and ignores hyphens *)
      let n = decode "3NQK-8N";;
      (* Result: 123456789L *)

      (* Decode with checksum validation *)
      let n = decode ~checksum:true "16j48";;
      (* Result: 1234L (or raises Decode_error if checksum invalid) *)
    ]}

    {1 API Documentation} *)

(** {1 ID Generation}

    Generate random identifiers in Crockford base32 format. This is useful for
    creating unique, human-readable IDs for databases, URLs, or user-facing
    reference numbers. *)

val generate :
  length:int ->
  ?split_every:int ->
  ?checksum:bool ->
  ?rng:(float -> float) ->
  unit ->
  string
(** [generate ~length ?split_every ?checksum ?rng ()] generates a random
    Crockford base32 string.

    This function creates a random identifier by generating a random integer
    and encoding it using the Crockford base32 alphabet. The generated IDs are
    suitable for use as database keys, URL-safe identifiers, or user-visible
    reference numbers.

    When using the default [Random.float] generator, you must initialize the
    random number generator with {!Random.self_init} before calling this
    function.

    @param length The target length of the generated string. When
      [checksum:false], this is the exact output length. When
      [checksum:true], this is the total length including the 2-character
      checksum, so the random portion will be [length - 2] characters.
    @param split_every Insert hyphens every N characters for improved
      readability. For example, [~split_every:4] might produce [3a7k-m9n2].
      Default: no splitting.
    @param checksum Append a 2-character ISO 7064 checksum for error
      detection. Useful when IDs will be manually transcribed. When [true],
      the total output length (including checksum) will be [length].
      Default: [false].
    @param rng Custom random number generator function that takes a float
      [bound] and returns a random float between [0.] and [bound]
      (inclusive). This allows for deterministic testing or custom entropy
      sources. Defaults to using {!Random.float}.
    @raise Decode_error with [Invalid_length] if [checksum] is [true] and
      [length < 3] as at least 1 character is needed for the ID and 2 for
      the checksum. *)

(** {1 Error Types} *)

type invalid_length = { length: int; message: string }
(** Payload of the [Invalid_length] error: an invalid length parameter. *)

type invalid_character = { char: char; message: string }
(** Payload of the [Invalid_character] error: an invalid character met during
    decoding. *)

type invalid_checksum = { checksum: string; message: string }
(** Payload of the [Invalid_checksum] error: an invalid checksum format. *)

type checksum_mismatch = { expected: int64; got: int64; identifier: string }
(** Payload of the [Checksum_mismatch] error: a checksum validation
    failure. *)

type decode_error =
  | Invalid_length of invalid_length
  | Invalid_character of invalid_character
  | Invalid_checksum of invalid_checksum
  | Checksum_mismatch of checksum_mismatch
(** Union of all possible decode errors *)

exception Decode_error of decode_error
(** Main exception raised for all decoding errors. {!generate} also documents
    raising it (with [Invalid_length]). *)

val pp_invalid_length : Format.formatter -> invalid_length -> unit
(** [pp_invalid_length ppf e] pretty-prints the {!invalid_length} error [e]
    on [ppf]. *)

val pp_invalid_character : Format.formatter -> invalid_character -> unit
(** [pp_invalid_character ppf e] pretty-prints the {!invalid_character} error
    [e] on [ppf]. *)

val pp_invalid_checksum : Format.formatter -> invalid_checksum -> unit
(** [pp_invalid_checksum ppf e] pretty-prints the {!invalid_checksum} error
    [e] on [ppf]. *)

val pp_checksum_mismatch : Format.formatter -> checksum_mismatch -> unit
(** [pp_checksum_mismatch ppf e] pretty-prints the {!checksum_mismatch} error
    [e] on [ppf]. *)

val pp_decode_error : Format.formatter -> decode_error -> unit
(** [pp_decode_error ppf e] pretty-prints the {!decode_error} [e] on [ppf]. *)

(** {1 Constants} *)

val encoding_chars : string
(** The Crockford base32 encoding alphabet:
    ["0123456789abcdefghjkmnpqrstvwxyz"]

    This 32-character alphabet excludes I, L, O, and U to prevent confusion
    with visually similar characters (1, 1, 0, and V). The alphabet is
    case-insensitive for decoding but returned in lowercase by encoding
    functions. *)

(** {1 Encoding and Decoding}

    The core encoding and decoding functions convert between 64-bit integers
    and their Crockford base32 string representations. *)

val encode :
  ?split_every:int -> ?min_length:int -> ?checksum:bool -> int64 -> string
(** [encode ?split_every ?min_length ?checksum n] encodes an int64 to a
    Crockford base32 string.

    The function converts a 64-bit integer into a base32 representation using
    the Crockford alphabet. The encoding process divides the number by 32
    repeatedly, using the remainder as an index into the alphabet.

    @param split_every Insert hyphens every N characters for readability. For
      example, [split_every:4] converts ["abcd1234"] to ["abcd-1234"].
      Hyphens are ignored during decoding. Default: no splitting.
    @param min_length Pad the output with leading zeros to reach this minimum
      length. When [checksum:true], the minimum length includes the
      2-character checksum. Default: no padding.
    @param checksum Append a 2-digit ISO 7064 (mod 97-10) checksum to detect
      transcription errors. The checksum is computed on the original number
      and appended as two additional characters. Default: [false].

    {b Examples:}
    {[
      encode 0L;;                          (* "0" *)
      encode 1234L;;                       (* "16j" *)
      encode ~min_length:6 1234L;;         (* "00016j" *)
      encode ~split_every:3 123456L;;      (* "3rj-0" *)
      encode ~checksum:true 1234L;;        (* "16j48" *)
      encode ~min_length:8 ~checksum:true ~split_every:4 1234L;;
                                           (* "0001-6j48" *)
    ]} *)

val decode : ?checksum:bool -> string -> int64
(** [decode ?checksum str] decodes a Crockford base32 string to int64.

    The function is designed to be forgiving of human input:
    - Case-insensitive: accepts both uppercase and lowercase letters
    - Strips hyphens automatically
    - Maps confusable characters: I/i and L/l → 1, O/o → 0

    @param checksum Expect and validate the last 2 characters as an ISO 7064
      checksum. If [true], the function verifies that the checksum matches
      the decoded value. Default: [false].
    @raise Decode_error with one of the following variants:
      - [Invalid_character] if an unrecognized character is encountered
      - [Invalid_checksum] if [checksum:true] but the string is too short or
        checksum has invalid format
      - [Checksum_mismatch] if the checksum doesn't match the decoded value

    {b Examples:}
    {[
      decode "16j";;                   (* 1234L *)
      decode "16J";;                   (* 1234L - case insensitive *)
      decode "1-6-j";;                 (* 1234L - hyphens ignored *)
      decode "I6j";;                   (* 1234L - 'I' mapped to '1' *)
      decode ~checksum:true "16j48";;  (* 1234L - with checksum validation *)
    ]} *)

(** {1 Utility Functions}

    Low-level functions for working with Crockford base32 strings and
    checksums. *)

val normalize : string -> string
(** [normalize str] normalizes a string for decoding.

    This function prepares a potentially messy human input string for
    decoding by:
    - Converting all characters to lowercase
    - Removing all hyphens ([-])
    - Mapping confusable characters: [I] and [L] → [1], [O] → [0]

    This is automatically called by {!decode}, but is exposed for cases where
    you want to normalize strings before storage or comparison.

    {b Examples:}
    {[
      normalize "ABC-123";;  (* "abc123" *)
      normalize "IlO";;      (* "110" - confusables mapped *)
      normalize "A-B-C";;    (* "abc" - hyphens removed *)
      normalize "HELLO";;    (* "he110" - 'L's and 'O' mapped *)
    ]} *)

val validate : int64 -> checksum:int64 -> bool
(** [validate n ~checksum] validates that a checksum matches the expected
    value for a number.

    This function computes the ISO 7064 (mod 97-10) checksum for the given
    number and compares it with the provided checksum value.

    @param n The integer value to validate
    @param checksum The expected checksum value (0-96)
    @return [true] if the checksum is valid, [false] otherwise

    {b Examples:}
    {[
      let cs = generate_checksum 1234L in
      validate 1234L ~checksum:cs;;   (* true *)
      validate 1234L ~checksum:99L;;  (* false *)
    ]} *)

val generate_checksum : int64 -> int64
(** [generate_checksum n] computes an ISO 7064 (mod 97-10) checksum for a
    number.

    The ISO 7064 algorithm provides a checksum that can detect:
    - All single-digit errors
    - Most adjacent transposition errors
    - Most twin errors (where two identical digits are replaced by two other
      identical digits)

    The checksum value is in the range 0-96 (or 00-96 when formatted with 2
    digits).

    {b Examples:}
    {[
      generate_checksum 0L;;       (* 1L *)
      generate_checksum 1234L;;    (* 48L *)
      generate_checksum 123456L;;  (* 87L *)
    ]} *)