Pure OCaml YAML 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** Character encoding detection and handling *)
7
(** The character encodings this module can name and detect: UTF-8, and
    UTF-16 / UTF-32 in big-endian ([be]) or little-endian ([le]) byte order. *)
type t = [ `Utf8 | `Utf16be | `Utf16le | `Utf32be | `Utf32le ]
15
(** [to_string enc] is the upper-case textual name of [enc], for example
    ["UTF-16BE"] for the big-endian UTF-16 encoding. *)
let to_string enc =
  match enc with
  | `Utf32le -> "UTF-32LE"
  | `Utf32be -> "UTF-32BE"
  | `Utf16le -> "UTF-16LE"
  | `Utf16be -> "UTF-16BE"
  | `Utf8 -> "UTF-8"
22
(** [pp ppf enc] prints the {!to_string} name of [enc] on the formatter
    [ppf]. *)
let pp ppf enc = Format.pp_print_string ppf @@ to_string enc
25
(** [detect s] guesses the character encoding of [s] from its first (up to
    four) bytes and returns [(encoding, bom_length)].

    A byte order mark is looked for first; when one is found, [bom_length]
    is its size in bytes. Otherwise the position of NUL bytes around the
    first non-NUL byte is used to tell UTF-16 and UTF-32 apart, and
    [bom_length] is [0]. Any other input, including the empty string, is
    reported as UTF-8 with a [bom_length] of [0].

    Only bytes actually present in [s] take part in the match, so the
    returned [bom_length] never exceeds [String.length s]. *)
let detect s =
  let len = String.length s in
  (* Positions past the end of [s] read as [-1]. A real byte lies in 0..255,
     so a missing byte can never satisfy a literal pattern such as [0x00].
     Padding with zero instead would make a one-byte input look like
     UTF-32LE and a lone two-byte UTF-16LE mark look like the four-byte
     UTF-32LE mark. *)
  let byte i = if i < len then Char.code s.[i] else -1 in
  match (byte 0, byte 1, byte 2, byte 3) with
  (* Byte order marks. The UTF-32LE mark must be tried before the UTF-16LE
     one because it starts with the same two bytes. *)
  | 0xEF, 0xBB, 0xBF, _ -> (`Utf8, 3)
  | 0xFE, 0xFF, _, _ -> (`Utf16be, 2)
  | 0xFF, 0xFE, 0x00, 0x00 -> (`Utf32le, 4)
  | 0xFF, 0xFE, _, _ -> (`Utf16le, 2)
  | 0x00, 0x00, 0xFE, 0xFF -> (`Utf32be, 4)
  (* No mark: infer the encoding from where the NUL bytes sit. The [> 0]
     guards reject both a NUL byte and the [-1] end-of-input marker. *)
  | 0x00, 0x00, 0x00, b3 when b3 > 0 -> (`Utf32be, 0)
  | b0, 0x00, 0x00, 0x00 when b0 > 0 -> (`Utf32le, 0)
  | 0x00, b1, _, _ when b1 > 0 -> (`Utf16be, 0)
  | b0, 0x00, _, _ when b0 > 0 -> (`Utf16le, 0)
  | _ -> (`Utf8, 0)
49
(** [equal a b] is [true] exactly when [a] and [b] are structurally equal
    (this is the standard [=] operator under another name). *)
let equal = ( = )