Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** Character encoding detection and handling *)
7
(** The character encodings this module can name and detect. *)
type t =
  [ `Utf8
  | `Utf16be
  | `Utf16le
  | `Utf32be
  | `Utf32le ]
9
(** [to_string enc] is the upper-case name of the encoding [enc], for
    example ["UTF-16BE"] for [`Utf16be]. *)
let to_string encoding =
  match encoding with
  | `Utf8 -> "UTF-8"
  | `Utf16le -> "UTF-16LE"
  | `Utf16be -> "UTF-16BE"
  | `Utf32le -> "UTF-32LE"
  | `Utf32be -> "UTF-32BE"
16
(** [pp ppf enc] prints the name of [enc], as produced by [to_string], on
    the formatter [ppf]. *)
let pp ppf encoding = encoding |> to_string |> Format.pp_print_string ppf
18
(** [detect s] guesses the character encoding of [s] from its first (up to
    four) bytes and returns [(encoding, bom_length)].

    A byte order mark is recognised first; [bom_length] is then the number
    of bytes it occupies. Without a BOM, the position of NUL bytes around
    the first non-NUL byte is used to tell UTF-32 and UTF-16 apart (this
    presumes the first character of the input is ASCII), and [bom_length]
    is [0]. Anything else, including the empty string, yields
    [(`Utf8, 0)].

    Only bytes actually present in [s] take part in the match: a missing
    byte never counts as [0x00]. Hence a one-byte input such as ["a"] is
    reported as UTF-8 rather than UTF-32LE, and the returned [bom_length]
    never exceeds [String.length s]. *)
let detect s =
  let len = String.length s in
  (* [-1] marks a position past the end of [s]. It cannot equal any real
     byte value, so short inputs cannot match the NUL-based patterns. *)
  let byte i = if i < len then Char.code s.[i] else -1 in
  match (byte 0, byte 1, byte 2, byte 3) with
  (* BOM patterns. UTF-32LE must be tried before UTF-16LE because its BOM
     begins with the UTF-16LE one. *)
  | 0xEF, 0xBB, 0xBF, _ -> (`Utf8, 3)
  | 0xFE, 0xFF, _, _ -> (`Utf16be, 2)
  | 0xFF, 0xFE, 0x00, 0x00 -> (`Utf32le, 4)
  | 0xFF, 0xFE, _, _ -> (`Utf16le, 2)
  | 0x00, 0x00, 0xFE, 0xFF -> (`Utf32be, 4)
  (* Content patterns (no BOM). The guards use [> 0] so that both NUL and
     the end-of-input marker are rejected. *)
  | 0x00, 0x00, 0x00, b3 when b3 > 0 -> (`Utf32be, 0)
  | b0, 0x00, 0x00, 0x00 when b0 > 0 -> (`Utf32le, 0)
  | 0x00, b1, _, _ when b1 > 0 -> (`Utf16be, 0)
  | b0, 0x00, _, _ when b0 > 0 -> (`Utf16le, 0)
  | _ -> (`Utf8, 0)
41
(** [equal a b] is [true] iff [a] and [b] are structurally equal, using
    the polymorphic [=] operator. *)
let equal lhs rhs = ( = ) lhs rhs