Pure OCaml YAML 1.2 reader and writer using Bytesrw
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  SPDX-License-Identifier: ISC
 ---------------------------------------------------------------------------*)

(** Character encoding detection and handling *)

(** The character encodings that {!detect} can report. *)
type t = [
  | `Utf8
  | `Utf16be
  | `Utf16le
  | `Utf32be
  | `Utf32le
]

(** [to_string t] is the upper-case name of the encoding [t],
    for example ["UTF-16BE"]. *)
let to_string = function
  | `Utf8 -> "UTF-8"
  | `Utf16be -> "UTF-16BE"
  | `Utf16le -> "UTF-16LE"
  | `Utf32be -> "UTF-32BE"
  | `Utf32le -> "UTF-32LE"

(** [pp fmt t] prints [to_string t] on the formatter [fmt]. *)
let pp fmt t =
  Format.pp_print_string fmt (to_string t)

(** [detect s] guesses the encoding of [s] from its first four bytes.

    Returns [(encoding, bom_length)]. A byte order mark is looked for first;
    when one is found, [bom_length] is its size in bytes (2, 3 or 4). When no
    BOM is present the position of NUL bytes around the first character is
    used instead and [bom_length] is [0]. Anything that matches no pattern,
    including the empty string, is reported as [(`Utf8, 0)].

    Only bytes that are actually present in [s] take part in the match, so
    [bom_length] never exceeds [String.length s] and an input shorter than
    four bytes is never reported as UTF-32. *)
let detect s =
  let len = String.length s in
  (* Positions past the end of [s] read as [-1]. No literal byte pattern
     matches it and every guard below requires a strictly positive value, so
     missing bytes can be mistaken neither for NUL nor for a non-NUL byte. *)
  let byte i = if i < len then Char.code s.[i] else -1 in
  match (byte 0, byte 1, byte 2, byte 3) with
  (* BOM patterns *)
  | (0xEF, 0xBB, 0xBF, _) -> (`Utf8, 3)
  | (0xFE, 0xFF, _, _) -> (`Utf16be, 2)
  (* The UTF-32LE BOM begins with the UTF-16LE one, so it is tried first. *)
  | (0xFF, 0xFE, 0x00, 0x00) -> (`Utf32le, 4)
  | (0xFF, 0xFE, _, _) -> (`Utf16le, 2)
  | (0x00, 0x00, 0xFE, 0xFF) -> (`Utf32be, 4)
  (* Content pattern detection (no BOM): the four-byte UTF-32 shapes are
     tried before the two-byte UTF-16 shapes they would otherwise match. *)
  | (0x00, 0x00, 0x00, b3) when b3 > 0x00 -> (`Utf32be, 0)
  | (b0, 0x00, 0x00, 0x00) when b0 > 0x00 -> (`Utf32le, 0)
  | (0x00, b1, _, _) when b1 > 0x00 -> (`Utf16be, 0)
  | (b0, 0x00, _, _) when b0 > 0x00 -> (`Utf16le, 0)
  | _ -> (`Utf8, 0)

(** [equal a b] is [true] iff [a] and [b] are structurally equal. *)
let equal a b = a = b