Pure OCaml Yaml 1.2 reader and writer using Bytesrw
at main 1.5 kB view raw
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  SPDX-License-Identifier: ISC
 ---------------------------------------------------------------------------*)

(** Character encoding detection and handling *)

(** The character encodings that {!detect} can report. *)
type t = [ `Utf8 | `Utf16be | `Utf16le | `Utf32be | `Utf32le ]

(** [to_string enc] is the conventional upper-case name of [enc],
    e.g. ["UTF-16LE"]. *)
let to_string = function
  | `Utf8 -> "UTF-8"
  | `Utf16be -> "UTF-16BE"
  | `Utf16le -> "UTF-16LE"
  | `Utf32be -> "UTF-32BE"
  | `Utf32le -> "UTF-32LE"

(** [pp fmt enc] prints [to_string enc] on the formatter [fmt]. *)
let pp fmt t = Format.pp_print_string fmt (to_string t)

(** [detect s] guesses the encoding of [s] from its first (up to four) bytes.

    Returns [(encoding, bom_length)], where [bom_length] is the number of
    leading bytes occupied by a byte-order mark ([0] when the guess was made
    from content rather than from a BOM). [bom_length] never exceeds
    [String.length s].

    BOM patterns are tried first, then the position of NUL bytes around the
    first character. Anything unrecognised, including the empty string, is
    reported as [(`Utf8, 0)].

    Only bytes that are actually present in [s] take part in the match: an
    input shorter than four bytes is never treated as if it were padded with
    NUL bytes. For example, ["a"] is UTF-8 (not UTF-32LE) and a bare
    ["\xFF\xFE"] is a UTF-16LE BOM (not a UTF-32LE one). *)
let detect s =
  let len = String.length s in
  (* Bytes past the end of [s] are represented by -1, a value no real byte
     can take, so that the 0x00 patterns below only match genuine NULs. *)
  let byte i = if i < len then Char.code s.[i] else -1 in
  match (byte 0, byte 1, byte 2, byte 3) with
  (* BOM patterns *)
  | 0xEF, 0xBB, 0xBF, _ -> (`Utf8, 3)
  | 0xFE, 0xFF, _, _ -> (`Utf16be, 2)
  (* The UTF-32LE BOM starts with the UTF-16LE BOM, so it must come first. *)
  | 0xFF, 0xFE, 0x00, 0x00 -> (`Utf32le, 4)
  | 0xFF, 0xFE, _, _ -> (`Utf16le, 2)
  | 0x00, 0x00, 0xFE, 0xFF -> (`Utf32be, 4)
  (* Content pattern detection (no BOM). The guards use [> 0x00] rather than
     [<> 0x00] so that the -1 "missing byte" sentinel is rejected as well. *)
  | 0x00, 0x00, 0x00, b3 when b3 > 0x00 -> (`Utf32be, 0)
  | b0, 0x00, 0x00, 0x00 when b0 > 0x00 -> (`Utf32le, 0)
  | 0x00, b1, _, _ when b1 > 0x00 -> (`Utf16be, 0)
  | b0, 0x00, _, _ when b0 > 0x00 -> (`Utf16le, 0)
  | _ -> (`Utf8, 0)

(** [equal a b] is [true] iff [a] and [b] are the same encoding. *)
let equal a b = a = b