(* Pure OCaml YAML 1.2 reader and writer using Bytesrw *)
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(** Character input source with lookahead, based on Bytes.Reader.t

    This module wraps a bytesrw [Bytes.Reader.t] to provide
    character-by-character access with lookahead for the YAML scanner.
    Lookahead beyond the slice currently being consumed is done with
    [Bytes.Reader.sniff], which leaves the sniffed bytes available to later
    reads.

    The same input type works with any reader source: strings, files,
    channels, or streaming sources like Eio.

    NOTE(review): bytesrw documents a slice returned by [Bytes.Reader.read]
    as valid only until the next read. [sniff] reads from the reader while
    [current_slice] may still be held, so on readers that reuse their buffer
    the held slice could be invalidated by cross-slice lookahead -- confirm
    against the readers actually used. *)

open Bytesrw

(** Re-export character classification *)
include Char_class

type t = {
  reader : Bytes.Reader.t;
  mutable current_slice : Bytes.Slice.t option;
      (** Current slice being consumed *)
  mutable slice_pos : int;  (** Position within current slice *)
  mutable position : Position.t;  (** Line/column tracking *)
}

(** [ensure_slice t] makes sure [t] holds a slice with at least one unread
    byte, reading the next slice from the reader if the current one is
    missing or exhausted. Returns [true] if data is available and [false]
    once the reader reports end of data (in which case the current slice is
    cleared). *)
let ensure_slice t =
  match t.current_slice with
  | Some slice when t.slice_pos < Bytes.Slice.length slice -> true
  | Some _ | None ->
      let slice = Bytes.Reader.read t.reader in
      if Bytes.Slice.is_eod slice then begin
        t.current_slice <- None;
        false
      end
      else begin
        t.current_slice <- Some slice;
        t.slice_pos <- 0;
        true
      end

(** [peek_current t] is the unread byte at the cursor within the current
    slice, without advancing. Returns [None] if there is no current slice or
    it is exhausted; it never reads from the reader. *)
let peek_current t =
  match t.current_slice with
  | Some slice when t.slice_pos < Bytes.Slice.length slice ->
      let bytes = Bytes.Slice.bytes slice in
      let first = Bytes.Slice.first slice in
      Some (Stdlib.Bytes.get bytes (first + t.slice_pos))
  | Some _ | None -> None

(** [of_reader ?initial_position reader] creates an input over [reader].

    The first bytes are sniffed and, if they form a UTF-8 byte order mark
    ([EF BB BF]), the mark is skipped on the reader so it is never returned
    as content. [initial_position] (default [Position.initial]) is the
    position assigned to the first character. *)
let of_reader ?(initial_position = Position.initial) reader =
  let t =
    { reader; current_slice = None; slice_pos = 0; position = initial_position }
  in
  let utf8_bom = "\xEF\xBB\xBF" in
  (* Use sniff for BOM detection: the bytes remain readable afterwards. *)
  let sample = Bytes.Reader.sniff 4 t.reader in
  if String.starts_with ~prefix:utf8_bom sample then
    Bytes.Reader.skip (String.length utf8_bom) t.reader;
  t

(** [of_string s] creates an input reading from the string [s]. *)
let of_string s =
  let reader = Bytes.Reader.of_string s in
  of_reader reader

(** [position t] is the current line/column position of [t]. *)
let position t = t.position

(** [is_eof t] is [true] when no more characters are available. May read the
    next slice from the reader. *)
let is_eof t = not (ensure_slice t)

(** [peek t] is the next character without consuming it, or [None] at end of
    input. May read the next slice from the reader. *)
let peek t = if ensure_slice t then peek_current t else None

(** [peek_exn t] is like {!peek} but raises via [Error.raise_at] with
    [Unexpected_eof] at the current position when the input is exhausted. *)
let peek_exn t =
  match peek t with
  | Some c -> c
  | None -> Error.raise_at t.position Unexpected_eof

(** [peek_nth t n] is the character [n] positions ahead of the cursor
    (0-indexed, so [peek_nth t 0] is [peek t]), or [None] if the input ends
    before it. [n] is expected to be non-negative.

    The reader is only sniffed when the requested character lies beyond the
    current slice; lookahead that stays inside the slice touches no I/O. *)
let peek_nth t n =
  if n = 0 then peek t
  else
    (* Look [offset] bytes past whatever the reader would return next. *)
    let sniff_at offset =
      let sample = Bytes.Reader.sniff (offset + 1) t.reader in
      if offset < String.length sample then Some sample.[offset] else None
    in
    match t.current_slice with
    | Some slice ->
        let in_slice = Bytes.Slice.length slice - t.slice_pos in
        if n < in_slice then
          let bytes = Bytes.Slice.bytes slice in
          let first = Bytes.Slice.first slice in
          Some (Stdlib.Bytes.get bytes (first + t.slice_pos + n))
        else
          (* The reader's next byte sits [in_slice] positions ahead. *)
          sniff_at (n - in_slice)
    | None -> sniff_at n

(** [peek_string t n] is up to [n] upcoming characters as a string, without
    consuming them. The result is shorter than [n] if the input ends first,
    and [""] when [n <= 0] or at end of input. *)
let rec peek_string t n =
  if n <= 0 then ""
  else
    match t.current_slice with
    | Some slice ->
        let bytes = Bytes.Slice.bytes slice in
        let start = Bytes.Slice.first slice + t.slice_pos in
        let in_slice = Bytes.Slice.length slice - t.slice_pos in
        if n <= in_slice then
          (* All within current slice *)
          Stdlib.Bytes.sub_string bytes start n
        else begin
          (* Rest of the current slice followed by sniffed reader bytes. *)
          let sample = Bytes.Reader.sniff (n - in_slice) t.reader in
          let buf = Buffer.create n in
          Buffer.add_subbytes buf bytes start in_slice;
          Buffer.add_string buf sample;
          Buffer.contents buf
        end
    | None -> if ensure_slice t then peek_string t n else ""

(** [next t] consumes and returns the next character, advancing the tracked
    position with [Position.advance_char]. Returns [None] at end of input.

    When the last byte of the current slice is consumed the slice is dropped,
    so {!peek_back} returns [None] immediately afterwards. *)
let next t =
  if ensure_slice t then begin
    match t.current_slice with
    | Some slice ->
        let bytes = Bytes.Slice.bytes slice in
        let first = Bytes.Slice.first slice in
        let c = Stdlib.Bytes.get bytes (first + t.slice_pos) in
        t.slice_pos <- t.slice_pos + 1;
        t.position <- Position.advance_char c t.position;
        (* Check if we've exhausted this slice *)
        if t.slice_pos >= Bytes.Slice.length slice then
          t.current_slice <- None;
        Some c
    | None -> None
  end
  else None

(** [next_exn t] is like {!next} but raises via [Error.raise_at] with
    [Unexpected_eof] at the current position when the input is exhausted. *)
let next_exn t =
  match next t with
  | Some c -> c
  | None -> Error.raise_at t.position Unexpected_eof

(** [skip t n] consumes up to [n] characters; reaching end of input early is
    not an error. *)
let skip t n =
  for _ = 1 to n do
    ignore (next t)
  done

(** [skip_while t pred] consumes characters for as long as the next one
    satisfies [pred], stopping at the first that does not or at end of
    input. *)
let skip_while t pred =
  let rec loop () =
    match peek t with
    | Some c when pred c ->
        ignore (next t);
        loop ()
    | Some _ | None -> ()
  in
  loop ()

(** [next_is pred t] is [true] iff there is a next character and it
    satisfies [pred]. Nothing is consumed. *)
let next_is pred t =
  match peek t with
  | None -> false
  | Some c -> pred c

let next_is_break t = next_is is_break t
let next_is_blank t = next_is is_blank t
let next_is_whitespace t = next_is is_whitespace t
let next_is_digit t = next_is is_digit t
let next_is_hex t = next_is is_hex t
let next_is_alpha t = next_is is_alpha t
let next_is_indicator t = next_is is_indicator t

(** [at_document_boundary t] is [true] when the cursor is at column 1 and the
    upcoming characters are [---] or [...] followed either by end of input
    or by a character satisfying [is_whitespace]. Nothing is consumed. *)
let at_document_boundary t =
  if t.position.column <> 1 then false
  else
    let s = peek_string t 4 in
    let has_marker =
      String.starts_with ~prefix:"---" s || String.starts_with ~prefix:"..." s
    in
    (* A marker match guarantees at least 3 characters, so [s] is 3 or 4 long. *)
    has_marker && (String.length s = 3 || is_whitespace s.[3])

(** [consume_break t] consumes one line break if the next character is one,
    treating [\r\n] as a single break. Does nothing otherwise. *)
let consume_break t =
  match peek t with
  | Some '\r' -> (
      ignore (next t);
      match peek t with
      | Some '\n' -> ignore (next t)
      | Some _ | None -> ())
  | Some '\n' -> ignore (next t)
  | Some _ | None -> ()

(** [remaining t] is all content from the cursor to the end of input.

    The unread part of the current slice is copied and the rest of the
    reader is then added to the buffer with [Bytes.Reader.add_to_buffer].
    The cursor and tracked position of [t] are not updated. *)
let remaining t =
  let buf = Buffer.create 256 in
  (* Add current slice remainder *)
  (match t.current_slice with
  | Some slice ->
      let bytes = Bytes.Slice.bytes slice in
      let first = Bytes.Slice.first slice in
      let unread = Bytes.Slice.length slice - t.slice_pos in
      if unread > 0 then Buffer.add_subbytes buf bytes (first + t.slice_pos) unread
  | None -> ());
  (* Add remaining from reader *)
  Bytes.Reader.add_to_buffer buf t.reader;
  Buffer.contents buf

(** [mark t] is the current position, for later span creation. *)
let mark t = t.position

(** [peek_back t] is the character just before the cursor (look-behind).

    Only the current slice is consulted: the result is [None] at the start
    of a slice, when no slice is held, and right after the last byte of a
    slice has been consumed, even if a previous character exists. *)
let peek_back t =
  match t.current_slice with
  | Some slice when t.slice_pos > 0 ->
      let bytes = Bytes.Slice.bytes slice in
      let first = Bytes.Slice.first slice in
      Some (Stdlib.Bytes.get bytes (first + t.slice_pos - 1))
  | Some _ | None -> None

(** [source t] is a sample of up to 4 upcoming bytes, for encoding
    detection. Nothing is consumed.

    When a slice is held, the sample is completed from the reader if the
    slice has fewer than 4 unread bytes, so a short result means the input
    really ends there. *)
let source t =
  match t.current_slice with
  | Some _ -> peek_string t 4
  | None ->
      (* No slice held: sniff the reader directly without loading one. *)
      Bytes.Reader.sniff 4 t.reader

(** [byte_pos t] is the byte position of the cursor in the underlying
    stream.

    [Bytes.Reader.pos] counts every byte already handed out by the reader,
    including the part of the current slice not yet consumed, so that unread
    remainder is subtracted. *)
let byte_pos t =
  let unread =
    match t.current_slice with
    | Some slice -> Bytes.Slice.length slice - t.slice_pos
    | None -> 0
  in
  Bytes.Reader.pos t.reader - unread