Pure OCaml Yaml 1.2 reader and writer using Bytesrw
at main 8.3 kB view raw
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  SPDX-License-Identifier: ISC
 ---------------------------------------------------------------------------*)

(** Character input source with lookahead, based on Bytes.Reader.t

    This module wraps a bytesrw [Bytes.Reader.t] to provide
    character-by-character access with lookahead for the YAML scanner. Uses
    bytesrw's sniff and push_back for efficient lookahead without excessive
    copying.

    The same input type works with any reader source: strings, files, channels,
    or streaming sources like Eio. *)

open Bytesrw

include Char_class
(** Re-export character classification *)

type t = {
  reader : Bytes.Reader.t;
  mutable current_slice : Bytes.Slice.t option;
      (** Current slice being consumed *)
  mutable slice_pos : int;  (** Position within current slice *)
  mutable position : Position.t;  (** Line/column tracking *)
}

(** [ensure_slice t] makes sure [t] holds a slice with at least one unconsumed
    byte, reading the next slice from the reader when the current one is
    missing or exhausted. Returns [true] if data is available and [false] once
    the reader reports end of data. *)
let ensure_slice t =
  match t.current_slice with
  | Some slice when t.slice_pos < Bytes.Slice.length slice -> true
  | _ ->
      let slice = Bytes.Reader.read t.reader in
      if Bytes.Slice.is_eod slice then begin
        t.current_slice <- None;
        false
      end
      else begin
        t.current_slice <- Some slice;
        t.slice_pos <- 0;
        true
      end

(** [peek_current t] is the byte under the cursor in the current slice, without
    advancing and without reading from the reader. Returns [None] when there is
    no current slice or it is exhausted. *)
let peek_current t =
  match t.current_slice with
  | Some slice when t.slice_pos < Bytes.Slice.length slice ->
      let bytes = Bytes.Slice.bytes slice in
      let first = Bytes.Slice.first slice in
      Some (Stdlib.Bytes.get bytes (first + t.slice_pos))
  | _ -> None

(** [of_reader ?initial_position reader] creates an input from a
    [Bytes.Reader.t]. Position tracking starts at [initial_position] (default
    [Position.initial]). A leading UTF-8 byte order mark, if present, is
    skipped and does not affect the tracked position. *)
let of_reader ?(initial_position = Position.initial) reader =
  let t =
    { reader; current_slice = None; slice_pos = 0; position = initial_position }
  in
  (* Use sniff for BOM detection - this is exactly what sniff is for *)
  let sample = Bytes.Reader.sniff 4 t.reader in
  let bom_len =
    if
      String.length sample >= 3
      && sample.[0] = '\xEF'
      && sample.[1] = '\xBB'
      && sample.[2] = '\xBF'
    then 3 (* UTF-8 BOM *)
    else 0
  in
  (* Skip BOM if present *)
  if bom_len > 0 then Bytes.Reader.skip bom_len t.reader;
  t

(** [of_string s] creates an input reading the bytes of [s]. *)
let of_string s =
  let reader = Bytes.Reader.of_string s in
  of_reader reader

(** [position t] is the current line/column position of [t]. *)
let position t = t.position

(** [is_eof t] is [true] when no more bytes can be obtained. May read the next
    slice from the reader. *)
let is_eof t = not (ensure_slice t)

(** [peek t] is the next character without consuming it, or [None] at end of
    input. May read the next slice from the reader. *)
let peek t = if ensure_slice t then peek_current t else None

(** [peek_exn t] is like {!peek} but raises via [Error.raise_at] with
    [Unexpected_eof] at the current position when the input is exhausted. *)
let peek_exn t =
  match peek t with
  | Some c -> c
  | None -> Error.raise_at t.position Unexpected_eof

(** [peek_nth t n] peeks at the [n]th character (0-indexed from the current
    position) without consuming anything. Returns [None] if fewer than [n + 1]
    characters remain, or if [n] is negative. *)
let peek_nth t n =
  if n < 0 then None
  else if n = 0 then peek t
  else begin
    (* Look [k] bytes past everything buffered in [current_slice]. Sniffing is
       deferred to here so the reader is only touched when the answer is not
       already in the current slice; sniff does not consume the bytes. *)
    let beyond_slice k =
      let sample = Bytes.Reader.sniff (k + 1) t.reader in
      if k < String.length sample then Some sample.[k] else None
    in
    match t.current_slice with
    | Some slice ->
        let slice_bytes = Bytes.Slice.bytes slice in
        let slice_first = Bytes.Slice.first slice in
        let slice_remaining = Bytes.Slice.length slice - t.slice_pos in
        if n < slice_remaining then
          Some (Stdlib.Bytes.get slice_bytes (slice_first + t.slice_pos + n))
        else
          (* The reader's next byte follows the end of the current slice, so
             the offset into the reader excludes what the slice still holds. *)
          beyond_slice (n - slice_remaining)
    | None -> beyond_slice n
  end

(** [peek_string t n] peeks at up to [n] characters as a string without
    consuming them. The result is shorter than [n] when the input ends sooner,
    and empty when [n <= 0] or at end of input. *)
let rec peek_string t n =
  if n <= 0 then ""
  else begin
    match t.current_slice with
    | Some slice ->
        let slice_bytes = Bytes.Slice.bytes slice in
        let slice_first = Bytes.Slice.first slice in
        let slice_remaining = Bytes.Slice.length slice - t.slice_pos in
        if n <= slice_remaining then
          (* All within current slice *)
          Stdlib.Bytes.sub_string slice_bytes (slice_first + t.slice_pos) n
        else begin
          (* Need data from beyond current slice - use sniff *)
          let needed_from_reader = n - slice_remaining in
          let sample = Bytes.Reader.sniff needed_from_reader t.reader in
          let buf = Buffer.create n in
          Buffer.add_subbytes buf slice_bytes
            (slice_first + t.slice_pos)
            slice_remaining;
          Buffer.add_string buf sample;
          Buffer.contents buf
        end
    | None -> if ensure_slice t then peek_string t n else ""
  end

(** [next t] consumes and returns the next character, advancing the tracked
    position with [Position.advance_char]. Returns [None] at end of input. *)
let next t =
  if ensure_slice t then begin
    match t.current_slice with
    | Some slice ->
        let bytes = Bytes.Slice.bytes slice in
        let first = Bytes.Slice.first slice in
        let c = Stdlib.Bytes.get bytes (first + t.slice_pos) in
        t.slice_pos <- t.slice_pos + 1;
        t.position <- Position.advance_char c t.position;
        (* Drop the slice once exhausted so the next access reads a fresh one.
           As a consequence [peek_back] cannot see this character. *)
        if t.slice_pos >= Bytes.Slice.length slice then t.current_slice <- None;
        Some c
    | None -> None
  end
  else None

(** [next_exn t] is like {!next} but raises via [Error.raise_at] with
    [Unexpected_eof] at the current position when the input is exhausted. *)
let next_exn t =
  match next t with
  | Some c -> c
  | None -> Error.raise_at t.position Unexpected_eof

(** [skip t n] consumes up to [n] characters, stopping early at end of input.
    Does nothing when [n <= 0]. *)
let skip t n =
  let rec go k =
    if k > 0 then
      match next t with
      | Some _ -> go (k - 1)
      | None -> () (* end of input: nothing left to skip *)
  in
  go n

(** [skip_while t pred] consumes characters for as long as the next one
    satisfies [pred]; stops at the first non-matching character or at end of
    input. *)
let skip_while t pred =
  let rec loop () =
    match peek t with
    | Some c when pred c ->
        ignore (next t);
        loop ()
    | _ -> ()
  in
  loop ()

(** [next_is pred t] checks whether the next character satisfies [pred]. It is
    [false] at end of input. Nothing is consumed. *)
let next_is pred t = match peek t with None -> false | Some c -> pred c

(** Specialisations of {!next_is} to the character classes re-exported from
    [Char_class]. *)

let next_is_break t = next_is is_break t
let next_is_blank t = next_is is_blank t
let next_is_whitespace t = next_is is_whitespace t
let next_is_digit t = next_is is_digit t
let next_is_hex t = next_is is_hex t
let next_is_alpha t = next_is is_alpha t
let next_is_indicator t = next_is is_indicator t

(** [at_document_boundary t] checks if the input is at a document boundary:
    the cursor is in column 1 and the next three characters are [---] or
    [...], followed either by end of input or by a character satisfying
    [is_whitespace]. Nothing is consumed. *)
let at_document_boundary t =
  if t.position.column <> 1 then false
  else begin
    (* Three marker characters plus the one that must terminate the marker. *)
    let s = peek_string t 4 in
    let len = String.length s in
    if len < 3 then false
    else
      let prefix = String.sub s 0 3 in
      (prefix = "---" || prefix = "...") && (len = 3 || is_whitespace s.[3])
  end

(** [consume_break t] consumes one line break if present, handling [\r\n] as a
    single break. A lone [\r] or [\n] is also consumed; anything else is left
    untouched. *)
let consume_break t =
  match peek t with
  | Some '\r' -> (
      ignore (next t);
      match peek t with Some '\n' -> ignore (next t) | _ -> ())
  | Some '\n' -> ignore (next t)
  | _ -> ()

(** [remaining t] is the remaining content from the current position: the
    unread tail of the current slice followed by everything left in the
    reader.

    NOTE(review): this reads the underlying reader through to its end but does
    not update [t]'s current slice, cursor or tracked position, so [t] should
    presumably not be used for further scanning afterwards - confirm against
    callers. *)
let remaining t =
  let buf = Buffer.create 256 in
  (* Add current slice remainder *)
  (match t.current_slice with
  | Some slice ->
      let bytes = Bytes.Slice.bytes slice in
      let first = Bytes.Slice.first slice in
      let remaining = Bytes.Slice.length slice - t.slice_pos in
      if remaining > 0 then
        Buffer.add_subbytes buf bytes (first + t.slice_pos) remaining
  | None -> ());
  (* Add remaining from reader *)
  Bytes.Reader.add_to_buffer buf t.reader;
  Buffer.contents buf

(** [mark t] is the current position, for later span creation. *)
let mark t = t.position

(** [peek_back t] is the character just before the current position (limited
    lookbehind). Only the current slice is consulted, so the result is [None]
    at the start of input and also whenever the previous character belonged to
    a slice that has already been dropped, i.e. right after a slice boundary. *)
let peek_back t =
  match t.current_slice with
  | Some slice when t.slice_pos > 0 ->
      let bytes = Bytes.Slice.bytes slice in
      let first = Bytes.Slice.first slice in
      Some (Stdlib.Bytes.get bytes (first + t.slice_pos - 1))
  | _ -> None

(** [source t] is a sample of up to 4 upcoming bytes for encoding detection.
    Nothing is consumed: bytes come from the unread part of the current slice
    and, when that holds fewer than 4, are topped up by sniffing the reader.
    The result is shorter than 4 bytes only when the input ends sooner. *)
let source t =
  let wanted = 4 in
  match t.current_slice with
  | Some slice ->
      let bytes = Bytes.Slice.bytes slice in
      let first = Bytes.Slice.first slice in
      let available = min wanted (Bytes.Slice.length slice - t.slice_pos) in
      let head =
        Stdlib.Bytes.sub_string bytes (first + t.slice_pos) available
      in
      if available >= wanted then head
      else
        (* The slice ends before the sample is complete; the following bytes
           are still in the reader. *)
        head ^ Bytes.Reader.sniff (wanted - available) t.reader
  | None ->
      (* Use sniff to peek at reader *)
      Bytes.Reader.sniff wanted t.reader

(** [byte_pos t] is the byte position in the underlying stream of the next
    unconsumed character. The reader's own position already counts the whole
    slice held in [current_slice], so the unread part of that slice is
    subtracted. *)
let byte_pos t =
  let buffered =
    match t.current_slice with
    | Some slice -> Bytes.Slice.length slice - t.slice_pos
    | None -> 0
  in
  Bytes.Reader.pos t.reader - buffered