Pure OCaml YAML 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** Character input source with lookahead, based on Bytes.Reader.t
7
8 This module wraps a bytesrw [Bytes.Reader.t] to provide character-by-character
9 access with lookahead for the YAML scanner. Uses bytesrw's sniff and push_back
10 for efficient lookahead without excessive copying.
11
12 The same input type works with any reader source: strings, files, channels,
13 or streaming sources like Eio. *)
14
15open Bytesrw
16
17(** Re-export character classification *)
18include Char_class
19
(** Input state: a byte reader plus a cursor into the slice most recently
    pulled from it, together with the tracked source position. *)
type t = {
  reader : Bytes.Reader.t;
      (** Reader the slices are pulled from *)
  mutable current_slice : Bytes.Slice.t option;
      (** Slice currently being consumed, [None] when no slice is loaded *)
  mutable slice_pos : int;
      (** Offset of the next unread byte, relative to the slice start *)
  mutable position : Position.t;
      (** Line/column tracking *)
}
26
(** [ensure_slice t] guarantees that [t] holds a slice with at least one
    unread byte, reading the next slice from the reader when the current one
    is absent or used up. Returns [true] when a byte is available and [false]
    when the reader answers with the end-of-data slice (in which case
    [current_slice] is cleared). *)
let ensure_slice t =
  let has_unread =
    match t.current_slice with
    | Some s -> t.slice_pos < Bytes.Slice.length s
    | None -> false
  in
  has_unread
  ||
  let fresh = Bytes.Reader.read t.reader in
  if Bytes.Slice.is_eod fresh then (
    t.current_slice <- None;
    false)
  else (
    t.current_slice <- Some fresh;
    t.slice_pos <- 0;
    true)
41
(** [peek_current t] is the byte under the cursor in the currently loaded
    slice, or [None] when no slice is loaded or the cursor is at/past its
    end. Never reads from the reader and never moves the cursor. *)
let peek_current t =
  match t.current_slice with
  | None -> None
  | Some s ->
      if t.slice_pos >= Bytes.Slice.length s then None
      else
        let idx = Bytes.Slice.first s + t.slice_pos in
        Some (Stdlib.Bytes.get (Bytes.Slice.bytes s) idx)
50
(** [of_reader ?initial_position reader] builds an input over [reader].

    The first bytes are sniffed (without consuming them) and, if they are the
    UTF-8 byte order mark [EF BB BF], those three bytes are skipped on the
    reader. The tracked position starts at [initial_position]
    (default {!Position.initial}) and is not advanced for a skipped BOM. *)
let of_reader ?(initial_position = Position.initial) reader =
  let utf8_bom = "\xEF\xBB\xBF" in
  let bom_size = String.length utf8_bom in
  (* sniff leaves the sampled bytes available on the reader *)
  let head = Bytes.Reader.sniff 4 reader in
  let starts_with_bom =
    String.length head >= bom_size && String.sub head 0 bom_size = utf8_bom
  in
  if starts_with_bom then Bytes.Reader.skip bom_size reader;
  { reader; current_slice = None; slice_pos = 0; position = initial_position }
73
(** [of_string s] is an input reading the bytes of [s], starting at the
    default initial position. *)
let of_string s = of_reader (Bytes.Reader.of_string s)
78
(** [position t] is the position currently tracked by [t]. *)
let position { position; _ } = position
80
(** [is_eof t] is [true] when no further byte can be obtained. May pull the
    next slice from the reader in order to find out. *)
let is_eof t = ensure_slice t |> not
83
(** [peek t] is the next character without consuming it, or [None] at end
    of input. May load the next slice from the reader. *)
let peek t =
  match ensure_slice t with
  | true -> peek_current t
  | false -> None
89
(** [peek_exn t] is like {!peek} but, at end of input, reports
    [Unexpected_eof] at the current position through [Error.raise_at]
    instead of returning [None]. *)
let peek_exn t =
  let eof () = Error.raise_at t.position Unexpected_eof in
  match peek t with
  | None -> eof ()
  | Some c -> c
94
(** [peek_nth t n] is the character [n] positions ahead of the cursor
    (0-indexed, so [peek_nth t 0] is [peek t]) without consuming anything,
    or [None] if the input ends before that position.

    A negative [n] yields [None]. The reader is only sniffed when the
    requested byte lies beyond the currently loaded slice, so lookahead that
    can be answered from buffered data never touches (or waits on) the
    underlying source. *)
let peek_nth t n =
  if n < 0 then None
  else if n = 0 then peek t
  else
    (* [from_reader k] is the byte [k] positions into the reader's pending
       data; sniff leaves those bytes available on the reader. *)
    let from_reader k =
      let sample = Bytes.Reader.sniff (k + 1) t.reader in
      if k < String.length sample then Some sample.[k] else None
    in
    match t.current_slice with
    | None -> from_reader n
    | Some slice ->
        let unread = Bytes.Slice.length slice - t.slice_pos in
        if n < unread then
          let idx = Bytes.Slice.first slice + t.slice_pos + n in
          Some (Stdlib.Bytes.get (Bytes.Slice.bytes slice) idx)
        else
          (* The first [unread] positions are covered by the slice; the rest
             comes from what the reader has not handed out yet. *)
          from_reader (n - unread)
124
(** [peek_string t n] is a string of up to [n] upcoming characters, without
    consuming them. The result is shorter than [n] when the input ends
    early, and [""] when [n <= 0] or the input is exhausted.

    Bytes are taken from the loaded slice first; anything beyond it is
    obtained by sniffing the reader. When no slice is loaded, one is pulled
    in via [ensure_slice] before retrying. *)
let rec peek_string t n =
  match t.current_slice with
  | _ when n <= 0 -> ""
  | None -> if ensure_slice t then peek_string t n else ""
  | Some s ->
      let src = Bytes.Slice.bytes s in
      let start = Bytes.Slice.first s + t.slice_pos in
      let avail = Bytes.Slice.length s - t.slice_pos in
      if n <= avail then Stdlib.Bytes.sub_string src start n
      else
        (* The slice is too short: complete the sample from the reader *)
        let tail = Bytes.Reader.sniff (n - avail) t.reader in
        let head = Stdlib.Bytes.sub_string src start avail in
        head ^ tail
152
(** [next t] consumes and returns the next character, or [None] at end of
    input. The tracked position is advanced with [Position.advance_char],
    and the current slice is dropped as soon as its last byte has been
    consumed. *)
let next t =
  if not (ensure_slice t) then None
  else
    match t.current_slice with
    | None -> None
    | Some s ->
        let idx = Bytes.Slice.first s + t.slice_pos in
        let c = Stdlib.Bytes.get (Bytes.Slice.bytes s) idx in
        let consumed = t.slice_pos + 1 in
        t.slice_pos <- consumed;
        t.position <- Position.advance_char c t.position;
        (* Slice fully consumed: forget it so the next access reads anew *)
        if consumed >= Bytes.Slice.length s then t.current_slice <- None;
        Some c
170
(** [next_exn t] is like {!next} but, at end of input, reports
    [Unexpected_eof] at the current position through [Error.raise_at]
    instead of returning [None]. *)
let next_exn t =
  let eof () = Error.raise_at t.position Unexpected_eof in
  match next t with
  | None -> eof ()
  | Some c -> c
175
(** [skip t n] calls {!next} [n] times, discarding the results. Nothing
    happens when [n <= 0]; hitting end of input early is not an error. *)
let skip t n =
  let rec drop left =
    if left > 0 then begin
      ignore (next t);
      drop (left - 1)
    end
  in
  drop n
180
(** [skip_while t pred] consumes characters for as long as the upcoming one
    satisfies [pred]; it stops at the first character that does not, or at
    end of input. *)
let skip_while t pred =
  let upcoming_matches () =
    match peek t with
    | Some c -> pred c
    | None -> false
  in
  while upcoming_matches () do
    ignore (next t)
  done
188
(** [next_is pred t] is [true] iff there is a next character and it
    satisfies [pred]. Nothing is consumed. *)
let next_is pred t = Option.fold ~none:false ~some:pred (peek t)
194
(* Shorthands applying {!next_is} to the character-class predicates that
   come from the included [Char_class] module. *)
let next_is_break t = t |> next_is is_break
let next_is_blank t = t |> next_is is_blank
let next_is_whitespace t = t |> next_is is_whitespace
let next_is_digit t = t |> next_is is_digit
let next_is_hex t = t |> next_is is_hex
let next_is_alpha t = t |> next_is is_alpha
let next_is_indicator t = t |> next_is is_indicator
202
(** [at_document_boundary t] is [true] when the cursor is in column 1 and
    the upcoming text is ["---"] or ["..."] followed either by the end of
    input or by a character accepted by [is_whitespace]. Nothing is
    consumed. *)
let at_document_boundary t =
  if t.position.column = 1 then
    let window = peek_string t 4 in
    match String.length window with
    | 0 | 1 | 2 -> false
    | len ->
        let marker = String.sub window 0 3 in
        (marker = "---" || marker = "...")
        && (len = 3 || is_whitespace window.[3])
  else false
215
(** [consume_break t] consumes one line break if the cursor is at one:
    a lone ["\r"], a lone ["\n"], or the pair ["\r\n"] taken as a single
    break. Does nothing otherwise. *)
let consume_break t =
  (* Consume the upcoming character only if it is [expected] *)
  let eat expected =
    match peek t with
    | Some c when c = expected -> ignore (next t)
    | _ -> ()
  in
  (* An optional CR followed by an optional LF covers all three forms *)
  eat '\r';
  eat '\n'
227
(** [remaining t] is the rest of the input from the cursor onwards: the
    unread part of the loaded slice followed by everything
    [Bytes.Reader.add_to_buffer] obtains from the reader.

    NOTE(review): [add_to_buffer] presumably drains the reader, while
    [current_slice], [slice_pos] and [position] are left untouched here, so
    [t] is probably not meant to be read further afterwards — confirm. *)
let remaining t =
  let out = Buffer.create 256 in
  Option.iter
    (fun s ->
      let left = Bytes.Slice.length s - t.slice_pos in
      if left > 0 then
        Buffer.add_subbytes out (Bytes.Slice.bytes s)
          (Bytes.Slice.first s + t.slice_pos)
          left)
    t.current_slice;
  Bytes.Reader.add_to_buffer out t.reader;
  Buffer.contents out
243
(** [mark t] is the current position, to be kept as the start of a span. *)
let mark { position; _ } = position
246
(** [peek_back t] is the character just before the cursor (limited
    look-behind). It only sees bytes of the currently loaded slice: the
    result is [None] when no slice is loaded or when the cursor sits at the
    very start of the slice. *)
let peek_back t =
  match t.current_slice with
  | None -> None
  | Some s ->
      if t.slice_pos <= 0 then None
      else
        let idx = Bytes.Slice.first s + t.slice_pos - 1 in
        Some (Stdlib.Bytes.get (Bytes.Slice.bytes s) idx)
255
(** [source t] is a sample of up to 4 upcoming bytes, intended for encoding
    detection. Nothing is consumed.

    The sample starts with the unread bytes of the loaded slice; if those
    are fewer than 4, it is completed by sniffing the reader, so a short
    slice remainder no longer truncates the sample. The result is shorter
    than 4 bytes only when the input itself ends earlier. *)
let source t =
  let wanted = 4 in
  match t.current_slice with
  | None ->
      (* Nothing buffered locally: sniff peeks at the reader directly *)
      Bytes.Reader.sniff wanted t.reader
  | Some slice ->
      let unread = Bytes.Slice.length slice - t.slice_pos in
      let local = min wanted unread in
      let head =
        Stdlib.Bytes.sub_string (Bytes.Slice.bytes slice)
          (Bytes.Slice.first slice + t.slice_pos)
          local
      in
      if local = wanted then head
      else
        (* Top up with bytes the reader has not handed out yet *)
        head ^ Bytes.Reader.sniff (wanted - local) t.reader
269
(** [byte_pos t] is the byte offset of the cursor in the underlying stream.

    [Bytes.Reader.pos] reports the position after the last slice handed out
    by the reader, so the bytes of the loaded slice that have not been
    consumed yet are subtracted to obtain the offset of the next character
    this input will return. *)
let byte_pos t =
  let unread =
    match t.current_slice with
    | Some slice -> Bytes.Slice.length slice - t.slice_pos
    | None -> 0
  in
  Bytes.Reader.pos t.reader - unread