Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** Character input source with lookahead, based on Bytes.Reader.t
7
8 This module wraps a bytesrw [Bytes.Reader.t] to provide
9 character-by-character access with lookahead for the YAML scanner. Uses
10 bytesrw's sniff and push_back for efficient lookahead without excessive
11 copying.
12
13 The same input type works with any reader source: strings, files, channels,
14 or streaming sources like Eio. *)
15
16open Bytesrw
17
18include Char_class
19(** Re-export character classification *)
20
21type t = {
22 reader : Bytes.Reader.t;
23 mutable current_slice : Bytes.Slice.t option;
24 (** Current slice being consumed *)
25 mutable slice_pos : int; (** Position within current slice *)
26 mutable position : Position.t; (** Line/column tracking *)
27}
28
29(** Ensure we have a current slice. Returns true if data available. *)
30let ensure_slice t =
31 match t.current_slice with
32 | Some slice when t.slice_pos < Bytes.Slice.length slice -> true
33 | _ ->
34 let slice = Bytes.Reader.read t.reader in
35 if Bytes.Slice.is_eod slice then begin
36 t.current_slice <- None;
37 false
38 end
39 else begin
40 t.current_slice <- Some slice;
41 t.slice_pos <- 0;
42 true
43 end
44
45(** Get current character without advancing *)
46let peek_current t =
47 match t.current_slice with
48 | Some slice when t.slice_pos < Bytes.Slice.length slice ->
49 let bytes = Bytes.Slice.bytes slice in
50 let first = Bytes.Slice.first slice in
51 Some (Stdlib.Bytes.get bytes (first + t.slice_pos))
52 | _ -> None
53
54(** Create input from a Bytes.Reader.t *)
55let of_reader ?(initial_position = Position.initial) reader =
56 let t =
57 { reader; current_slice = None; slice_pos = 0; position = initial_position }
58 in
59 (* Use sniff for BOM detection - this is exactly what sniff is for *)
60 let sample = Bytes.Reader.sniff 4 t.reader in
61 let bom_len =
62 if
63 String.length sample >= 3
64 && sample.[0] = '\xEF'
65 && sample.[1] = '\xBB'
66 && sample.[2] = '\xBF'
67 then 3 (* UTF-8 BOM *)
68 else 0
69 in
70 (* Skip BOM if present *)
71 if bom_len > 0 then Bytes.Reader.skip bom_len t.reader;
72 t
73
74(** Create input from a string *)
75let of_string s =
76 let reader = Bytes.Reader.of_string s in
77 of_reader reader
78
79let position t = t.position
80let is_eof t = not (ensure_slice t)
81let peek t = if ensure_slice t then peek_current t else None
82
83let peek_exn t =
84 match peek t with
85 | Some c -> c
86 | None -> Error.raise_at t.position Unexpected_eof
87
88(** Peek at nth character (0-indexed from current position) *)
89let peek_nth t n =
90 if n = 0 then peek t
91 else begin
92 (* Use sniff for lookahead - it pushes back automatically *)
93 let sample = Bytes.Reader.sniff (n + 1) t.reader in
94 (* But sniff reads from reader, and we may have a current slice.
95 We need to account for what's already in current_slice *)
96 match t.current_slice with
97 | Some slice ->
98 let slice_bytes = Bytes.Slice.bytes slice in
99 let slice_first = Bytes.Slice.first slice in
100 let slice_remaining = Bytes.Slice.length slice - t.slice_pos in
101 if n < slice_remaining then
102 Some (Stdlib.Bytes.get slice_bytes (slice_first + t.slice_pos + n))
103 else begin
104 (* Need to look beyond current slice *)
105 let sample_offset = n - slice_remaining in
106 if sample_offset < String.length sample then
107 Some sample.[sample_offset]
108 else None
109 end
110 | None -> if n < String.length sample then Some sample.[n] else None
111 end
112
113(** Peek at up to n characters as a string *)
114let rec peek_string t n =
115 if n <= 0 then ""
116 else begin
117 match t.current_slice with
118 | Some slice ->
119 let slice_bytes = Bytes.Slice.bytes slice in
120 let slice_first = Bytes.Slice.first slice in
121 let slice_remaining = Bytes.Slice.length slice - t.slice_pos in
122 if n <= slice_remaining then
123 (* All within current slice *)
124 Stdlib.Bytes.sub_string slice_bytes (slice_first + t.slice_pos) n
125 else begin
126 (* Need data from beyond current slice - use sniff *)
127 let needed_from_reader = n - slice_remaining in
128 let sample = Bytes.Reader.sniff needed_from_reader t.reader in
129 let buf = Buffer.create n in
130 Buffer.add_subbytes buf slice_bytes
131 (slice_first + t.slice_pos)
132 slice_remaining;
133 Buffer.add_string buf sample;
134 Buffer.contents buf
135 end
136 | None -> if ensure_slice t then peek_string t n else ""
137 end
138
139(** Consume next character *)
140let next t =
141 if ensure_slice t then begin
142 match t.current_slice with
143 | Some slice ->
144 let bytes = Bytes.Slice.bytes slice in
145 let first = Bytes.Slice.first slice in
146 let c = Stdlib.Bytes.get bytes (first + t.slice_pos) in
147 t.slice_pos <- t.slice_pos + 1;
148 t.position <- Position.advance_char c t.position;
149 (* Check if we've exhausted this slice *)
150 if t.slice_pos >= Bytes.Slice.length slice then t.current_slice <- None;
151 Some c
152 | None -> None
153 end
154 else None
155
156let next_exn t =
157 match next t with
158 | Some c -> c
159 | None -> Error.raise_at t.position Unexpected_eof
160
161let skip t n =
162 for _ = 1 to n do
163 ignore (next t)
164 done
165
166let skip_while t pred =
167 let rec loop () =
168 match peek t with
169 | Some c when pred c ->
170 ignore (next t);
171 loop ()
172 | _ -> ()
173 in
174 loop ()
175
176(** Check if next char satisfies predicate *)
177let next_is pred t = match peek t with None -> false | Some c -> pred c
178
179let next_is_break t = next_is is_break t
180let next_is_blank t = next_is is_blank t
181let next_is_whitespace t = next_is is_whitespace t
182let next_is_digit t = next_is is_digit t
183let next_is_hex t = next_is is_hex t
184let next_is_alpha t = next_is is_alpha t
185let next_is_indicator t = next_is is_indicator t
186
187(** Check if at document boundary (--- or ...) *)
188let at_document_boundary t =
189 if t.position.column <> 1 then false
190 else begin
191 let s = peek_string t 4 in
192 let len = String.length s in
193 if len < 3 then false
194 else
195 let prefix = String.sub s 0 3 in
196 (prefix = "---" || prefix = "...") && (len = 3 || is_whitespace s.[3])
197 end
198
199(** Consume line break, handling \r\n as single break *)
200let consume_break t =
201 match peek t with
202 | Some '\r' -> (
203 ignore (next t);
204 match peek t with Some '\n' -> ignore (next t) | _ -> ())
205 | Some '\n' -> ignore (next t)
206 | _ -> ()
207
208(** Get remaining content from current position *)
209let remaining t =
210 let buf = Buffer.create 256 in
211 (* Add current slice remainder *)
212 (match t.current_slice with
213 | Some slice ->
214 let bytes = Bytes.Slice.bytes slice in
215 let first = Bytes.Slice.first slice in
216 let remaining = Bytes.Slice.length slice - t.slice_pos in
217 if remaining > 0 then
218 Buffer.add_subbytes buf bytes (first + t.slice_pos) remaining
219 | None -> ());
220 (* Add remaining from reader *)
221 Bytes.Reader.add_to_buffer buf t.reader;
222 Buffer.contents buf
223
224(** Mark current position for span creation *)
225let mark t = t.position
226
227(** Get the character before the current position (limited lookahead) *)
228let peek_back t =
229 match t.current_slice with
230 | Some slice when t.slice_pos > 0 ->
231 let bytes = Bytes.Slice.bytes slice in
232 let first = Bytes.Slice.first slice in
233 Some (Stdlib.Bytes.get bytes (first + t.slice_pos - 1))
234 | _ -> None
235
236(** Get a sample of the source for encoding detection. Uses sniff to peek
237 without consuming. *)
238let source t =
239 (* First check current slice *)
240 match t.current_slice with
241 | Some slice ->
242 let bytes = Bytes.Slice.bytes slice in
243 let first = Bytes.Slice.first slice in
244 let available = min 4 (Bytes.Slice.length slice - t.slice_pos) in
245 Stdlib.Bytes.sub_string bytes (first + t.slice_pos) available
246 | None ->
247 (* Use sniff to peek at reader *)
248 Bytes.Reader.sniff 4 t.reader
249
250(** Get the byte position in the underlying stream *)
251let byte_pos t = Bytes.Reader.pos t.reader