Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** YAML parser - converts tokens to semantic events via state machine *)
7
8(** Parser states *)
9type state =
10 | Stream_start
11 | Implicit_document_start
12 | Document_content
13 | Document_content_done (* After parsing a node, check for unexpected content *)
14 | Document_end
15 | Block_sequence_first_entry
16 | Block_sequence_entry
17 | Indentless_sequence_entry
18 | Block_mapping_first_key
19 | Block_mapping_key
20 | Block_mapping_value
21 | Flow_sequence_first_entry
22 | Flow_sequence_entry
23 | Flow_sequence_entry_mapping_key
24 | Flow_sequence_entry_mapping_value
25 | Flow_sequence_entry_mapping_end
26 | Flow_mapping_first_key
27 | Flow_mapping_key
28 | Flow_mapping_value
29 | End
30
31type t = {
32 scanner : Scanner.t;
33 mutable state : state;
34 mutable states : state list; (** State stack *)
35 mutable version : (int * int) option;
36 mutable tag_directives : (string * string) list;
37 mutable current_token : Token.spanned option;
38 mutable finished : bool;
39 mutable explicit_doc_end : bool; (** True if last doc ended with explicit ... *)
40 mutable stream_start : bool; (** True if we haven't emitted any documents yet *)
41}
42
43let create scanner = {
44 scanner;
45 state = Stream_start;
46 states = [];
47 version = None;
48 tag_directives = [
49 ("!", "!");
50 ("!!", "tag:yaml.org,2002:");
51 ];
52 current_token = None;
53 finished = false;
54 explicit_doc_end = false;
55 stream_start = true;
56}
57
58let of_string s = create (Scanner.of_string s)
59let of_scanner = create
60let of_input i = create (Scanner.of_input i)
61let of_reader r = create (Scanner.of_reader r)
62
63(** Get current token, fetching if needed *)
64let current_token t =
65 match t.current_token with
66 | Some tok -> tok
67 | None ->
68 let tok = Scanner.next t.scanner in
69 t.current_token <- tok;
70 match tok with
71 | Some tok -> tok
72 | None -> Error.raise Unexpected_eof
73
74(** Peek at current token *)
75let peek_token t =
76 match t.current_token with
77 | Some _ -> t.current_token
78 | None ->
79 t.current_token <- Scanner.next t.scanner;
80 t.current_token
81
82(** Skip current token *)
83let skip_token t =
84 t.current_token <- None
85
86(** Check if current token matches predicate *)
87let check t pred =
88 match peek_token t with
89 | Some tok -> pred tok.token
90 | None -> false
91
92(** Push state onto stack *)
93let push_state t s =
94 t.states <- s :: t.states
95
96(** Pop state from stack *)
97let pop_state t =
98 match t.states with
99 | s :: rest ->
100 t.states <- rest;
101 s
102 | [] -> End
103
104(** Resolve a tag *)
105let resolve_tag t ~handle ~suffix =
106 if handle = "" then
107 (* Verbatim tag - suffix is already the full URI *)
108 suffix
109 else
110 match List.assoc_opt handle t.tag_directives with
111 | Some prefix -> prefix ^ suffix
112 | None when handle = "!" -> "!" ^ suffix
113 | None -> Error.raise (Invalid_tag (handle ^ suffix))
114
115(** Process directives at document start *)
116let process_directives t =
117 t.version <- None;
118 t.tag_directives <- [("!", "!"); ("!!", "tag:yaml.org,2002:")];
119
120 while check t (function
121 | Token.Version_directive _ | Token.Tag_directive _ -> true
122 | _ -> false)
123 do
124 let tok = current_token t in
125 skip_token t;
126 match tok.token with
127 | Token.Version_directive { major; minor } ->
128 if t.version <> None then
129 Error.raise_span tok.span (Invalid_yaml_version "duplicate YAML directive");
130 t.version <- Some (major, minor)
131 | Token.Tag_directive { handle; prefix } ->
132 (* Skip empty tag directives (these are reserved/unknown directives that were ignored) *)
133 if handle = "" && prefix = "" then
134 () (* Ignore reserved directives *)
135 else begin
136 if List.mem_assoc handle t.tag_directives &&
137 handle <> "!" && handle <> "!!" then
138 Error.raise_span tok.span (Invalid_tag_directive ("duplicate tag handle: " ^ handle));
139 t.tag_directives <- (handle, prefix) :: t.tag_directives
140 end
141 | _ -> ()
142 done
143
144(** Parse anchor and/or tag properties *)
145let parse_properties t =
146 let anchor = ref None in
147 let tag = ref None in
148
149 while check t (function
150 | Token.Anchor _ | Token.Tag _ -> true
151 | _ -> false)
152 do
153 let tok = current_token t in
154 skip_token t;
155 match tok.token with
156 | Token.Anchor name ->
157 if Option.is_some !anchor then
158 Error.raise_span tok.span (Duplicate_anchor name);
159 anchor := Some name
160 | Token.Tag { handle; suffix } ->
161 if Option.is_some !tag then
162 Error.raise_span tok.span (Invalid_tag "duplicate tag");
163 let resolved =
164 if handle = "" && suffix = "" then None
165 else if handle = "!" && suffix = "" then Some "!"
166 else Some (resolve_tag t ~handle ~suffix)
167 in
168 tag := resolved
169 | _ -> ()
170 done;
171 (!anchor, !tag)
172
173(** Empty scalar event *)
174let empty_scalar_event ~anchor ~tag span =
175 Event.Scalar {
176 anchor;
177 tag;
178 value = "";
179 plain_implicit = tag = None;
180 quoted_implicit = false;
181 style = `Plain;
182 }, span
183
184(** Parse stream start *)
185let parse_stream_start t =
186 let tok = current_token t in
187 skip_token t;
188 match tok.token with
189 | Token.Stream_start encoding ->
190 t.state <- Implicit_document_start;
191 Event.Stream_start { encoding }, tok.span
192 | _ ->
193 Error.raise_span tok.span (Unexpected_token "expected stream start")
194
195(** Parse document start (implicit or explicit) *)
196let parse_document_start t ~implicit =
197 process_directives t;
198
199 if not implicit then begin
200 let tok = current_token t in
201 match tok.token with
202 | Token.Document_start ->
203 skip_token t
204 | _ ->
205 Error.raise_span tok.span Expected_document_start
206 end;
207
208 let span = match peek_token t with
209 | Some tok -> tok.span
210 | None -> Span.point Position.initial
211 in
212
213 (* After first document, stream_start is false *)
214 t.stream_start <- false;
215 push_state t Document_end;
216 t.state <- Document_content;
217 Event.Document_start { version = t.version; implicit }, span
218
219(** Parse document end *)
220let parse_document_end t =
221 let implicit = not (check t (function Token.Document_end -> true | _ -> false)) in
222 let span = match peek_token t with
223 | Some tok -> tok.span
224 | None -> Span.point Position.initial
225 in
226
227 if not implicit then skip_token t;
228
229 (* Track if this document ended explicitly with ... *)
230 t.explicit_doc_end <- not implicit;
231 t.state <- Implicit_document_start;
232 Event.Document_end { implicit }, span
233
234(** Parse node in various contexts *)
235let parse_node t ~block ~indentless =
236 let tok = current_token t in
237 match tok.token with
238 | Token.Alias name ->
239 skip_token t;
240 t.state <- pop_state t;
241 Event.Alias { anchor = name }, tok.span
242
243 | Token.Anchor _ | Token.Tag _ ->
244 let anchor, tag = parse_properties t in
245 let tok = current_token t in
246 (match tok.token with
247 | Token.Block_entry when indentless ->
248 t.state <- Indentless_sequence_entry;
249 Event.Sequence_start {
250 anchor; tag;
251 implicit = tag = None;
252 style = `Block;
253 }, tok.span
254
255 | Token.Block_sequence_start when block ->
256 t.state <- Block_sequence_first_entry;
257 skip_token t;
258 Event.Sequence_start {
259 anchor; tag;
260 implicit = tag = None;
261 style = `Block;
262 }, tok.span
263
264 | Token.Block_mapping_start when block ->
265 t.state <- Block_mapping_first_key;
266 skip_token t;
267 Event.Mapping_start {
268 anchor; tag;
269 implicit = tag = None;
270 style = `Block;
271 }, tok.span
272
273 | Token.Flow_sequence_start ->
274 t.state <- Flow_sequence_first_entry;
275 skip_token t;
276 Event.Sequence_start {
277 anchor; tag;
278 implicit = tag = None;
279 style = `Flow;
280 }, tok.span
281
282 | Token.Flow_mapping_start ->
283 t.state <- Flow_mapping_first_key;
284 skip_token t;
285 Event.Mapping_start {
286 anchor; tag;
287 implicit = tag = None;
288 style = `Flow;
289 }, tok.span
290
291 | Token.Scalar { style; value } ->
292 skip_token t;
293 t.state <- pop_state t;
294 let plain_implicit = tag = None && style = `Plain in
295 let quoted_implicit = tag = None && style <> `Plain in
296 Event.Scalar {
297 anchor; tag; value;
298 plain_implicit; quoted_implicit; style;
299 }, tok.span
300
301 | _ ->
302 (* Empty node *)
303 t.state <- pop_state t;
304 empty_scalar_event ~anchor ~tag tok.span)
305
306 | Token.Block_sequence_start when block ->
307 t.state <- Block_sequence_first_entry;
308 skip_token t;
309 Event.Sequence_start {
310 anchor = None; tag = None;
311 implicit = true;
312 style = `Block;
313 }, tok.span
314
315 | Token.Block_mapping_start when block ->
316 t.state <- Block_mapping_first_key;
317 skip_token t;
318 Event.Mapping_start {
319 anchor = None; tag = None;
320 implicit = true;
321 style = `Block;
322 }, tok.span
323
324 | Token.Flow_sequence_start ->
325 t.state <- Flow_sequence_first_entry;
326 skip_token t;
327 Event.Sequence_start {
328 anchor = None; tag = None;
329 implicit = true;
330 style = `Flow;
331 }, tok.span
332
333 | Token.Flow_mapping_start ->
334 t.state <- Flow_mapping_first_key;
335 skip_token t;
336 Event.Mapping_start {
337 anchor = None; tag = None;
338 implicit = true;
339 style = `Flow;
340 }, tok.span
341
342 | Token.Block_entry when indentless ->
343 t.state <- Indentless_sequence_entry;
344 Event.Sequence_start {
345 anchor = None; tag = None;
346 implicit = true;
347 style = `Block;
348 }, tok.span
349
350 | Token.Scalar { style; value } ->
351 skip_token t;
352 t.state <- pop_state t;
353 let plain_implicit = style = `Plain in
354 let quoted_implicit = style <> `Plain in
355 Event.Scalar {
356 anchor = None; tag = None; value;
357 plain_implicit; quoted_implicit; style;
358 }, tok.span
359
360 | _ ->
361 (* Empty node *)
362 t.state <- pop_state t;
363 empty_scalar_event ~anchor:None ~tag:None tok.span
364
365(** Parse block sequence entry *)
366let parse_block_sequence_entry t =
367 let tok = current_token t in
368 match tok.token with
369 | Token.Block_entry ->
370 skip_token t;
371 if check t (function
372 | Token.Block_entry | Token.Block_end -> true
373 | _ -> false)
374 then begin
375 t.state <- Block_sequence_entry;
376 empty_scalar_event ~anchor:None ~tag:None tok.span
377 end else begin
378 push_state t Block_sequence_entry;
379 parse_node t ~block:true ~indentless:false
380 end
381 | Token.Block_end ->
382 skip_token t;
383 t.state <- pop_state t;
384 Event.Sequence_end, tok.span
385 | _ ->
386 Error.raise_span tok.span Expected_block_entry
387
388(** Parse block mapping key *)
389let parse_block_mapping_key t =
390 let tok = current_token t in
391 match tok.token with
392 | Token.Key ->
393 skip_token t;
394 if check t (function
395 | Token.Key | Token.Value | Token.Block_end -> true
396 | _ -> false)
397 then begin
398 t.state <- Block_mapping_value;
399 empty_scalar_event ~anchor:None ~tag:None tok.span
400 end else begin
401 push_state t Block_mapping_value;
402 parse_node t ~block:true ~indentless:true
403 end
404 (* Handle value without explicit key - key is empty/null *)
405 | Token.Value ->
406 t.state <- Block_mapping_value;
407 empty_scalar_event ~anchor:None ~tag:None tok.span
408 | Token.Block_end ->
409 skip_token t;
410 t.state <- pop_state t;
411 Event.Mapping_end, tok.span
412 | _ ->
413 Error.raise_span tok.span Expected_key
414
415(** Parse block mapping value *)
416let parse_block_mapping_value t =
417 let tok = current_token t in
418 match tok.token with
419 | Token.Value ->
420 skip_token t;
421 if check t (function
422 | Token.Key | Token.Value | Token.Block_end -> true
423 | _ -> false)
424 then begin
425 t.state <- Block_mapping_key;
426 empty_scalar_event ~anchor:None ~tag:None tok.span
427 end else begin
428 push_state t Block_mapping_key;
429 parse_node t ~block:true ~indentless:true
430 end
431 | _ ->
432 (* Implicit empty value *)
433 t.state <- Block_mapping_key;
434 empty_scalar_event ~anchor:None ~tag:None tok.span
435
436(** Parse indentless sequence entry *)
437let parse_indentless_sequence_entry t =
438 let tok = current_token t in
439 match tok.token with
440 | Token.Block_entry ->
441 skip_token t;
442 if check t (function
443 | Token.Block_entry | Token.Key | Token.Value | Token.Block_end -> true
444 | _ -> false)
445 then begin
446 t.state <- Indentless_sequence_entry;
447 empty_scalar_event ~anchor:None ~tag:None tok.span
448 end else begin
449 push_state t Indentless_sequence_entry;
450 parse_node t ~block:true ~indentless:false
451 end
452 | _ ->
453 t.state <- pop_state t;
454 Event.Sequence_end, tok.span
455
456(** Parse flow sequence *)
457let rec parse_flow_sequence_entry t ~first =
458 let tok = current_token t in
459 match tok.token with
460 | Token.Flow_sequence_end ->
461 skip_token t;
462 t.state <- pop_state t;
463 Event.Sequence_end, tok.span
464 | Token.Flow_entry when not first ->
465 skip_token t;
466 parse_flow_sequence_entry_internal t
467 | _ when first ->
468 parse_flow_sequence_entry_internal t
469 | _ ->
470 Error.raise_span tok.span Expected_sequence_end
471
472and parse_flow_sequence_entry_internal t =
473 let tok = current_token t in
474 match tok.token with
475 | Token.Flow_sequence_end ->
476 (* Trailing comma case - don't emit empty scalar, just go back to sequence entry state *)
477 skip_token t;
478 t.state <- pop_state t;
479 Event.Sequence_end, tok.span
480 | Token.Flow_entry ->
481 (* Double comma or comma after comma - invalid *)
482 Error.raise_span tok.span (Unexpected_token "unexpected ',' in flow sequence")
483 | Token.Key ->
484 skip_token t;
485 t.state <- Flow_sequence_entry_mapping_key;
486 Event.Mapping_start {
487 anchor = None; tag = None;
488 implicit = true;
489 style = `Flow;
490 }, tok.span
491 | Token.Value ->
492 (* Implicit empty key mapping: [ : value ] *)
493 t.state <- Flow_sequence_entry_mapping_key;
494 Event.Mapping_start {
495 anchor = None; tag = None;
496 implicit = true;
497 style = `Flow;
498 }, tok.span
499 | _ ->
500 push_state t Flow_sequence_entry;
501 parse_node t ~block:false ~indentless:false
502
503(** Parse flow sequence entry mapping *)
504let parse_flow_sequence_entry_mapping_key t =
505 let tok = current_token t in
506 if check t (function
507 | Token.Value | Token.Flow_entry | Token.Flow_sequence_end -> true
508 | _ -> false)
509 then begin
510 t.state <- Flow_sequence_entry_mapping_value;
511 empty_scalar_event ~anchor:None ~tag:None tok.span
512 end else begin
513 push_state t Flow_sequence_entry_mapping_value;
514 parse_node t ~block:false ~indentless:false
515 end
516
517let parse_flow_sequence_entry_mapping_value t =
518 let tok = current_token t in
519 match tok.token with
520 | Token.Value ->
521 skip_token t;
522 if check t (function
523 | Token.Flow_entry | Token.Flow_sequence_end -> true
524 | _ -> false)
525 then begin
526 t.state <- Flow_sequence_entry_mapping_end;
527 empty_scalar_event ~anchor:None ~tag:None tok.span
528 end else begin
529 push_state t Flow_sequence_entry_mapping_end;
530 parse_node t ~block:false ~indentless:false
531 end
532 | _ ->
533 t.state <- Flow_sequence_entry_mapping_end;
534 empty_scalar_event ~anchor:None ~tag:None tok.span
535
536let parse_flow_sequence_entry_mapping_end t =
537 let tok = current_token t in
538 t.state <- Flow_sequence_entry;
539 Event.Mapping_end, tok.span
540
541(** Parse flow mapping *)
542let rec parse_flow_mapping_key t ~first =
543 let tok = current_token t in
544 match tok.token with
545 | Token.Flow_mapping_end ->
546 skip_token t;
547 t.state <- pop_state t;
548 Event.Mapping_end, tok.span
549 | Token.Flow_entry when not first ->
550 skip_token t;
551 parse_flow_mapping_key_internal t
552 | _ when first ->
553 parse_flow_mapping_key_internal t
554 | _ ->
555 Error.raise_span tok.span Expected_mapping_end
556
557and parse_flow_mapping_key_internal t =
558 let tok = current_token t in
559 match tok.token with
560 | Token.Flow_mapping_end ->
561 (* Trailing comma case - don't emit empty scalar, just return to key state *)
562 skip_token t;
563 t.state <- pop_state t;
564 Event.Mapping_end, tok.span
565 | Token.Flow_entry ->
566 (* Double comma or comma after comma - invalid *)
567 Error.raise_span tok.span (Unexpected_token "unexpected ',' in flow mapping")
568 | Token.Key ->
569 skip_token t;
570 if check t (function
571 | Token.Value | Token.Flow_entry | Token.Flow_mapping_end -> true
572 | _ -> false)
573 then begin
574 t.state <- Flow_mapping_value;
575 empty_scalar_event ~anchor:None ~tag:None tok.span
576 end else begin
577 push_state t Flow_mapping_value;
578 parse_node t ~block:false ~indentless:false
579 end
580 | _ ->
581 push_state t Flow_mapping_value;
582 parse_node t ~block:false ~indentless:false
583
584let parse_flow_mapping_value t ~empty =
585 let tok = current_token t in
586 if empty then begin
587 t.state <- Flow_mapping_key;
588 empty_scalar_event ~anchor:None ~tag:None tok.span
589 end else
590 match tok.token with
591 | Token.Value ->
592 skip_token t;
593 if check t (function
594 | Token.Flow_entry | Token.Flow_mapping_end -> true
595 | _ -> false)
596 then begin
597 t.state <- Flow_mapping_key;
598 empty_scalar_event ~anchor:None ~tag:None tok.span
599 end else begin
600 push_state t Flow_mapping_key;
601 parse_node t ~block:false ~indentless:false
602 end
603 | _ ->
604 t.state <- Flow_mapping_key;
605 empty_scalar_event ~anchor:None ~tag:None tok.span
606
607(** Main state machine dispatcher *)
608let rec parse t =
609 match t.state with
610 | Stream_start ->
611 parse_stream_start t
612
613 | Implicit_document_start ->
614 (* Skip any document end markers before checking what's next *)
615 while check t (function Token.Document_end -> true | _ -> false) do
616 t.explicit_doc_end <- true; (* Seeing ... counts as explicit end *)
617 skip_token t
618 done;
619
620 let tok = current_token t in
621 (match tok.token with
622 | Token.Stream_end ->
623 skip_token t;
624 t.state <- End;
625 t.finished <- true;
626 Event.Stream_end, tok.span
627 | Token.Version_directive _ | Token.Tag_directive _ ->
628 (* Directives are only allowed at stream start or after explicit ... (MUS6/01) *)
629 if not t.stream_start && not t.explicit_doc_end then
630 Error.raise_span tok.span (Invalid_directive "directives require explicit document end '...' before them");
631 parse_document_start t ~implicit:false
632 | Token.Document_start ->
633 parse_document_start t ~implicit:false
634 (* These tokens are invalid at document start - they indicate leftover junk *)
635 | Token.Flow_sequence_end | Token.Flow_mapping_end | Token.Flow_entry
636 | Token.Block_end | Token.Value ->
637 Error.raise_span tok.span (Unexpected_token "unexpected token at document start")
638 | _ ->
639 parse_document_start t ~implicit:true)
640
641 | Document_content ->
642 if check t (function
643 | Token.Version_directive _ | Token.Tag_directive _
644 | Token.Document_start | Token.Document_end | Token.Stream_end -> true
645 | _ -> false)
646 then begin
647 let tok = current_token t in
648 t.state <- pop_state t;
649 empty_scalar_event ~anchor:None ~tag:None tok.span
650 end else begin
651 (* Push Document_content_done so we return there after parsing the node.
652 This allows us to check for unexpected content after the node. *)
653 push_state t Document_content_done;
654 parse_node t ~block:true ~indentless:false
655 end
656
657 | Document_content_done ->
658 (* After parsing a node in document content, check for unexpected content *)
659 if check t (function
660 | Token.Version_directive _ | Token.Tag_directive _
661 | Token.Document_start | Token.Document_end | Token.Stream_end -> true
662 | _ -> false)
663 then begin
664 (* Valid document boundary - continue to Document_end *)
665 t.state <- pop_state t;
666 parse t (* Continue to emit the next event *)
667 end else begin
668 (* Unexpected content after document value - this is an error (KS4U, BS4K) *)
669 let tok = current_token t in
670 Error.raise_span tok.span
671 (Unexpected_token "content not allowed after document value")
672 end
673
674 | Document_end ->
675 parse_document_end t
676
677 | Block_sequence_first_entry ->
678 t.state <- Block_sequence_entry;
679 parse_block_sequence_entry t
680
681 | Block_sequence_entry ->
682 parse_block_sequence_entry t
683
684 | Indentless_sequence_entry ->
685 parse_indentless_sequence_entry t
686
687 | Block_mapping_first_key ->
688 t.state <- Block_mapping_key;
689 parse_block_mapping_key t
690
691 | Block_mapping_key ->
692 parse_block_mapping_key t
693
694 | Block_mapping_value ->
695 parse_block_mapping_value t
696
697 | Flow_sequence_first_entry ->
698 parse_flow_sequence_entry t ~first:true
699
700 | Flow_sequence_entry ->
701 parse_flow_sequence_entry t ~first:false
702
703 | Flow_sequence_entry_mapping_key ->
704 parse_flow_sequence_entry_mapping_key t
705
706 | Flow_sequence_entry_mapping_value ->
707 parse_flow_sequence_entry_mapping_value t
708
709 | Flow_sequence_entry_mapping_end ->
710 parse_flow_sequence_entry_mapping_end t
711
712 | Flow_mapping_first_key ->
713 parse_flow_mapping_key t ~first:true
714
715 | Flow_mapping_key ->
716 parse_flow_mapping_key t ~first:false
717
718 | Flow_mapping_value ->
719 parse_flow_mapping_value t ~empty:false
720
721 | End ->
722 let span = Span.point Position.initial in
723 t.finished <- true;
724 Event.Stream_end, span
725
726(** Get next event *)
727let next t =
728 if t.finished then None
729 else
730 let event, span = parse t in
731 Some { Event.event; span }
732
733(** Iterate over all events *)
734let iter f t =
735 let rec loop () =
736 match next t with
737 | None -> ()
738 | Some ev -> f ev; loop ()
739 in
740 loop ()
741
742(** Fold over all events *)
743let fold f init t =
744 let rec loop acc =
745 match next t with
746 | None -> acc
747 | Some ev -> loop (f acc ev)
748 in
749 loop init
750
751(** Convert to list *)
752let to_list t =
753 fold (fun acc ev -> ev :: acc) [] t |> List.rev