Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** YAML parser - converts tokens to semantic events via state machine *)
7
8(** Parser states *)
9type state =
10 | Stream_start
11 | Implicit_document_start
12 | Document_content
13 | Document_content_done
14 (* After parsing a node, check for unexpected content *)
15 | Document_end
16 | Block_sequence_first_entry
17 | Block_sequence_entry
18 | Indentless_sequence_entry
19 | Block_mapping_first_key
20 | Block_mapping_key
21 | Block_mapping_value
22 | Flow_sequence_first_entry
23 | Flow_sequence_entry
24 | Flow_sequence_entry_mapping_key
25 | Flow_sequence_entry_mapping_value
26 | Flow_sequence_entry_mapping_end
27 | Flow_mapping_first_key
28 | Flow_mapping_key
29 | Flow_mapping_value
30 | End
31
32type t = {
33 scanner : Scanner.t;
34 mutable state : state;
35 mutable states : state list; (** State stack *)
36 mutable version : (int * int) option;
37 mutable tag_directives : (string * string) list;
38 mutable current_token : Token.spanned option;
39 mutable finished : bool;
40 mutable explicit_doc_end : bool;
41 (** True if last doc ended with explicit ... *)
42 mutable stream_start : bool;
43 (** True if we haven't emitted any documents yet *)
44}
45
46let create scanner =
47 {
48 scanner;
49 state = Stream_start;
50 states = [];
51 version = None;
52 tag_directives = [ ("!", "!"); ("!!", "tag:yaml.org,2002:") ];
53 current_token = None;
54 finished = false;
55 explicit_doc_end = false;
56 stream_start = true;
57 }
58
59let of_string s = create (Scanner.of_string s)
60let of_scanner = create
61let of_input i = create (Scanner.of_input i)
62let of_reader r = create (Scanner.of_reader r)
63
64(** Get current token, fetching if needed *)
65let current_token t =
66 match t.current_token with
67 | Some tok -> tok
68 | None -> (
69 let tok = Scanner.next t.scanner in
70 t.current_token <- tok;
71 match tok with Some tok -> tok | None -> Error.raise Unexpected_eof)
72
73(** Peek at current token *)
74let peek_token t =
75 match t.current_token with
76 | Some _ -> t.current_token
77 | None ->
78 t.current_token <- Scanner.next t.scanner;
79 t.current_token
80
81(** Skip current token *)
82let skip_token t = t.current_token <- None
83
84(** Check if current token matches predicate *)
85let check t pred =
86 match peek_token t with Some tok -> pred tok.token | None -> false
87
88
89(** Push state onto stack *)
90let push_state t s = t.states <- s :: t.states
91
92(** Pop state from stack *)
93let pop_state t =
94 match t.states with
95 | s :: rest ->
96 t.states <- rest;
97 s
98 | [] -> End
99
100(** Resolve a tag *)
101let resolve_tag t ~handle ~suffix =
102 if handle = "" then
103 (* Verbatim tag - suffix is already the full URI *)
104 suffix
105 else
106 match List.assoc_opt handle t.tag_directives with
107 | Some prefix -> prefix ^ suffix
108 | None when handle = "!" -> "!" ^ suffix
109 | None -> Error.raise (Invalid_tag (handle ^ suffix))
110
111(** Process directives at document start *)
112let process_directives t =
113 t.version <- None;
114 t.tag_directives <- [ ("!", "!"); ("!!", "tag:yaml.org,2002:") ];
115
116 while
117 check t (function
118 | Token.Version_directive _ | Token.Tag_directive _ -> true
119 | _ -> false)
120 do
121 let tok = current_token t in
122 skip_token t;
123 match tok.token with
124 | Token.Version_directive { major; minor } ->
125 if t.version <> None then
126 Error.raise_span tok.span
127 (Invalid_yaml_version "duplicate YAML directive");
128 t.version <- Some (major, minor)
129 | Token.Tag_directive { handle; prefix } ->
130 (* Skip empty tag directives (these are reserved/unknown directives that were ignored) *)
131 if handle = "" && prefix = "" then () (* Ignore reserved directives *)
132 else begin
133 if
134 List.mem_assoc handle t.tag_directives
135 && handle <> "!" && handle <> "!!"
136 then
137 Error.raise_span tok.span
138 (Invalid_tag_directive ("duplicate tag handle: " ^ handle));
139 t.tag_directives <- (handle, prefix) :: t.tag_directives
140 end
141 | _ -> ()
142 done
143
144(** Parse anchor and/or tag properties *)
145let parse_properties t =
146 let anchor = ref None in
147 let tag = ref None in
148
149 while
150 check t (function Token.Anchor _ | Token.Tag _ -> true | _ -> false)
151 do
152 let tok = current_token t in
153 skip_token t;
154 match tok.token with
155 | Token.Anchor name ->
156 if Option.is_some !anchor then
157 Error.raise_span tok.span (Duplicate_anchor name);
158 anchor := Some name
159 | Token.Tag { handle; suffix } ->
160 if Option.is_some !tag then
161 Error.raise_span tok.span (Invalid_tag "duplicate tag");
162 let resolved =
163 if handle = "" && suffix = "" then None
164 else if handle = "!" && suffix = "" then Some "!"
165 else Some (resolve_tag t ~handle ~suffix)
166 in
167 tag := resolved
168 | _ -> ()
169 done;
170 (!anchor, !tag)
171
172(** Empty scalar event *)
173let empty_scalar_event ~anchor ~tag span =
174 ( Event.Scalar
175 {
176 anchor;
177 tag;
178 value = "";
179 plain_implicit = tag = None;
180 quoted_implicit = false;
181 style = `Plain;
182 },
183 span )
184
185(** Parse stream start *)
186let parse_stream_start t =
187 let tok = current_token t in
188 skip_token t;
189 match tok.token with
190 | Token.Stream_start encoding ->
191 t.state <- Implicit_document_start;
192 (Event.Stream_start { encoding }, tok.span)
193 | _ -> Error.raise_span tok.span (Unexpected_token "expected stream start")
194
195(** Parse document start (implicit or explicit) *)
196let parse_document_start t ~implicit =
197 process_directives t;
198
199 if not implicit then begin
200 let tok = current_token t in
201 match tok.token with
202 | Token.Document_start -> skip_token t
203 | _ -> Error.raise_span tok.span Expected_document_start
204 end;
205
206 let span =
207 match peek_token t with
208 | Some tok -> tok.span
209 | None -> Span.point Position.initial
210 in
211
212 (* After first document, stream_start is false *)
213 t.stream_start <- false;
214 push_state t Document_end;
215 t.state <- Document_content;
216 (Event.Document_start { version = t.version; implicit }, span)
217
218(** Parse document end *)
219let parse_document_end t =
220 let implicit =
221 not (check t (function Token.Document_end -> true | _ -> false))
222 in
223 let span =
224 match peek_token t with
225 | Some tok -> tok.span
226 | None -> Span.point Position.initial
227 in
228
229 if not implicit then skip_token t;
230
231 (* Track if this document ended explicitly with ... *)
232 t.explicit_doc_end <- not implicit;
233 t.state <- Implicit_document_start;
234 (Event.Document_end { implicit }, span)
235
236(** Parse node in various contexts *)
237let parse_node t ~block ~indentless =
238 let tok = current_token t in
239 match tok.token with
240 | Token.Alias name ->
241 skip_token t;
242 t.state <- pop_state t;
243 (Event.Alias { anchor = name }, tok.span)
244 | Token.Anchor _ | Token.Tag _ -> (
245 let anchor, tag = parse_properties t in
246 let tok = current_token t in
247 match tok.token with
248 | Token.Block_entry when indentless ->
249 t.state <- Indentless_sequence_entry;
250 ( Event.Sequence_start
251 { anchor; tag; implicit = tag = None; style = `Block },
252 tok.span )
253 | Token.Block_sequence_start when block ->
254 t.state <- Block_sequence_first_entry;
255 skip_token t;
256 ( Event.Sequence_start
257 { anchor; tag; implicit = tag = None; style = `Block },
258 tok.span )
259 | Token.Block_mapping_start when block ->
260 t.state <- Block_mapping_first_key;
261 skip_token t;
262 ( Event.Mapping_start
263 { anchor; tag; implicit = tag = None; style = `Block },
264 tok.span )
265 | Token.Flow_sequence_start ->
266 t.state <- Flow_sequence_first_entry;
267 skip_token t;
268 ( Event.Sequence_start
269 { anchor; tag; implicit = tag = None; style = `Flow },
270 tok.span )
271 | Token.Flow_mapping_start ->
272 t.state <- Flow_mapping_first_key;
273 skip_token t;
274 ( Event.Mapping_start
275 { anchor; tag; implicit = tag = None; style = `Flow },
276 tok.span )
277 | Token.Scalar { style; value } ->
278 skip_token t;
279 t.state <- pop_state t;
280 let plain_implicit = tag = None && style = `Plain in
281 let quoted_implicit = tag = None && style <> `Plain in
282 ( Event.Scalar
283 { anchor; tag; value; plain_implicit; quoted_implicit; style },
284 tok.span )
285 | _ ->
286 (* Empty node *)
287 t.state <- pop_state t;
288 empty_scalar_event ~anchor ~tag tok.span)
289 | Token.Block_sequence_start when block ->
290 t.state <- Block_sequence_first_entry;
291 skip_token t;
292 ( Event.Sequence_start
293 { anchor = None; tag = None; implicit = true; style = `Block },
294 tok.span )
295 | Token.Block_mapping_start when block ->
296 t.state <- Block_mapping_first_key;
297 skip_token t;
298 ( Event.Mapping_start
299 { anchor = None; tag = None; implicit = true; style = `Block },
300 tok.span )
301 | Token.Flow_sequence_start ->
302 t.state <- Flow_sequence_first_entry;
303 skip_token t;
304 ( Event.Sequence_start
305 { anchor = None; tag = None; implicit = true; style = `Flow },
306 tok.span )
307 | Token.Flow_mapping_start ->
308 t.state <- Flow_mapping_first_key;
309 skip_token t;
310 ( Event.Mapping_start
311 { anchor = None; tag = None; implicit = true; style = `Flow },
312 tok.span )
313 | Token.Block_entry when indentless ->
314 t.state <- Indentless_sequence_entry;
315 ( Event.Sequence_start
316 { anchor = None; tag = None; implicit = true; style = `Block },
317 tok.span )
318 | Token.Scalar { style; value } ->
319 skip_token t;
320 t.state <- pop_state t;
321 let plain_implicit = style = `Plain in
322 let quoted_implicit = style <> `Plain in
323 ( Event.Scalar
324 {
325 anchor = None;
326 tag = None;
327 value;
328 plain_implicit;
329 quoted_implicit;
330 style;
331 },
332 tok.span )
333 | _ ->
334 (* Empty node *)
335 t.state <- pop_state t;
336 empty_scalar_event ~anchor:None ~tag:None tok.span
337
338(** Parse block sequence entry *)
339let parse_block_sequence_entry t =
340 let tok = current_token t in
341 match tok.token with
342 | Token.Block_entry ->
343 skip_token t;
344 if
345 check t (function
346 | Token.Block_entry | Token.Block_end -> true
347 | _ -> false)
348 then begin
349 t.state <- Block_sequence_entry;
350 empty_scalar_event ~anchor:None ~tag:None tok.span
351 end
352 else begin
353 push_state t Block_sequence_entry;
354 parse_node t ~block:true ~indentless:false
355 end
356 | Token.Block_end ->
357 skip_token t;
358 t.state <- pop_state t;
359 (Event.Sequence_end, tok.span)
360 | _ -> Error.raise_span tok.span Expected_block_entry
361
362(** Parse block mapping key *)
363let parse_block_mapping_key t =
364 let tok = current_token t in
365 match tok.token with
366 | Token.Key ->
367 skip_token t;
368 if
369 check t (function
370 | Token.Key | Token.Value | Token.Block_end -> true
371 | _ -> false)
372 then begin
373 t.state <- Block_mapping_value;
374 empty_scalar_event ~anchor:None ~tag:None tok.span
375 end
376 else begin
377 push_state t Block_mapping_value;
378 parse_node t ~block:true ~indentless:true
379 end
380 (* Handle value without explicit key - key is empty/null *)
381 | Token.Value ->
382 t.state <- Block_mapping_value;
383 empty_scalar_event ~anchor:None ~tag:None tok.span
384 | Token.Block_end ->
385 skip_token t;
386 t.state <- pop_state t;
387 (Event.Mapping_end, tok.span)
388 | _ -> Error.raise_span tok.span Expected_key
389
390(** Parse block mapping value *)
391let parse_block_mapping_value t =
392 let tok = current_token t in
393 match tok.token with
394 | Token.Value ->
395 skip_token t;
396 if
397 check t (function
398 | Token.Key | Token.Value | Token.Block_end -> true
399 | _ -> false)
400 then begin
401 t.state <- Block_mapping_key;
402 empty_scalar_event ~anchor:None ~tag:None tok.span
403 end
404 else begin
405 push_state t Block_mapping_key;
406 parse_node t ~block:true ~indentless:true
407 end
408 | _ ->
409 (* Implicit empty value *)
410 t.state <- Block_mapping_key;
411 empty_scalar_event ~anchor:None ~tag:None tok.span
412
413(** Parse indentless sequence entry *)
414let parse_indentless_sequence_entry t =
415 let tok = current_token t in
416 match tok.token with
417 | Token.Block_entry ->
418 skip_token t;
419 if
420 check t (function
421 | Token.Block_entry | Token.Key | Token.Value | Token.Block_end ->
422 true
423 | _ -> false)
424 then begin
425 t.state <- Indentless_sequence_entry;
426 empty_scalar_event ~anchor:None ~tag:None tok.span
427 end
428 else begin
429 push_state t Indentless_sequence_entry;
430 parse_node t ~block:true ~indentless:false
431 end
432 | _ ->
433 t.state <- pop_state t;
434 (Event.Sequence_end, tok.span)
435
436(** Parse flow sequence *)
437let rec parse_flow_sequence_entry t ~first =
438 let tok = current_token t in
439 match tok.token with
440 | Token.Flow_sequence_end ->
441 skip_token t;
442 t.state <- pop_state t;
443 (Event.Sequence_end, tok.span)
444 | Token.Flow_entry when not first ->
445 skip_token t;
446 parse_flow_sequence_entry_internal t
447 | _ when first -> parse_flow_sequence_entry_internal t
448 | _ -> Error.raise_span tok.span Expected_sequence_end
449
450and parse_flow_sequence_entry_internal t =
451 let tok = current_token t in
452 match tok.token with
453 | Token.Flow_sequence_end ->
454 (* Trailing comma case - don't emit empty scalar, just go back to sequence entry state *)
455 skip_token t;
456 t.state <- pop_state t;
457 (Event.Sequence_end, tok.span)
458 | Token.Flow_entry ->
459 (* Double comma or comma after comma - invalid *)
460 Error.raise_span tok.span
461 (Unexpected_token "unexpected ',' in flow sequence")
462 | Token.Key ->
463 skip_token t;
464 t.state <- Flow_sequence_entry_mapping_key;
465 ( Event.Mapping_start
466 { anchor = None; tag = None; implicit = true; style = `Flow },
467 tok.span )
468 | Token.Value ->
469 (* Implicit empty key mapping: [ : value ] *)
470 t.state <- Flow_sequence_entry_mapping_key;
471 ( Event.Mapping_start
472 { anchor = None; tag = None; implicit = true; style = `Flow },
473 tok.span )
474 | _ ->
475 push_state t Flow_sequence_entry;
476 parse_node t ~block:false ~indentless:false
477
478(** Parse flow sequence entry mapping *)
479let parse_flow_sequence_entry_mapping_key t =
480 let tok = current_token t in
481 if
482 check t (function
483 | Token.Value | Token.Flow_entry | Token.Flow_sequence_end -> true
484 | _ -> false)
485 then begin
486 t.state <- Flow_sequence_entry_mapping_value;
487 empty_scalar_event ~anchor:None ~tag:None tok.span
488 end
489 else begin
490 push_state t Flow_sequence_entry_mapping_value;
491 parse_node t ~block:false ~indentless:false
492 end
493
494let parse_flow_sequence_entry_mapping_value t =
495 let tok = current_token t in
496 match tok.token with
497 | Token.Value ->
498 skip_token t;
499 if
500 check t (function
501 | Token.Flow_entry | Token.Flow_sequence_end -> true
502 | _ -> false)
503 then begin
504 t.state <- Flow_sequence_entry_mapping_end;
505 empty_scalar_event ~anchor:None ~tag:None tok.span
506 end
507 else begin
508 push_state t Flow_sequence_entry_mapping_end;
509 parse_node t ~block:false ~indentless:false
510 end
511 | _ ->
512 t.state <- Flow_sequence_entry_mapping_end;
513 empty_scalar_event ~anchor:None ~tag:None tok.span
514
515let parse_flow_sequence_entry_mapping_end t =
516 let tok = current_token t in
517 t.state <- Flow_sequence_entry;
518 (Event.Mapping_end, tok.span)
519
520(** Parse flow mapping *)
521let rec parse_flow_mapping_key t ~first =
522 let tok = current_token t in
523 match tok.token with
524 | Token.Flow_mapping_end ->
525 skip_token t;
526 t.state <- pop_state t;
527 (Event.Mapping_end, tok.span)
528 | Token.Flow_entry when not first ->
529 skip_token t;
530 parse_flow_mapping_key_internal t
531 | _ when first -> parse_flow_mapping_key_internal t
532 | _ -> Error.raise_span tok.span Expected_mapping_end
533
534and parse_flow_mapping_key_internal t =
535 let tok = current_token t in
536 match tok.token with
537 | Token.Flow_mapping_end ->
538 (* Trailing comma case - don't emit empty scalar, just return to key state *)
539 skip_token t;
540 t.state <- pop_state t;
541 (Event.Mapping_end, tok.span)
542 | Token.Flow_entry ->
543 (* Double comma or comma after comma - invalid *)
544 Error.raise_span tok.span
545 (Unexpected_token "unexpected ',' in flow mapping")
546 | Token.Key ->
547 skip_token t;
548 if
549 check t (function
550 | Token.Value | Token.Flow_entry | Token.Flow_mapping_end -> true
551 | _ -> false)
552 then begin
553 t.state <- Flow_mapping_value;
554 empty_scalar_event ~anchor:None ~tag:None tok.span
555 end
556 else begin
557 push_state t Flow_mapping_value;
558 parse_node t ~block:false ~indentless:false
559 end
560 | _ ->
561 push_state t Flow_mapping_value;
562 parse_node t ~block:false ~indentless:false
563
564let parse_flow_mapping_value t ~empty =
565 let tok = current_token t in
566 if empty then begin
567 t.state <- Flow_mapping_key;
568 empty_scalar_event ~anchor:None ~tag:None tok.span
569 end
570 else
571 match tok.token with
572 | Token.Value ->
573 skip_token t;
574 if
575 check t (function
576 | Token.Flow_entry | Token.Flow_mapping_end -> true
577 | _ -> false)
578 then begin
579 t.state <- Flow_mapping_key;
580 empty_scalar_event ~anchor:None ~tag:None tok.span
581 end
582 else begin
583 push_state t Flow_mapping_key;
584 parse_node t ~block:false ~indentless:false
585 end
586 | _ ->
587 t.state <- Flow_mapping_key;
588 empty_scalar_event ~anchor:None ~tag:None tok.span
589
590(** Main state machine dispatcher *)
591let rec parse t =
592 match t.state with
593 | Stream_start -> parse_stream_start t
594 | Implicit_document_start -> (
595 (* Skip any document end markers before checking what's next *)
596 while check t (function Token.Document_end -> true | _ -> false) do
597 t.explicit_doc_end <- true;
598 (* Seeing ... counts as explicit end *)
599 skip_token t
600 done;
601
602 let tok = current_token t in
603 match tok.token with
604 | Token.Stream_end ->
605 skip_token t;
606 t.state <- End;
607 t.finished <- true;
608 (Event.Stream_end, tok.span)
609 | Token.Version_directive _ | Token.Tag_directive _ ->
610 (* Directives are only allowed at stream start or after explicit ... (MUS6/01) *)
611 if (not t.stream_start) && not t.explicit_doc_end then
612 Error.raise_span tok.span
613 (Invalid_directive
614 "directives require explicit document end '...' before them");
615 parse_document_start t ~implicit:false
616 | Token.Document_start -> parse_document_start t ~implicit:false
617 (* These tokens are invalid at document start - they indicate leftover junk *)
618 | Token.Flow_sequence_end | Token.Flow_mapping_end | Token.Flow_entry
619 | Token.Block_end | Token.Value ->
620 Error.raise_span tok.span
621 (Unexpected_token "unexpected token at document start")
622 | _ -> parse_document_start t ~implicit:true)
623
624 | Document_content ->
625 if
626 check t (function
627 | Token.Version_directive _ | Token.Tag_directive _
628 | Token.Document_start | Token.Document_end | Token.Stream_end ->
629 true
630 | _ -> false)
631 then begin
632 let tok = current_token t in
633 t.state <- pop_state t;
634 empty_scalar_event ~anchor:None ~tag:None tok.span
635 end
636 else begin
637 (* Push Document_content_done so we return there after parsing the node.
638 This allows us to check for unexpected content after the node. *)
639 push_state t Document_content_done;
640 parse_node t ~block:true ~indentless:false
641 end
642 | Document_content_done ->
643 (* After parsing a node in document content, check for unexpected content *)
644 if
645 check t (function
646 | Token.Version_directive _ | Token.Tag_directive _
647 | Token.Document_start | Token.Document_end | Token.Stream_end ->
648 true
649 | _ -> false)
650 then begin
651 (* Valid document boundary - continue to Document_end *)
652 t.state <- pop_state t;
653 parse t (* Continue to emit the next event *)
654 end
655 else begin
656 (* Unexpected content after document value - this is an error (KS4U, BS4K) *)
657 let tok = current_token t in
658 Error.raise_span tok.span
659 (Unexpected_token "content not allowed after document value")
660 end
661 | Document_end -> parse_document_end t
662
663 | Block_sequence_first_entry ->
664 t.state <- Block_sequence_entry;
665 parse_block_sequence_entry t
666 | Block_sequence_entry -> parse_block_sequence_entry t
667 | Indentless_sequence_entry -> parse_indentless_sequence_entry t
668 | Block_mapping_first_key ->
669 t.state <- Block_mapping_key;
670 parse_block_mapping_key t
671 | Block_mapping_key -> parse_block_mapping_key t
672 | Block_mapping_value -> parse_block_mapping_value t
673 | Flow_sequence_first_entry -> parse_flow_sequence_entry t ~first:true
674 | Flow_sequence_entry -> parse_flow_sequence_entry t ~first:false
675 | Flow_sequence_entry_mapping_key -> parse_flow_sequence_entry_mapping_key t
676 | Flow_sequence_entry_mapping_value ->
677 parse_flow_sequence_entry_mapping_value t
678 | Flow_sequence_entry_mapping_end -> parse_flow_sequence_entry_mapping_end t
679 | Flow_mapping_first_key -> parse_flow_mapping_key t ~first:true
680 | Flow_mapping_key -> parse_flow_mapping_key t ~first:false
681 | Flow_mapping_value -> parse_flow_mapping_value t ~empty:false
682
683 | End ->
684 let span = Span.point Position.initial in
685 t.finished <- true;
686 (Event.Stream_end, span)
687
688(** Get next event *)
689let next t =
690 if t.finished then None
691 else
692 let event, span = parse t in
693 Some { Event.event; span }
694
695(** Iterate over all events *)
696let iter f t =
697 let rec loop () =
698 match next t with
699 | None -> ()
700 | Some ev ->
701 f ev;
702 loop ()
703 in
704 loop ()
705
706(** Fold over all events *)
707let fold f init t =
708 let rec loop acc =
709 match next t with None -> acc | Some ev -> loop (f acc ev)
710 in
711 loop init
712
713(** Convert to list *)
714let to_list t = fold (fun acc ev -> ev :: acc) [] t |> List.rev