Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(** YAML parser - converts tokens to semantic events via state machine *) 7 8(** Parser states *) 9type state = 10 | Stream_start 11 | Implicit_document_start 12 | Document_content 13 | Document_content_done (* After parsing a node, check for unexpected content *) 14 | Document_end 15 | Block_sequence_first_entry 16 | Block_sequence_entry 17 | Indentless_sequence_entry 18 | Block_mapping_first_key 19 | Block_mapping_key 20 | Block_mapping_value 21 | Flow_sequence_first_entry 22 | Flow_sequence_entry 23 | Flow_sequence_entry_mapping_key 24 | Flow_sequence_entry_mapping_value 25 | Flow_sequence_entry_mapping_end 26 | Flow_mapping_first_key 27 | Flow_mapping_key 28 | Flow_mapping_value 29 | End 30 31type t = { 32 scanner : Scanner.t; 33 mutable state : state; 34 mutable states : state list; (** State stack *) 35 mutable version : (int * int) option; 36 mutable tag_directives : (string * string) list; 37 mutable current_token : Token.spanned option; 38 mutable finished : bool; 39 mutable explicit_doc_end : bool; (** True if last doc ended with explicit ... *) 40 mutable stream_start : bool; (** True if we haven't emitted any documents yet *) 41} 42 43let create scanner = { 44 scanner; 45 state = Stream_start; 46 states = []; 47 version = None; 48 tag_directives = [ 49 ("!", "!"); 50 ("!!", "tag:yaml.org,2002:"); 51 ]; 52 current_token = None; 53 finished = false; 54 explicit_doc_end = false; 55 stream_start = true; 56} 57 58let of_string s = create (Scanner.of_string s) 59let of_scanner = create 60let of_input i = create (Scanner.of_input i) 61let of_reader r = create (Scanner.of_reader r) 62 63(** Get current token, fetching if needed *) 64let current_token t = 65 match t.current_token with 66 | Some tok -> tok 67 | None -> 68 let tok = Scanner.next t.scanner in 69 t.current_token <- tok; 70 match tok with 71 | Some tok -> tok 72 | None -> Error.raise Unexpected_eof 73 74(** Peek at current token *) 75let peek_token t = 76 match t.current_token with 77 | Some _ -> t.current_token 78 | None -> 79 t.current_token <- Scanner.next t.scanner; 80 t.current_token 81 82(** Skip current token *) 83let skip_token t = 84 t.current_token <- None 85 86(** Check if current token matches predicate *) 87let check t pred = 88 match peek_token t with 89 | Some tok -> pred tok.token 90 | None -> false 91 92(** Push state onto stack *) 93let push_state t s = 94 t.states <- s :: t.states 95 96(** Pop state from stack *) 97let pop_state t = 98 match t.states with 99 | s :: rest -> 100 t.states <- rest; 101 s 102 | [] -> End 103 104(** Resolve a tag *) 105let resolve_tag t ~handle ~suffix = 106 if handle = "" then 107 (* Verbatim tag - suffix is already the full URI *) 108 suffix 109 else 110 match List.assoc_opt handle t.tag_directives with 111 | Some prefix -> prefix ^ suffix 112 | None when handle = "!" -> "!" ^ suffix 113 | None -> Error.raise (Invalid_tag (handle ^ suffix)) 114 115(** Process directives at document start *) 116let process_directives t = 117 t.version <- None; 118 t.tag_directives <- [("!", "!"); ("!!", "tag:yaml.org,2002:")]; 119 120 while check t (function 121 | Token.Version_directive _ | Token.Tag_directive _ -> true 122 | _ -> false) 123 do 124 let tok = current_token t in 125 skip_token t; 126 match tok.token with 127 | Token.Version_directive { major; minor } -> 128 if t.version <> None then 129 Error.raise_span tok.span (Invalid_yaml_version "duplicate YAML directive"); 130 t.version <- Some (major, minor) 131 | Token.Tag_directive { handle; prefix } -> 132 (* Skip empty tag directives (these are reserved/unknown directives that were ignored) *) 133 if handle = "" && prefix = "" then 134 () (* Ignore reserved directives *) 135 else begin 136 if List.mem_assoc handle t.tag_directives && 137 handle <> "!" && handle <> "!!" then 138 Error.raise_span tok.span (Invalid_tag_directive ("duplicate tag handle: " ^ handle)); 139 t.tag_directives <- (handle, prefix) :: t.tag_directives 140 end 141 | _ -> () 142 done 143 144(** Parse anchor and/or tag properties *) 145let parse_properties t = 146 let anchor = ref None in 147 let tag = ref None in 148 149 while check t (function 150 | Token.Anchor _ | Token.Tag _ -> true 151 | _ -> false) 152 do 153 let tok = current_token t in 154 skip_token t; 155 match tok.token with 156 | Token.Anchor name -> 157 if Option.is_some !anchor then 158 Error.raise_span tok.span (Duplicate_anchor name); 159 anchor := Some name 160 | Token.Tag { handle; suffix } -> 161 if Option.is_some !tag then 162 Error.raise_span tok.span (Invalid_tag "duplicate tag"); 163 let resolved = 164 if handle = "" && suffix = "" then None 165 else if handle = "!" && suffix = "" then Some "!" 166 else Some (resolve_tag t ~handle ~suffix) 167 in 168 tag := resolved 169 | _ -> () 170 done; 171 (!anchor, !tag) 172 173(** Empty scalar event *) 174let empty_scalar_event ~anchor ~tag span = 175 Event.Scalar { 176 anchor; 177 tag; 178 value = ""; 179 plain_implicit = tag = None; 180 quoted_implicit = false; 181 style = `Plain; 182 }, span 183 184(** Parse stream start *) 185let parse_stream_start t = 186 let tok = current_token t in 187 skip_token t; 188 match tok.token with 189 | Token.Stream_start encoding -> 190 t.state <- Implicit_document_start; 191 Event.Stream_start { encoding }, tok.span 192 | _ -> 193 Error.raise_span tok.span (Unexpected_token "expected stream start") 194 195(** Parse document start (implicit or explicit) *) 196let parse_document_start t ~implicit = 197 process_directives t; 198 199 if not implicit then begin 200 let tok = current_token t in 201 match tok.token with 202 | Token.Document_start -> 203 skip_token t 204 | _ -> 205 Error.raise_span tok.span Expected_document_start 206 end; 207 208 let span = match peek_token t with 209 | Some tok -> tok.span 210 | None -> Span.point Position.initial 211 in 212 213 (* After first document, stream_start is false *) 214 t.stream_start <- false; 215 push_state t Document_end; 216 t.state <- Document_content; 217 Event.Document_start { version = t.version; implicit }, span 218 219(** Parse document end *) 220let parse_document_end t = 221 let implicit = not (check t (function Token.Document_end -> true | _ -> false)) in 222 let span = match peek_token t with 223 | Some tok -> tok.span 224 | None -> Span.point Position.initial 225 in 226 227 if not implicit then skip_token t; 228 229 (* Track if this document ended explicitly with ... *) 230 t.explicit_doc_end <- not implicit; 231 t.state <- Implicit_document_start; 232 Event.Document_end { implicit }, span 233 234(** Parse node in various contexts *) 235let parse_node t ~block ~indentless = 236 let tok = current_token t in 237 match tok.token with 238 | Token.Alias name -> 239 skip_token t; 240 t.state <- pop_state t; 241 Event.Alias { anchor = name }, tok.span 242 243 | Token.Anchor _ | Token.Tag _ -> 244 let anchor, tag = parse_properties t in 245 let tok = current_token t in 246 (match tok.token with 247 | Token.Block_entry when indentless -> 248 t.state <- Indentless_sequence_entry; 249 Event.Sequence_start { 250 anchor; tag; 251 implicit = tag = None; 252 style = `Block; 253 }, tok.span 254 255 | Token.Block_sequence_start when block -> 256 t.state <- Block_sequence_first_entry; 257 skip_token t; 258 Event.Sequence_start { 259 anchor; tag; 260 implicit = tag = None; 261 style = `Block; 262 }, tok.span 263 264 | Token.Block_mapping_start when block -> 265 t.state <- Block_mapping_first_key; 266 skip_token t; 267 Event.Mapping_start { 268 anchor; tag; 269 implicit = tag = None; 270 style = `Block; 271 }, tok.span 272 273 | Token.Flow_sequence_start -> 274 t.state <- Flow_sequence_first_entry; 275 skip_token t; 276 Event.Sequence_start { 277 anchor; tag; 278 implicit = tag = None; 279 style = `Flow; 280 }, tok.span 281 282 | Token.Flow_mapping_start -> 283 t.state <- Flow_mapping_first_key; 284 skip_token t; 285 Event.Mapping_start { 286 anchor; tag; 287 implicit = tag = None; 288 style = `Flow; 289 }, tok.span 290 291 | Token.Scalar { style; value } -> 292 skip_token t; 293 t.state <- pop_state t; 294 let plain_implicit = tag = None && style = `Plain in 295 let quoted_implicit = tag = None && style <> `Plain in 296 Event.Scalar { 297 anchor; tag; value; 298 plain_implicit; quoted_implicit; style; 299 }, tok.span 300 301 | _ -> 302 (* Empty node *) 303 t.state <- pop_state t; 304 empty_scalar_event ~anchor ~tag tok.span) 305 306 | Token.Block_sequence_start when block -> 307 t.state <- Block_sequence_first_entry; 308 skip_token t; 309 Event.Sequence_start { 310 anchor = None; tag = None; 311 implicit = true; 312 style = `Block; 313 }, tok.span 314 315 | Token.Block_mapping_start when block -> 316 t.state <- Block_mapping_first_key; 317 skip_token t; 318 Event.Mapping_start { 319 anchor = None; tag = None; 320 implicit = true; 321 style = `Block; 322 }, tok.span 323 324 | Token.Flow_sequence_start -> 325 t.state <- Flow_sequence_first_entry; 326 skip_token t; 327 Event.Sequence_start { 328 anchor = None; tag = None; 329 implicit = true; 330 style = `Flow; 331 }, tok.span 332 333 | Token.Flow_mapping_start -> 334 t.state <- Flow_mapping_first_key; 335 skip_token t; 336 Event.Mapping_start { 337 anchor = None; tag = None; 338 implicit = true; 339 style = `Flow; 340 }, tok.span 341 342 | Token.Block_entry when indentless -> 343 t.state <- Indentless_sequence_entry; 344 Event.Sequence_start { 345 anchor = None; tag = None; 346 implicit = true; 347 style = `Block; 348 }, tok.span 349 350 | Token.Scalar { style; value } -> 351 skip_token t; 352 t.state <- pop_state t; 353 let plain_implicit = style = `Plain in 354 let quoted_implicit = style <> `Plain in 355 Event.Scalar { 356 anchor = None; tag = None; value; 357 plain_implicit; quoted_implicit; style; 358 }, tok.span 359 360 | _ -> 361 (* Empty node *) 362 t.state <- pop_state t; 363 empty_scalar_event ~anchor:None ~tag:None tok.span 364 365(** Parse block sequence entry *) 366let parse_block_sequence_entry t = 367 let tok = current_token t in 368 match tok.token with 369 | Token.Block_entry -> 370 skip_token t; 371 if check t (function 372 | Token.Block_entry | Token.Block_end -> true 373 | _ -> false) 374 then begin 375 t.state <- Block_sequence_entry; 376 empty_scalar_event ~anchor:None ~tag:None tok.span 377 end else begin 378 push_state t Block_sequence_entry; 379 parse_node t ~block:true ~indentless:false 380 end 381 | Token.Block_end -> 382 skip_token t; 383 t.state <- pop_state t; 384 Event.Sequence_end, tok.span 385 | _ -> 386 Error.raise_span tok.span Expected_block_entry 387 388(** Parse block mapping key *) 389let parse_block_mapping_key t = 390 let tok = current_token t in 391 match tok.token with 392 | Token.Key -> 393 skip_token t; 394 if check t (function 395 | Token.Key | Token.Value | Token.Block_end -> true 396 | _ -> false) 397 then begin 398 t.state <- Block_mapping_value; 399 empty_scalar_event ~anchor:None ~tag:None tok.span 400 end else begin 401 push_state t Block_mapping_value; 402 parse_node t ~block:true ~indentless:true 403 end 404 (* Handle value without explicit key - key is empty/null *) 405 | Token.Value -> 406 t.state <- Block_mapping_value; 407 empty_scalar_event ~anchor:None ~tag:None tok.span 408 | Token.Block_end -> 409 skip_token t; 410 t.state <- pop_state t; 411 Event.Mapping_end, tok.span 412 | _ -> 413 Error.raise_span tok.span Expected_key 414 415(** Parse block mapping value *) 416let parse_block_mapping_value t = 417 let tok = current_token t in 418 match tok.token with 419 | Token.Value -> 420 skip_token t; 421 if check t (function 422 | Token.Key | Token.Value | Token.Block_end -> true 423 | _ -> false) 424 then begin 425 t.state <- Block_mapping_key; 426 empty_scalar_event ~anchor:None ~tag:None tok.span 427 end else begin 428 push_state t Block_mapping_key; 429 parse_node t ~block:true ~indentless:true 430 end 431 | _ -> 432 (* Implicit empty value *) 433 t.state <- Block_mapping_key; 434 empty_scalar_event ~anchor:None ~tag:None tok.span 435 436(** Parse indentless sequence entry *) 437let parse_indentless_sequence_entry t = 438 let tok = current_token t in 439 match tok.token with 440 | Token.Block_entry -> 441 skip_token t; 442 if check t (function 443 | Token.Block_entry | Token.Key | Token.Value | Token.Block_end -> true 444 | _ -> false) 445 then begin 446 t.state <- Indentless_sequence_entry; 447 empty_scalar_event ~anchor:None ~tag:None tok.span 448 end else begin 449 push_state t Indentless_sequence_entry; 450 parse_node t ~block:true ~indentless:false 451 end 452 | _ -> 453 t.state <- pop_state t; 454 Event.Sequence_end, tok.span 455 456(** Parse flow sequence *) 457let rec parse_flow_sequence_entry t ~first = 458 let tok = current_token t in 459 match tok.token with 460 | Token.Flow_sequence_end -> 461 skip_token t; 462 t.state <- pop_state t; 463 Event.Sequence_end, tok.span 464 | Token.Flow_entry when not first -> 465 skip_token t; 466 parse_flow_sequence_entry_internal t 467 | _ when first -> 468 parse_flow_sequence_entry_internal t 469 | _ -> 470 Error.raise_span tok.span Expected_sequence_end 471 472and parse_flow_sequence_entry_internal t = 473 let tok = current_token t in 474 match tok.token with 475 | Token.Flow_sequence_end -> 476 (* Trailing comma case - don't emit empty scalar, just go back to sequence entry state *) 477 skip_token t; 478 t.state <- pop_state t; 479 Event.Sequence_end, tok.span 480 | Token.Flow_entry -> 481 (* Double comma or comma after comma - invalid *) 482 Error.raise_span tok.span (Unexpected_token "unexpected ',' in flow sequence") 483 | Token.Key -> 484 skip_token t; 485 t.state <- Flow_sequence_entry_mapping_key; 486 Event.Mapping_start { 487 anchor = None; tag = None; 488 implicit = true; 489 style = `Flow; 490 }, tok.span 491 | Token.Value -> 492 (* Implicit empty key mapping: [ : value ] *) 493 t.state <- Flow_sequence_entry_mapping_key; 494 Event.Mapping_start { 495 anchor = None; tag = None; 496 implicit = true; 497 style = `Flow; 498 }, tok.span 499 | _ -> 500 push_state t Flow_sequence_entry; 501 parse_node t ~block:false ~indentless:false 502 503(** Parse flow sequence entry mapping *) 504let parse_flow_sequence_entry_mapping_key t = 505 let tok = current_token t in 506 if check t (function 507 | Token.Value | Token.Flow_entry | Token.Flow_sequence_end -> true 508 | _ -> false) 509 then begin 510 t.state <- Flow_sequence_entry_mapping_value; 511 empty_scalar_event ~anchor:None ~tag:None tok.span 512 end else begin 513 push_state t Flow_sequence_entry_mapping_value; 514 parse_node t ~block:false ~indentless:false 515 end 516 517let parse_flow_sequence_entry_mapping_value t = 518 let tok = current_token t in 519 match tok.token with 520 | Token.Value -> 521 skip_token t; 522 if check t (function 523 | Token.Flow_entry | Token.Flow_sequence_end -> true 524 | _ -> false) 525 then begin 526 t.state <- Flow_sequence_entry_mapping_end; 527 empty_scalar_event ~anchor:None ~tag:None tok.span 528 end else begin 529 push_state t Flow_sequence_entry_mapping_end; 530 parse_node t ~block:false ~indentless:false 531 end 532 | _ -> 533 t.state <- Flow_sequence_entry_mapping_end; 534 empty_scalar_event ~anchor:None ~tag:None tok.span 535 536let parse_flow_sequence_entry_mapping_end t = 537 let tok = current_token t in 538 t.state <- Flow_sequence_entry; 539 Event.Mapping_end, tok.span 540 541(** Parse flow mapping *) 542let rec parse_flow_mapping_key t ~first = 543 let tok = current_token t in 544 match tok.token with 545 | Token.Flow_mapping_end -> 546 skip_token t; 547 t.state <- pop_state t; 548 Event.Mapping_end, tok.span 549 | Token.Flow_entry when not first -> 550 skip_token t; 551 parse_flow_mapping_key_internal t 552 | _ when first -> 553 parse_flow_mapping_key_internal t 554 | _ -> 555 Error.raise_span tok.span Expected_mapping_end 556 557and parse_flow_mapping_key_internal t = 558 let tok = current_token t in 559 match tok.token with 560 | Token.Flow_mapping_end -> 561 (* Trailing comma case - don't emit empty scalar, just return to key state *) 562 skip_token t; 563 t.state <- pop_state t; 564 Event.Mapping_end, tok.span 565 | Token.Flow_entry -> 566 (* Double comma or comma after comma - invalid *) 567 Error.raise_span tok.span (Unexpected_token "unexpected ',' in flow mapping") 568 | Token.Key -> 569 skip_token t; 570 if check t (function 571 | Token.Value | Token.Flow_entry | Token.Flow_mapping_end -> true 572 | _ -> false) 573 then begin 574 t.state <- Flow_mapping_value; 575 empty_scalar_event ~anchor:None ~tag:None tok.span 576 end else begin 577 push_state t Flow_mapping_value; 578 parse_node t ~block:false ~indentless:false 579 end 580 | _ -> 581 push_state t Flow_mapping_value; 582 parse_node t ~block:false ~indentless:false 583 584let parse_flow_mapping_value t ~empty = 585 let tok = current_token t in 586 if empty then begin 587 t.state <- Flow_mapping_key; 588 empty_scalar_event ~anchor:None ~tag:None tok.span 589 end else 590 match tok.token with 591 | Token.Value -> 592 skip_token t; 593 if check t (function 594 | Token.Flow_entry | Token.Flow_mapping_end -> true 595 | _ -> false) 596 then begin 597 t.state <- Flow_mapping_key; 598 empty_scalar_event ~anchor:None ~tag:None tok.span 599 end else begin 600 push_state t Flow_mapping_key; 601 parse_node t ~block:false ~indentless:false 602 end 603 | _ -> 604 t.state <- Flow_mapping_key; 605 empty_scalar_event ~anchor:None ~tag:None tok.span 606 607(** Main state machine dispatcher *) 608let rec parse t = 609 match t.state with 610 | Stream_start -> 611 parse_stream_start t 612 613 | Implicit_document_start -> 614 (* Skip any document end markers before checking what's next *) 615 while check t (function Token.Document_end -> true | _ -> false) do 616 t.explicit_doc_end <- true; (* Seeing ... counts as explicit end *) 617 skip_token t 618 done; 619 620 let tok = current_token t in 621 (match tok.token with 622 | Token.Stream_end -> 623 skip_token t; 624 t.state <- End; 625 t.finished <- true; 626 Event.Stream_end, tok.span 627 | Token.Version_directive _ | Token.Tag_directive _ -> 628 (* Directives are only allowed at stream start or after explicit ... (MUS6/01) *) 629 if not t.stream_start && not t.explicit_doc_end then 630 Error.raise_span tok.span (Invalid_directive "directives require explicit document end '...' before them"); 631 parse_document_start t ~implicit:false 632 | Token.Document_start -> 633 parse_document_start t ~implicit:false 634 (* These tokens are invalid at document start - they indicate leftover junk *) 635 | Token.Flow_sequence_end | Token.Flow_mapping_end | Token.Flow_entry 636 | Token.Block_end | Token.Value -> 637 Error.raise_span tok.span (Unexpected_token "unexpected token at document start") 638 | _ -> 639 parse_document_start t ~implicit:true) 640 641 | Document_content -> 642 if check t (function 643 | Token.Version_directive _ | Token.Tag_directive _ 644 | Token.Document_start | Token.Document_end | Token.Stream_end -> true 645 | _ -> false) 646 then begin 647 let tok = current_token t in 648 t.state <- pop_state t; 649 empty_scalar_event ~anchor:None ~tag:None tok.span 650 end else begin 651 (* Push Document_content_done so we return there after parsing the node. 652 This allows us to check for unexpected content after the node. *) 653 push_state t Document_content_done; 654 parse_node t ~block:true ~indentless:false 655 end 656 657 | Document_content_done -> 658 (* After parsing a node in document content, check for unexpected content *) 659 if check t (function 660 | Token.Version_directive _ | Token.Tag_directive _ 661 | Token.Document_start | Token.Document_end | Token.Stream_end -> true 662 | _ -> false) 663 then begin 664 (* Valid document boundary - continue to Document_end *) 665 t.state <- pop_state t; 666 parse t (* Continue to emit the next event *) 667 end else begin 668 (* Unexpected content after document value - this is an error (KS4U, BS4K) *) 669 let tok = current_token t in 670 Error.raise_span tok.span 671 (Unexpected_token "content not allowed after document value") 672 end 673 674 | Document_end -> 675 parse_document_end t 676 677 | Block_sequence_first_entry -> 678 t.state <- Block_sequence_entry; 679 parse_block_sequence_entry t 680 681 | Block_sequence_entry -> 682 parse_block_sequence_entry t 683 684 | Indentless_sequence_entry -> 685 parse_indentless_sequence_entry t 686 687 | Block_mapping_first_key -> 688 t.state <- Block_mapping_key; 689 parse_block_mapping_key t 690 691 | Block_mapping_key -> 692 parse_block_mapping_key t 693 694 | Block_mapping_value -> 695 parse_block_mapping_value t 696 697 | Flow_sequence_first_entry -> 698 parse_flow_sequence_entry t ~first:true 699 700 | Flow_sequence_entry -> 701 parse_flow_sequence_entry t ~first:false 702 703 | Flow_sequence_entry_mapping_key -> 704 parse_flow_sequence_entry_mapping_key t 705 706 | Flow_sequence_entry_mapping_value -> 707 parse_flow_sequence_entry_mapping_value t 708 709 | Flow_sequence_entry_mapping_end -> 710 parse_flow_sequence_entry_mapping_end t 711 712 | Flow_mapping_first_key -> 713 parse_flow_mapping_key t ~first:true 714 715 | Flow_mapping_key -> 716 parse_flow_mapping_key t ~first:false 717 718 | Flow_mapping_value -> 719 parse_flow_mapping_value t ~empty:false 720 721 | End -> 722 let span = Span.point Position.initial in 723 t.finished <- true; 724 Event.Stream_end, span 725 726(** Get next event *) 727let next t = 728 if t.finished then None 729 else 730 let event, span = parse t in 731 Some { Event.event; span } 732 733(** Iterate over all events *) 734let iter f t = 735 let rec loop () = 736 match next t with 737 | None -> () 738 | Some ev -> f ev; loop () 739 in 740 loop () 741 742(** Fold over all events *) 743let fold f init t = 744 let rec loop acc = 745 match next t with 746 | None -> acc 747 | Some ev -> loop (f acc ev) 748 in 749 loop init 750 751(** Convert to list *) 752let to_list t = 753 fold (fun acc ev -> ev :: acc) [] t |> List.rev