Pure OCaml Yaml 1.2 reader and writer using Bytesrw
at main 23 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(** YAML parser - converts tokens to semantic events via state machine *) 7 8(** Parser states *) 9type state = 10 | Stream_start 11 | Implicit_document_start 12 | Document_content 13 | Document_content_done 14 (* After parsing a node, check for unexpected content *) 15 | Document_end 16 | Block_sequence_first_entry 17 | Block_sequence_entry 18 | Indentless_sequence_entry 19 | Block_mapping_first_key 20 | Block_mapping_key 21 | Block_mapping_value 22 | Flow_sequence_first_entry 23 | Flow_sequence_entry 24 | Flow_sequence_entry_mapping_key 25 | Flow_sequence_entry_mapping_value 26 | Flow_sequence_entry_mapping_end 27 | Flow_mapping_first_key 28 | Flow_mapping_key 29 | Flow_mapping_value 30 | End 31 32type t = { 33 scanner : Scanner.t; 34 mutable state : state; 35 mutable states : state list; (** State stack *) 36 mutable version : (int * int) option; 37 mutable tag_directives : (string * string) list; 38 mutable current_token : Token.spanned option; 39 mutable finished : bool; 40 mutable explicit_doc_end : bool; 41 (** True if last doc ended with explicit ... *) 42 mutable stream_start : bool; 43 (** True if we haven't emitted any documents yet *) 44} 45 46let create scanner = 47 { 48 scanner; 49 state = Stream_start; 50 states = []; 51 version = None; 52 tag_directives = [ ("!", "!"); ("!!", "tag:yaml.org,2002:") ]; 53 current_token = None; 54 finished = false; 55 explicit_doc_end = false; 56 stream_start = true; 57 } 58 59let of_string s = create (Scanner.of_string s) 60let of_scanner = create 61let of_input i = create (Scanner.of_input i) 62let of_reader r = create (Scanner.of_reader r) 63 64(** Get current token, fetching if needed *) 65let current_token t = 66 match t.current_token with 67 | Some tok -> tok 68 | None -> ( 69 let tok = Scanner.next t.scanner in 70 t.current_token <- tok; 71 match tok with Some tok -> tok | None -> Error.raise Unexpected_eof) 72 73(** Peek at current token *) 74let peek_token t = 75 match t.current_token with 76 | Some _ -> t.current_token 77 | None -> 78 t.current_token <- Scanner.next t.scanner; 79 t.current_token 80 81(** Skip current token *) 82let skip_token t = t.current_token <- None 83 84(** Check if current token matches predicate *) 85let check t pred = 86 match peek_token t with Some tok -> pred tok.token | None -> false 87 88 89(** Push state onto stack *) 90let push_state t s = t.states <- s :: t.states 91 92(** Pop state from stack *) 93let pop_state t = 94 match t.states with 95 | s :: rest -> 96 t.states <- rest; 97 s 98 | [] -> End 99 100(** Resolve a tag *) 101let resolve_tag t ~handle ~suffix = 102 if handle = "" then 103 (* Verbatim tag - suffix is already the full URI *) 104 suffix 105 else 106 match List.assoc_opt handle t.tag_directives with 107 | Some prefix -> prefix ^ suffix 108 | None when handle = "!" -> "!" ^ suffix 109 | None -> Error.raise (Invalid_tag (handle ^ suffix)) 110 111(** Process directives at document start *) 112let process_directives t = 113 t.version <- None; 114 t.tag_directives <- [ ("!", "!"); ("!!", "tag:yaml.org,2002:") ]; 115 116 while 117 check t (function 118 | Token.Version_directive _ | Token.Tag_directive _ -> true 119 | _ -> false) 120 do 121 let tok = current_token t in 122 skip_token t; 123 match tok.token with 124 | Token.Version_directive { major; minor } -> 125 if t.version <> None then 126 Error.raise_span tok.span 127 (Invalid_yaml_version "duplicate YAML directive"); 128 t.version <- Some (major, minor) 129 | Token.Tag_directive { handle; prefix } -> 130 (* Skip empty tag directives (these are reserved/unknown directives that were ignored) *) 131 if handle = "" && prefix = "" then () (* Ignore reserved directives *) 132 else begin 133 if 134 List.mem_assoc handle t.tag_directives 135 && handle <> "!" && handle <> "!!" 136 then 137 Error.raise_span tok.span 138 (Invalid_tag_directive ("duplicate tag handle: " ^ handle)); 139 t.tag_directives <- (handle, prefix) :: t.tag_directives 140 end 141 | _ -> () 142 done 143 144(** Parse anchor and/or tag properties *) 145let parse_properties t = 146 let anchor = ref None in 147 let tag = ref None in 148 149 while 150 check t (function Token.Anchor _ | Token.Tag _ -> true | _ -> false) 151 do 152 let tok = current_token t in 153 skip_token t; 154 match tok.token with 155 | Token.Anchor name -> 156 if Option.is_some !anchor then 157 Error.raise_span tok.span (Duplicate_anchor name); 158 anchor := Some name 159 | Token.Tag { handle; suffix } -> 160 if Option.is_some !tag then 161 Error.raise_span tok.span (Invalid_tag "duplicate tag"); 162 let resolved = 163 if handle = "" && suffix = "" then None 164 else if handle = "!" && suffix = "" then Some "!" 165 else Some (resolve_tag t ~handle ~suffix) 166 in 167 tag := resolved 168 | _ -> () 169 done; 170 (!anchor, !tag) 171 172(** Empty scalar event *) 173let empty_scalar_event ~anchor ~tag span = 174 ( Event.Scalar 175 { 176 anchor; 177 tag; 178 value = ""; 179 plain_implicit = tag = None; 180 quoted_implicit = false; 181 style = `Plain; 182 }, 183 span ) 184 185(** Parse stream start *) 186let parse_stream_start t = 187 let tok = current_token t in 188 skip_token t; 189 match tok.token with 190 | Token.Stream_start encoding -> 191 t.state <- Implicit_document_start; 192 (Event.Stream_start { encoding }, tok.span) 193 | _ -> Error.raise_span tok.span (Unexpected_token "expected stream start") 194 195(** Parse document start (implicit or explicit) *) 196let parse_document_start t ~implicit = 197 process_directives t; 198 199 if not implicit then begin 200 let tok = current_token t in 201 match tok.token with 202 | Token.Document_start -> skip_token t 203 | _ -> Error.raise_span tok.span Expected_document_start 204 end; 205 206 let span = 207 match peek_token t with 208 | Some tok -> tok.span 209 | None -> Span.point Position.initial 210 in 211 212 (* After first document, stream_start is false *) 213 t.stream_start <- false; 214 push_state t Document_end; 215 t.state <- Document_content; 216 (Event.Document_start { version = t.version; implicit }, span) 217 218(** Parse document end *) 219let parse_document_end t = 220 let implicit = 221 not (check t (function Token.Document_end -> true | _ -> false)) 222 in 223 let span = 224 match peek_token t with 225 | Some tok -> tok.span 226 | None -> Span.point Position.initial 227 in 228 229 if not implicit then skip_token t; 230 231 (* Track if this document ended explicitly with ... *) 232 t.explicit_doc_end <- not implicit; 233 t.state <- Implicit_document_start; 234 (Event.Document_end { implicit }, span) 235 236(** Parse node in various contexts *) 237let parse_node t ~block ~indentless = 238 let tok = current_token t in 239 match tok.token with 240 | Token.Alias name -> 241 skip_token t; 242 t.state <- pop_state t; 243 (Event.Alias { anchor = name }, tok.span) 244 | Token.Anchor _ | Token.Tag _ -> ( 245 let anchor, tag = parse_properties t in 246 let tok = current_token t in 247 match tok.token with 248 | Token.Block_entry when indentless -> 249 t.state <- Indentless_sequence_entry; 250 ( Event.Sequence_start 251 { anchor; tag; implicit = tag = None; style = `Block }, 252 tok.span ) 253 | Token.Block_sequence_start when block -> 254 t.state <- Block_sequence_first_entry; 255 skip_token t; 256 ( Event.Sequence_start 257 { anchor; tag; implicit = tag = None; style = `Block }, 258 tok.span ) 259 | Token.Block_mapping_start when block -> 260 t.state <- Block_mapping_first_key; 261 skip_token t; 262 ( Event.Mapping_start 263 { anchor; tag; implicit = tag = None; style = `Block }, 264 tok.span ) 265 | Token.Flow_sequence_start -> 266 t.state <- Flow_sequence_first_entry; 267 skip_token t; 268 ( Event.Sequence_start 269 { anchor; tag; implicit = tag = None; style = `Flow }, 270 tok.span ) 271 | Token.Flow_mapping_start -> 272 t.state <- Flow_mapping_first_key; 273 skip_token t; 274 ( Event.Mapping_start 275 { anchor; tag; implicit = tag = None; style = `Flow }, 276 tok.span ) 277 | Token.Scalar { style; value } -> 278 skip_token t; 279 t.state <- pop_state t; 280 let plain_implicit = tag = None && style = `Plain in 281 let quoted_implicit = tag = None && style <> `Plain in 282 ( Event.Scalar 283 { anchor; tag; value; plain_implicit; quoted_implicit; style }, 284 tok.span ) 285 | _ -> 286 (* Empty node *) 287 t.state <- pop_state t; 288 empty_scalar_event ~anchor ~tag tok.span) 289 | Token.Block_sequence_start when block -> 290 t.state <- Block_sequence_first_entry; 291 skip_token t; 292 ( Event.Sequence_start 293 { anchor = None; tag = None; implicit = true; style = `Block }, 294 tok.span ) 295 | Token.Block_mapping_start when block -> 296 t.state <- Block_mapping_first_key; 297 skip_token t; 298 ( Event.Mapping_start 299 { anchor = None; tag = None; implicit = true; style = `Block }, 300 tok.span ) 301 | Token.Flow_sequence_start -> 302 t.state <- Flow_sequence_first_entry; 303 skip_token t; 304 ( Event.Sequence_start 305 { anchor = None; tag = None; implicit = true; style = `Flow }, 306 tok.span ) 307 | Token.Flow_mapping_start -> 308 t.state <- Flow_mapping_first_key; 309 skip_token t; 310 ( Event.Mapping_start 311 { anchor = None; tag = None; implicit = true; style = `Flow }, 312 tok.span ) 313 | Token.Block_entry when indentless -> 314 t.state <- Indentless_sequence_entry; 315 ( Event.Sequence_start 316 { anchor = None; tag = None; implicit = true; style = `Block }, 317 tok.span ) 318 | Token.Scalar { style; value } -> 319 skip_token t; 320 t.state <- pop_state t; 321 let plain_implicit = style = `Plain in 322 let quoted_implicit = style <> `Plain in 323 ( Event.Scalar 324 { 325 anchor = None; 326 tag = None; 327 value; 328 plain_implicit; 329 quoted_implicit; 330 style; 331 }, 332 tok.span ) 333 | _ -> 334 (* Empty node *) 335 t.state <- pop_state t; 336 empty_scalar_event ~anchor:None ~tag:None tok.span 337 338(** Parse block sequence entry *) 339let parse_block_sequence_entry t = 340 let tok = current_token t in 341 match tok.token with 342 | Token.Block_entry -> 343 skip_token t; 344 if 345 check t (function 346 | Token.Block_entry | Token.Block_end -> true 347 | _ -> false) 348 then begin 349 t.state <- Block_sequence_entry; 350 empty_scalar_event ~anchor:None ~tag:None tok.span 351 end 352 else begin 353 push_state t Block_sequence_entry; 354 parse_node t ~block:true ~indentless:false 355 end 356 | Token.Block_end -> 357 skip_token t; 358 t.state <- pop_state t; 359 (Event.Sequence_end, tok.span) 360 | _ -> Error.raise_span tok.span Expected_block_entry 361 362(** Parse block mapping key *) 363let parse_block_mapping_key t = 364 let tok = current_token t in 365 match tok.token with 366 | Token.Key -> 367 skip_token t; 368 if 369 check t (function 370 | Token.Key | Token.Value | Token.Block_end -> true 371 | _ -> false) 372 then begin 373 t.state <- Block_mapping_value; 374 empty_scalar_event ~anchor:None ~tag:None tok.span 375 end 376 else begin 377 push_state t Block_mapping_value; 378 parse_node t ~block:true ~indentless:true 379 end 380 (* Handle value without explicit key - key is empty/null *) 381 | Token.Value -> 382 t.state <- Block_mapping_value; 383 empty_scalar_event ~anchor:None ~tag:None tok.span 384 | Token.Block_end -> 385 skip_token t; 386 t.state <- pop_state t; 387 (Event.Mapping_end, tok.span) 388 | _ -> Error.raise_span tok.span Expected_key 389 390(** Parse block mapping value *) 391let parse_block_mapping_value t = 392 let tok = current_token t in 393 match tok.token with 394 | Token.Value -> 395 skip_token t; 396 if 397 check t (function 398 | Token.Key | Token.Value | Token.Block_end -> true 399 | _ -> false) 400 then begin 401 t.state <- Block_mapping_key; 402 empty_scalar_event ~anchor:None ~tag:None tok.span 403 end 404 else begin 405 push_state t Block_mapping_key; 406 parse_node t ~block:true ~indentless:true 407 end 408 | _ -> 409 (* Implicit empty value *) 410 t.state <- Block_mapping_key; 411 empty_scalar_event ~anchor:None ~tag:None tok.span 412 413(** Parse indentless sequence entry *) 414let parse_indentless_sequence_entry t = 415 let tok = current_token t in 416 match tok.token with 417 | Token.Block_entry -> 418 skip_token t; 419 if 420 check t (function 421 | Token.Block_entry | Token.Key | Token.Value | Token.Block_end -> 422 true 423 | _ -> false) 424 then begin 425 t.state <- Indentless_sequence_entry; 426 empty_scalar_event ~anchor:None ~tag:None tok.span 427 end 428 else begin 429 push_state t Indentless_sequence_entry; 430 parse_node t ~block:true ~indentless:false 431 end 432 | _ -> 433 t.state <- pop_state t; 434 (Event.Sequence_end, tok.span) 435 436(** Parse flow sequence *) 437let rec parse_flow_sequence_entry t ~first = 438 let tok = current_token t in 439 match tok.token with 440 | Token.Flow_sequence_end -> 441 skip_token t; 442 t.state <- pop_state t; 443 (Event.Sequence_end, tok.span) 444 | Token.Flow_entry when not first -> 445 skip_token t; 446 parse_flow_sequence_entry_internal t 447 | _ when first -> parse_flow_sequence_entry_internal t 448 | _ -> Error.raise_span tok.span Expected_sequence_end 449 450and parse_flow_sequence_entry_internal t = 451 let tok = current_token t in 452 match tok.token with 453 | Token.Flow_sequence_end -> 454 (* Trailing comma case - don't emit empty scalar, just go back to sequence entry state *) 455 skip_token t; 456 t.state <- pop_state t; 457 (Event.Sequence_end, tok.span) 458 | Token.Flow_entry -> 459 (* Double comma or comma after comma - invalid *) 460 Error.raise_span tok.span 461 (Unexpected_token "unexpected ',' in flow sequence") 462 | Token.Key -> 463 skip_token t; 464 t.state <- Flow_sequence_entry_mapping_key; 465 ( Event.Mapping_start 466 { anchor = None; tag = None; implicit = true; style = `Flow }, 467 tok.span ) 468 | Token.Value -> 469 (* Implicit empty key mapping: [ : value ] *) 470 t.state <- Flow_sequence_entry_mapping_key; 471 ( Event.Mapping_start 472 { anchor = None; tag = None; implicit = true; style = `Flow }, 473 tok.span ) 474 | _ -> 475 push_state t Flow_sequence_entry; 476 parse_node t ~block:false ~indentless:false 477 478(** Parse flow sequence entry mapping *) 479let parse_flow_sequence_entry_mapping_key t = 480 let tok = current_token t in 481 if 482 check t (function 483 | Token.Value | Token.Flow_entry | Token.Flow_sequence_end -> true 484 | _ -> false) 485 then begin 486 t.state <- Flow_sequence_entry_mapping_value; 487 empty_scalar_event ~anchor:None ~tag:None tok.span 488 end 489 else begin 490 push_state t Flow_sequence_entry_mapping_value; 491 parse_node t ~block:false ~indentless:false 492 end 493 494let parse_flow_sequence_entry_mapping_value t = 495 let tok = current_token t in 496 match tok.token with 497 | Token.Value -> 498 skip_token t; 499 if 500 check t (function 501 | Token.Flow_entry | Token.Flow_sequence_end -> true 502 | _ -> false) 503 then begin 504 t.state <- Flow_sequence_entry_mapping_end; 505 empty_scalar_event ~anchor:None ~tag:None tok.span 506 end 507 else begin 508 push_state t Flow_sequence_entry_mapping_end; 509 parse_node t ~block:false ~indentless:false 510 end 511 | _ -> 512 t.state <- Flow_sequence_entry_mapping_end; 513 empty_scalar_event ~anchor:None ~tag:None tok.span 514 515let parse_flow_sequence_entry_mapping_end t = 516 let tok = current_token t in 517 t.state <- Flow_sequence_entry; 518 (Event.Mapping_end, tok.span) 519 520(** Parse flow mapping *) 521let rec parse_flow_mapping_key t ~first = 522 let tok = current_token t in 523 match tok.token with 524 | Token.Flow_mapping_end -> 525 skip_token t; 526 t.state <- pop_state t; 527 (Event.Mapping_end, tok.span) 528 | Token.Flow_entry when not first -> 529 skip_token t; 530 parse_flow_mapping_key_internal t 531 | _ when first -> parse_flow_mapping_key_internal t 532 | _ -> Error.raise_span tok.span Expected_mapping_end 533 534and parse_flow_mapping_key_internal t = 535 let tok = current_token t in 536 match tok.token with 537 | Token.Flow_mapping_end -> 538 (* Trailing comma case - don't emit empty scalar, just return to key state *) 539 skip_token t; 540 t.state <- pop_state t; 541 (Event.Mapping_end, tok.span) 542 | Token.Flow_entry -> 543 (* Double comma or comma after comma - invalid *) 544 Error.raise_span tok.span 545 (Unexpected_token "unexpected ',' in flow mapping") 546 | Token.Key -> 547 skip_token t; 548 if 549 check t (function 550 | Token.Value | Token.Flow_entry | Token.Flow_mapping_end -> true 551 | _ -> false) 552 then begin 553 t.state <- Flow_mapping_value; 554 empty_scalar_event ~anchor:None ~tag:None tok.span 555 end 556 else begin 557 push_state t Flow_mapping_value; 558 parse_node t ~block:false ~indentless:false 559 end 560 | _ -> 561 push_state t Flow_mapping_value; 562 parse_node t ~block:false ~indentless:false 563 564let parse_flow_mapping_value t ~empty = 565 let tok = current_token t in 566 if empty then begin 567 t.state <- Flow_mapping_key; 568 empty_scalar_event ~anchor:None ~tag:None tok.span 569 end 570 else 571 match tok.token with 572 | Token.Value -> 573 skip_token t; 574 if 575 check t (function 576 | Token.Flow_entry | Token.Flow_mapping_end -> true 577 | _ -> false) 578 then begin 579 t.state <- Flow_mapping_key; 580 empty_scalar_event ~anchor:None ~tag:None tok.span 581 end 582 else begin 583 push_state t Flow_mapping_key; 584 parse_node t ~block:false ~indentless:false 585 end 586 | _ -> 587 t.state <- Flow_mapping_key; 588 empty_scalar_event ~anchor:None ~tag:None tok.span 589 590(** Main state machine dispatcher *) 591let rec parse t = 592 match t.state with 593 | Stream_start -> parse_stream_start t 594 | Implicit_document_start -> ( 595 (* Skip any document end markers before checking what's next *) 596 while check t (function Token.Document_end -> true | _ -> false) do 597 t.explicit_doc_end <- true; 598 (* Seeing ... counts as explicit end *) 599 skip_token t 600 done; 601 602 let tok = current_token t in 603 match tok.token with 604 | Token.Stream_end -> 605 skip_token t; 606 t.state <- End; 607 t.finished <- true; 608 (Event.Stream_end, tok.span) 609 | Token.Version_directive _ | Token.Tag_directive _ -> 610 (* Directives are only allowed at stream start or after explicit ... (MUS6/01) *) 611 if (not t.stream_start) && not t.explicit_doc_end then 612 Error.raise_span tok.span 613 (Invalid_directive 614 "directives require explicit document end '...' before them"); 615 parse_document_start t ~implicit:false 616 | Token.Document_start -> parse_document_start t ~implicit:false 617 (* These tokens are invalid at document start - they indicate leftover junk *) 618 | Token.Flow_sequence_end | Token.Flow_mapping_end | Token.Flow_entry 619 | Token.Block_end | Token.Value -> 620 Error.raise_span tok.span 621 (Unexpected_token "unexpected token at document start") 622 | _ -> parse_document_start t ~implicit:true) 623 624 | Document_content -> 625 if 626 check t (function 627 | Token.Version_directive _ | Token.Tag_directive _ 628 | Token.Document_start | Token.Document_end | Token.Stream_end -> 629 true 630 | _ -> false) 631 then begin 632 let tok = current_token t in 633 t.state <- pop_state t; 634 empty_scalar_event ~anchor:None ~tag:None tok.span 635 end 636 else begin 637 (* Push Document_content_done so we return there after parsing the node. 638 This allows us to check for unexpected content after the node. *) 639 push_state t Document_content_done; 640 parse_node t ~block:true ~indentless:false 641 end 642 | Document_content_done -> 643 (* After parsing a node in document content, check for unexpected content *) 644 if 645 check t (function 646 | Token.Version_directive _ | Token.Tag_directive _ 647 | Token.Document_start | Token.Document_end | Token.Stream_end -> 648 true 649 | _ -> false) 650 then begin 651 (* Valid document boundary - continue to Document_end *) 652 t.state <- pop_state t; 653 parse t (* Continue to emit the next event *) 654 end 655 else begin 656 (* Unexpected content after document value - this is an error (KS4U, BS4K) *) 657 let tok = current_token t in 658 Error.raise_span tok.span 659 (Unexpected_token "content not allowed after document value") 660 end 661 | Document_end -> parse_document_end t 662 663 | Block_sequence_first_entry -> 664 t.state <- Block_sequence_entry; 665 parse_block_sequence_entry t 666 | Block_sequence_entry -> parse_block_sequence_entry t 667 | Indentless_sequence_entry -> parse_indentless_sequence_entry t 668 | Block_mapping_first_key -> 669 t.state <- Block_mapping_key; 670 parse_block_mapping_key t 671 | Block_mapping_key -> parse_block_mapping_key t 672 | Block_mapping_value -> parse_block_mapping_value t 673 | Flow_sequence_first_entry -> parse_flow_sequence_entry t ~first:true 674 | Flow_sequence_entry -> parse_flow_sequence_entry t ~first:false 675 | Flow_sequence_entry_mapping_key -> parse_flow_sequence_entry_mapping_key t 676 | Flow_sequence_entry_mapping_value -> 677 parse_flow_sequence_entry_mapping_value t 678 | Flow_sequence_entry_mapping_end -> parse_flow_sequence_entry_mapping_end t 679 | Flow_mapping_first_key -> parse_flow_mapping_key t ~first:true 680 | Flow_mapping_key -> parse_flow_mapping_key t ~first:false 681 | Flow_mapping_value -> parse_flow_mapping_value t ~empty:false 682 683 | End -> 684 let span = Span.point Position.initial in 685 t.finished <- true; 686 (Event.Stream_end, span) 687 688(** Get next event *) 689let next t = 690 if t.finished then None 691 else 692 let event, span = parse t in 693 Some { Event.event; span } 694 695(** Iterate over all events *) 696let iter f t = 697 let rec loop () = 698 match next t with 699 | None -> () 700 | Some ev -> 701 f ev; 702 loop () 703 in 704 loop () 705 706(** Fold over all events *) 707let fold f init t = 708 let rec loop acc = 709 match next t with None -> acc | Some ev -> loop (f acc ev) 710 in 711 loop init 712 713(** Convert to list *) 714let to_list t = fold (fun acc ev -> ev :: acc) [] t |> List.rev