Pure OCaml Yaml 1.2 reader and writer using Bytesrw
at main 61 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(** YAML tokenizer/scanner with lookahead for ambiguity resolution *) 7 8type simple_key = { 9 sk_possible : bool; 10 sk_required : bool; 11 sk_token_number : int; 12 sk_position : Position.t; 13} 14(** Simple key tracking for mapping key disambiguation *) 15 16type indent = { 17 indent : int; 18 needs_block_end : bool; 19} 20(** Indent level tracking *) 21 22type t = { 23 input : Input.t; 24 tokens : Token.spanned Queue.t; 25 mutable token_number : int; 26 mutable tokens_taken : int; 27 mutable stream_started : bool; 28 mutable stream_ended : bool; 29 mutable indent_stack : indent list; (** Stack of indentation levels *) 30 mutable flow_level : int; (** Nesting depth in \[\] or \{\} *) 31 mutable flow_indent : int; 32 (** Column where outermost flow collection started *) 33 mutable simple_keys : simple_key option list; 34 (** Per flow-level simple key tracking *) 35 mutable allow_simple_key : bool; 36 mutable leading_whitespace : bool; 37 (** True when at start of line (only whitespace seen) *) 38 mutable document_has_content : bool; 39 (** True if we've emitted content tokens in current document *) 40 mutable adjacent_value_allowed_at : Position.t option; 41 (** Position where adjacent : is allowed *) 42 mutable flow_mapping_stack : bool list; 43 (** Stack of whether each flow level is a mapping *) 44} 45 46let create input = 47 { 48 input; 49 tokens = Queue.create (); 50 token_number = 0; 51 tokens_taken = 0; 52 stream_started = false; 53 stream_ended = false; 54 indent_stack = []; 55 flow_level = 0; 56 flow_indent = 0; 57 simple_keys = [ None ]; 58 (* One entry for the base level *) 59 allow_simple_key = true; 60 leading_whitespace = true; 61 (* Start at beginning of stream *) 62 document_has_content = false; 63 adjacent_value_allowed_at = None; 64 flow_mapping_stack = []; 65 } 66 67let of_string s = create (Input.of_string s) 68let of_input = create 69let of_reader r = create (Input.of_reader r) 70let position t = Input.position t.input 71 72(** Add a token to the queue *) 73let emit t span token = 74 Queue.add { Token.token; span } t.tokens; 75 t.token_number <- t.token_number + 1 76 77(** Get current column (1-indexed) *) 78let column t = (Input.position t.input).column 79 80(** Get current indent level *) 81let current_indent t = 82 match t.indent_stack with [] -> -1 | { indent; _ } :: _ -> indent 83 84(** Skip whitespace to end of line, checking for valid comments. Returns true if 85 any whitespace (including tabs) was found before a comment. *) 86let skip_whitespace_and_comment t = 87 let has_whitespace = ref false in 88 (* Skip blanks (spaces and tabs) *) 89 while Input.next_is_blank t.input do 90 has_whitespace := true; 91 ignore (Input.next t.input) 92 done; 93 (* Check for comment *) 94 if Input.next_is (( = ) '#') t.input then begin 95 (* Validate: comment must be preceded by whitespace or be at start of line *) 96 if not !has_whitespace then begin 97 (* Check if we're at the start of input or after whitespace (blank or line break) *) 98 match Input.peek_back t.input with 99 | None -> () (* Start of input - OK *) 100 | Some c when Input.is_whitespace c -> () (* After whitespace - OK *) 101 | _ -> 102 (* Comment not preceded by whitespace - ERROR *) 103 Error.raise_at (Input.mark t.input) Invalid_comment 104 end; 105 (* Skip to end of line *) 106 while (not (Input.is_eof t.input)) && not (Input.next_is_break t.input) do 107 ignore (Input.next t.input) 108 done 109 end 110 111(** Skip blanks (spaces/tabs) and return (found_tabs, found_spaces) *) 112let skip_blanks_check_tabs t = 113 let found_tab = ref false in 114 let found_space = ref false in 115 while Input.next_is_blank t.input do 116 (match Input.peek t.input with 117 | Some '\t' -> found_tab := true 118 | Some ' ' -> found_space := true 119 | _ -> ()); 120 ignore (Input.next t.input) 121 done; 122 (!found_tab, !found_space) 123 124(** Skip whitespace and comments, return true if at newline *) 125let rec skip_to_next_token t = 126 (* Check for tabs used as indentation in block context *) 127 (match Input.peek t.input with 128 | Some '\t' 129 when t.flow_level = 0 && t.leading_whitespace 130 && column t - 1 < current_indent t -> 131 (* Tab found in indentation zone - this is invalid *) 132 (* Skip to end of line to check if line has content *) 133 let start_pos = Input.mark t.input in 134 while Input.next_is_blank t.input do 135 ignore (Input.next t.input) 136 done; 137 (* If we have content on this line with a tab, raise error *) 138 if (not (Input.next_is_break t.input)) && not (Input.is_eof t.input) then 139 Error.raise_at start_pos Tab_in_indentation 140 | _ -> ()); 141 142 (* Skip blanks and validate comments *) 143 skip_whitespace_and_comment t; 144 (* Skip line break in block context *) 145 if t.flow_level = 0 && Input.next_is_break t.input then begin 146 Input.consume_break t.input; 147 t.allow_simple_key <- true; 148 t.leading_whitespace <- true; 149 skip_to_next_token t 150 end 151 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin 152 (* In flow context, skip all whitespace including line breaks *) 153 if Input.next_is_break t.input then begin 154 Input.consume_break t.input; 155 (* Allow simple keys after line breaks in flow context *) 156 t.allow_simple_key <- true; 157 (* After line break in flow, check for tabs at start of line (Y79Y/03) 158 Tabs are not allowed as indentation - if tab is first char and results 159 in a column less than flow_indent, it's an error *) 160 if Input.next_is (( = ) '\t') t.input then begin 161 (* Tab at start of line in flow context - skip tabs and check position *) 162 let start_mark = Input.mark t.input in 163 while Input.next_is (( = ) '\t') t.input do 164 ignore (Input.next t.input) 165 done; 166 (* If only tabs were used (no spaces) and column < flow_indent, error *) 167 if 168 (not (Input.next_is_break t.input)) 169 && (not (Input.is_eof t.input)) 170 && column t < t.flow_indent 171 then Error.raise_at start_mark Invalid_flow_indentation 172 end; 173 skip_to_next_token t 174 end 175 else begin 176 ignore (Input.next t.input); 177 skip_to_next_token t 178 end 179 end 180 181(** Roll the indentation level *) 182let roll_indent t col = 183 if t.flow_level = 0 && col > current_indent t then begin 184 t.indent_stack <- { indent = col; needs_block_end = true } :: t.indent_stack; 185 true 186 end 187 else false 188 189(** Unroll indentation to given column *) 190let unroll_indent t col = 191 while 192 t.flow_level = 0 193 && 194 match t.indent_stack with 195 | { indent; needs_block_end = true; _ } :: _ when indent > col -> true 196 | _ -> false 197 do 198 match t.indent_stack with 199 | { indent = _; needs_block_end = true; _ } :: rest -> 200 let pos = Input.position t.input in 201 let span = Span.point pos in 202 emit t span Token.Block_end; 203 t.indent_stack <- rest 204 | _ -> () 205 done 206 207(** Save a potential simple key *) 208let save_simple_key t = 209 if t.allow_simple_key then begin 210 (* A simple key is required only if we're in a block context, 211 at the current indentation level, AND the current indent needs a block end. 212 This matches saphyr's logic and prevents false positives for values. *) 213 let required = 214 t.flow_level = 0 215 && 216 match t.indent_stack with 217 | { indent; needs_block_end = true; _ } :: _ -> indent = column t 218 | _ -> false 219 in 220 let sk = 221 { 222 sk_possible = true; 223 sk_required = required; 224 sk_token_number = t.token_number; 225 sk_position = Input.position t.input; 226 } 227 in 228 (* Remove any existing simple key at current level *) 229 t.simple_keys <- 230 (match t.simple_keys with 231 | _ :: rest -> Some sk :: rest 232 | [] -> [ Some sk ]) 233 end 234 235(** Remove simple key at current level *) 236let remove_simple_key t = 237 match t.simple_keys with 238 | Some sk :: _rest when sk.sk_required -> 239 Error.raise_at sk.sk_position Expected_key 240 | _ :: rest -> t.simple_keys <- None :: rest 241 | [] -> () 242 243(** Stale simple keys that span too many tokens *) 244let stale_simple_keys t = 245 t.simple_keys <- 246 List.map 247 (fun sk_opt -> 248 match sk_opt with 249 | Some sk 250 when sk.sk_possible 251 && (Input.position t.input).line > sk.sk_position.line 252 && t.flow_level = 0 -> 253 if sk.sk_required then Error.raise_at sk.sk_position Expected_key; 254 None 255 | _ -> sk_opt) 256 t.simple_keys 257 258(** Read anchor or alias name *) 259let scan_anchor_alias t = 260 let start = Input.mark t.input in 261 let buf = Buffer.create 16 in 262 (* Per YAML 1.2 spec: anchor names can contain any character that is NOT: 263 - Whitespace (space, tab, line breaks) 264 - Flow indicators: []{} 265 - Comma (,) 266 This matches the saphyr implementation: is_yaml_non_space && !is_flow *) 267 while 268 match Input.peek t.input with 269 | Some c 270 when (not (Input.is_whitespace c)) 271 && (not (Input.is_flow_indicator c)) 272 && c <> '\x00' -> 273 Buffer.add_char buf c; 274 ignore (Input.next t.input); 275 true 276 | _ -> false 277 do 278 () 279 done; 280 let name = Buffer.contents buf in 281 if String.length name = 0 then 282 Error.raise_at start (Invalid_anchor "empty anchor name"); 283 (name, Span.make ~start ~stop:(Input.mark t.input)) 284 285(** Scan tag handle *) 286let scan_tag_handle t = 287 let start = Input.mark t.input in 288 let buf = Buffer.create 16 in 289 (* Expect ! *) 290 (match Input.peek t.input with 291 | Some '!' -> 292 Buffer.add_char buf '!'; 293 ignore (Input.next t.input) 294 | _ -> Error.raise_at start (Invalid_tag "expected '!'")); 295 (* Read word chars *) 296 while 297 match Input.peek t.input with 298 | Some c when Input.is_alnum c || c = '-' -> 299 Buffer.add_char buf c; 300 ignore (Input.next t.input); 301 true 302 | _ -> false 303 do 304 () 305 done; 306 (* Check for secondary ! *) 307 (match Input.peek t.input with 308 | Some '!' -> 309 Buffer.add_char buf '!'; 310 ignore (Input.next t.input) 311 | _ -> ()); 312 Buffer.contents buf 313 314(** Scan tag suffix (after handle) *) 315let scan_tag_suffix t = 316 let is_hex_digit c = 317 (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f') 318 in 319 let hex_val c = 320 match c with 321 | '0' .. '9' -> Char.code c - Char.code '0' 322 | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 323 | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 324 | _ -> 0 325 in 326 let buf = Buffer.create 32 in 327 while 328 match Input.peek t.input with 329 | Some '%' -> ( 330 (* Percent-encoded character *) 331 ignore (Input.next t.input); 332 match (Input.peek t.input, Input.peek_nth t.input 1) with 333 | Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 -> 334 ignore (Input.next t.input); 335 ignore (Input.next t.input); 336 let code = (hex_val c1 * 16) + hex_val c2 in 337 Buffer.add_char buf (Char.chr code); 338 true 339 | _ -> 340 (* Invalid percent encoding - keep the % *) 341 Buffer.add_char buf '%'; 342 true) 343 | Some c 344 when (not (Input.is_whitespace c)) && not (Input.is_flow_indicator c) -> 345 Buffer.add_char buf c; 346 ignore (Input.next t.input); 347 true 348 | _ -> false 349 do 350 () 351 done; 352 Buffer.contents buf 353 354(** Scan a tag *) 355let scan_tag t = 356 let start = Input.mark t.input in 357 ignore (Input.next t.input); 358 (* consume ! *) 359 let handle, suffix = 360 match Input.peek t.input with 361 | Some '<' -> 362 (* Verbatim tag: !<...> - handle is empty, suffix is full URI *) 363 ignore (Input.next t.input); 364 let buf = Buffer.create 32 in 365 while 366 match Input.peek t.input with 367 | Some '>' -> false 368 | Some c -> 369 Buffer.add_char buf c; 370 ignore (Input.next t.input); 371 true 372 | None -> 373 Error.raise_at (Input.mark t.input) 374 (Invalid_tag "unclosed verbatim tag") 375 do 376 () 377 done; 378 ignore (Input.next t.input); 379 (* consume > *) 380 ("", Buffer.contents buf) 381 | Some c when Input.is_whitespace c || Input.is_flow_indicator c -> 382 (* Non-specific tag: ! *) 383 ("!", "") 384 | Some '!' -> 385 (* Secondary handle: !! *) 386 ignore (Input.next t.input); 387 (* consume second ! *) 388 let suffix = scan_tag_suffix t in 389 ("!!", suffix) 390 | _ -> ( 391 (* Primary handle or just suffix: !foo or !e!foo *) 392 (* Read alphanumeric characters *) 393 let buf = Buffer.create 16 in 394 while 395 match Input.peek t.input with 396 | Some c when Input.is_alnum c || c = '-' -> 397 Buffer.add_char buf c; 398 ignore (Input.next t.input); 399 true 400 | _ -> false 401 do 402 () 403 done; 404 (* Check if next character is ! - if so, this is a named handle *) 405 match Input.peek t.input with 406 | Some '!' -> 407 (* Named handle like !e! *) 408 ignore (Input.next t.input); 409 let handle_name = Buffer.contents buf in 410 let suffix = scan_tag_suffix t in 411 ("!" ^ handle_name ^ "!", suffix) 412 | _ -> 413 (* Just ! followed by suffix *) 414 ("!", Buffer.contents buf ^ scan_tag_suffix t)) 415 in 416 (* Validate that tag is followed by whitespace, break, or (in flow) flow indicator *) 417 (match Input.peek t.input with 418 | None -> () (* EOF is ok *) 419 | Some c when Input.is_whitespace c || Input.is_break c -> () 420 | Some c when t.flow_level > 0 && Input.is_flow_indicator c -> () 421 | _ -> 422 Error.raise_at start 423 (Invalid_tag "expected whitespace or line break after tag")); 424 let span = Span.make ~start ~stop:(Input.mark t.input) in 425 (handle, suffix, span) 426 427(** Scan single-quoted scalar *) 428let scan_single_quoted t = 429 let start = Input.mark t.input in 430 ignore (Input.next t.input); 431 (* consume opening single-quote *) 432 let buf = Buffer.create 64 in 433 let whitespace = Buffer.create 16 in 434 (* Track trailing whitespace *) 435 436 let flush_whitespace () = 437 if Buffer.length whitespace > 0 then begin 438 Buffer.add_buffer buf whitespace; 439 Buffer.clear whitespace 440 end 441 in 442 443 let rec loop () = 444 match Input.peek t.input with 445 | None -> Error.raise_at start Unclosed_single_quote 446 | Some '\'' -> ( 447 ignore (Input.next t.input); 448 (* Check for escaped quote ('') *) 449 match Input.peek t.input with 450 | Some '\'' -> 451 flush_whitespace (); 452 Buffer.add_char buf '\''; 453 ignore (Input.next t.input); 454 loop () 455 | _ -> 456 (* End of string - flush any trailing whitespace *) 457 flush_whitespace ()) 458 | Some ' ' | Some '\t' -> 459 (* Track whitespace - don't add to buf yet *) 460 Buffer.add_char whitespace (Option.get (Input.peek t.input)); 461 ignore (Input.next t.input); 462 loop () 463 | Some '\n' | Some '\r' -> 464 (* Discard trailing whitespace before line break *) 465 Buffer.clear whitespace; 466 Input.consume_break t.input; 467 (* Skip leading whitespace on next line *) 468 while Input.next_is_blank t.input do 469 ignore (Input.next t.input) 470 done; 471 (* Check for document boundary *) 472 if Input.at_document_boundary t.input then 473 Error.raise_at start Unclosed_single_quote; 474 (* Check indentation: continuation must be > block indent (QB6E, DK95) *) 475 let col = column t in 476 let indent = current_indent t in 477 if 478 (not (Input.is_eof t.input)) 479 && (not (Input.next_is_break t.input)) 480 && col <= indent && indent >= 0 481 then 482 Error.raise_at (Input.mark t.input) 483 (Invalid_quoted_scalar_indentation 484 "invalid indentation in quoted scalar"); 485 (* Count empty lines (consecutive line breaks) *) 486 let empty_lines = ref 0 in 487 while Input.next_is_break t.input do 488 incr empty_lines; 489 Input.consume_break t.input; 490 while Input.next_is_blank t.input do 491 ignore (Input.next t.input) 492 done; 493 if Input.at_document_boundary t.input then 494 Error.raise_at start Unclosed_single_quote; 495 (* Check indentation after each empty line too *) 496 let col = column t in 497 let indent = current_indent t in 498 if 499 (not (Input.is_eof t.input)) 500 && (not (Input.next_is_break t.input)) 501 && col <= indent && indent >= 0 502 then 503 Error.raise_at (Input.mark t.input) 504 (Invalid_quoted_scalar_indentation 505 "invalid indentation in quoted scalar") 506 done; 507 (* Apply folding rules *) 508 if !empty_lines > 0 then begin 509 (* Empty lines: preserve as newlines *) 510 for _ = 1 to !empty_lines do 511 Buffer.add_char buf '\n' 512 done 513 end 514 else 515 (* Single break: fold to space (even at start of string) *) 516 Buffer.add_char buf ' '; 517 loop () 518 | Some c -> 519 flush_whitespace (); 520 Buffer.add_char buf c; 521 ignore (Input.next t.input); 522 loop () 523 in 524 loop (); 525 let span = Span.make ~start ~stop:(Input.mark t.input) in 526 (Buffer.contents buf, span) 527 528(** Decode hex escape of given length *) 529let decode_hex t len = 530 let start = Input.mark t.input in 531 let buf = Buffer.create len in 532 for _ = 1 to len do 533 match Input.peek t.input with 534 | Some c when Input.is_hex c -> 535 Buffer.add_char buf c; 536 ignore (Input.next t.input) 537 | _ -> Error.raise_at start (Invalid_hex_escape (Buffer.contents buf)) 538 done; 539 let code = int_of_string ("0x" ^ Buffer.contents buf) in 540 if code <= 0x7F then String.make 1 (Char.chr code) 541 else if code <= 0x7FF then 542 let b1 = 0xC0 lor (code lsr 6) in 543 let b2 = 0x80 lor (code land 0x3F) in 544 String.init 2 (fun i -> Char.chr (if i = 0 then b1 else b2)) 545 else if code <= 0xFFFF then 546 let b1 = 0xE0 lor (code lsr 12) in 547 let b2 = 0x80 lor ((code lsr 6) land 0x3F) in 548 let b3 = 0x80 lor (code land 0x3F) in 549 String.init 3 (fun i -> 550 Char.chr (match i with 0 -> b1 | 1 -> b2 | _ -> b3)) 551 else 552 let b1 = 0xF0 lor (code lsr 18) in 553 let b2 = 0x80 lor ((code lsr 12) land 0x3F) in 554 let b3 = 0x80 lor ((code lsr 6) land 0x3F) in 555 let b4 = 0x80 lor (code land 0x3F) in 556 String.init 4 (fun i -> 557 Char.chr (match i with 0 -> b1 | 1 -> b2 | 2 -> b3 | _ -> b4)) 558 559(** Scan double-quoted scalar *) 560let scan_double_quoted t = 561 let start = Input.mark t.input in 562 ignore (Input.next t.input); 563 (* consume opening double-quote *) 564 let buf = Buffer.create 64 in 565 let whitespace = Buffer.create 16 in 566 (* Track pending whitespace *) 567 568 let flush_whitespace () = 569 if Buffer.length whitespace > 0 then begin 570 Buffer.add_buffer buf whitespace; 571 Buffer.clear whitespace 572 end 573 in 574 575 let rec loop () = 576 match Input.peek t.input with 577 | None -> Error.raise_at start Unclosed_double_quote 578 | Some '"' -> 579 (* Flush trailing whitespace before closing quote to preserve it *) 580 flush_whitespace (); 581 ignore (Input.next t.input) 582 | (Some ' ' | Some '\t') as c_opt -> 583 (* Track whitespace - don't add to buf yet *) 584 let c = match c_opt with Some c -> c | None -> assert false in 585 Buffer.add_char whitespace c; 586 ignore (Input.next t.input); 587 loop () 588 | Some '\\' -> 589 (* Escape sequence - this is non-whitespace content *) 590 flush_whitespace (); 591 (* Commit any pending whitespace *) 592 ignore (Input.next t.input); 593 (match Input.peek t.input with 594 | None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>") 595 | Some '0' -> 596 Buffer.add_char buf '\x00'; 597 ignore (Input.next t.input) 598 | Some 'a' -> 599 Buffer.add_char buf '\x07'; 600 ignore (Input.next t.input) 601 | Some 'b' -> 602 Buffer.add_char buf '\x08'; 603 ignore (Input.next t.input) 604 | Some 't' | Some '\t' -> 605 Buffer.add_char buf '\t'; 606 ignore (Input.next t.input) 607 | Some 'n' -> 608 Buffer.add_char buf '\n'; 609 ignore (Input.next t.input) 610 | Some 'v' -> 611 Buffer.add_char buf '\x0B'; 612 ignore (Input.next t.input) 613 | Some 'f' -> 614 Buffer.add_char buf '\x0C'; 615 ignore (Input.next t.input) 616 | Some 'r' -> 617 Buffer.add_char buf '\r'; 618 ignore (Input.next t.input) 619 | Some 'e' -> 620 Buffer.add_char buf '\x1B'; 621 ignore (Input.next t.input) 622 | Some ' ' -> 623 Buffer.add_char buf ' '; 624 ignore (Input.next t.input) 625 | Some '"' -> 626 Buffer.add_char buf '"'; 627 ignore (Input.next t.input) 628 | Some '/' -> 629 Buffer.add_char buf '/'; 630 ignore (Input.next t.input) 631 | Some '\\' -> 632 Buffer.add_char buf '\\'; 633 ignore (Input.next t.input) 634 | Some 'N' -> 635 Buffer.add_string buf "\xC2\x85"; 636 ignore (Input.next t.input) (* NEL *) 637 | Some '_' -> 638 Buffer.add_string buf "\xC2\xA0"; 639 ignore (Input.next t.input) (* NBSP *) 640 | Some 'L' -> 641 Buffer.add_string buf "\xE2\x80\xA8"; 642 ignore (Input.next t.input) (* LS *) 643 | Some 'P' -> 644 Buffer.add_string buf "\xE2\x80\xA9"; 645 ignore (Input.next t.input) (* PS *) 646 | Some 'x' -> 647 ignore (Input.next t.input); 648 Buffer.add_string buf (decode_hex t 2) 649 | Some 'u' -> 650 ignore (Input.next t.input); 651 Buffer.add_string buf (decode_hex t 4) 652 | Some 'U' -> 653 ignore (Input.next t.input); 654 Buffer.add_string buf (decode_hex t 8) 655 | Some '\n' | Some '\r' -> 656 (* Line continuation escape *) 657 Input.consume_break t.input; 658 while Input.next_is_blank t.input do 659 ignore (Input.next t.input) 660 done 661 | Some c -> 662 Error.raise_at (Input.mark t.input) 663 (Invalid_escape_sequence (Printf.sprintf "\\%c" c))); 664 loop () 665 | Some '\n' | Some '\r' -> 666 (* Line break: discard any pending trailing whitespace *) 667 Buffer.clear whitespace; 668 Input.consume_break t.input; 669 (* Count consecutive line breaks (empty lines) *) 670 let empty_lines = ref 0 in 671 let continue = ref true in 672 let started_with_tab = ref false in 673 while !continue do 674 (* Track if we start with a tab (for DK95/01 check) *) 675 if Input.next_is (( = ) '\t') t.input then started_with_tab := true; 676 (* Skip blanks (spaces/tabs) on the line *) 677 while Input.next_is_blank t.input do 678 ignore (Input.next t.input) 679 done; 680 (* Check if we hit another line break (empty line) *) 681 if Input.next_is_break t.input then begin 682 Input.consume_break t.input; 683 incr empty_lines; 684 started_with_tab := false (* Reset for next line *) 685 end 686 else continue := false 687 done; 688 (* Check for document boundary - this terminates the quoted string *) 689 if Input.at_document_boundary t.input then 690 Error.raise_at start Unclosed_double_quote; 691 (* Check indentation: continuation must be > block indent (QB6E, DK95) 692 Note: must be strictly greater than block indent, not just equal *) 693 let col = column t in 694 let indent = current_indent t in 695 let start_col = start.column in 696 (* DK95/01: if continuation started with tabs and column < start column, error *) 697 if (not (Input.is_eof t.input)) && !started_with_tab && col < start_col 698 then 699 Error.raise_at (Input.mark t.input) 700 (Invalid_quoted_scalar_indentation 701 "invalid indentation in quoted scalar"); 702 if (not (Input.is_eof t.input)) && col <= indent && indent >= 0 then 703 Error.raise_at (Input.mark t.input) 704 (Invalid_quoted_scalar_indentation 705 "invalid indentation in quoted scalar"); 706 (* Per YAML spec: single break = space, break + empty lines = newlines *) 707 if !empty_lines > 0 then begin 708 (* Empty lines: output N newlines where N = number of empty lines *) 709 for _ = 1 to !empty_lines do 710 Buffer.add_char buf '\n' 711 done 712 end 713 else 714 (* Single break folds to space *) 715 Buffer.add_char buf ' '; 716 loop () 717 | Some c -> 718 (* Non-whitespace character *) 719 flush_whitespace (); 720 (* Commit any pending whitespace *) 721 Buffer.add_char buf c; 722 ignore (Input.next t.input); 723 loop () 724 in 725 loop (); 726 let span = Span.make ~start ~stop:(Input.mark t.input) in 727 (Buffer.contents buf, span) 728 729(** Check if character can appear in plain scalar at this position *) 730let can_continue_plain t c ~in_flow = 731 match c with 732 | ':' -> ( 733 (* : is OK if not followed by whitespace or flow indicator *) 734 match Input.peek_nth t.input 1 with 735 | None -> true 736 | Some c2 when Input.is_whitespace c2 -> false 737 | Some c2 when in_flow && Input.is_flow_indicator c2 -> false 738 | _ -> true) 739 | '#' -> ( 740 (* # is a comment indicator only if preceded by whitespace *) 741 (* Check the previous character to determine if this is a comment *) 742 match Input.peek_back t.input with 743 | None -> true (* At start - can't be comment indicator, allow it *) 744 | Some c when Input.is_whitespace c -> 745 false (* Preceded by whitespace - comment *) 746 | Some c when Input.is_break c -> false (* At start of line - comment *) 747 | _ -> true (* Not preceded by whitespace - part of scalar *)) 748 | c when in_flow && Input.is_flow_indicator c -> false 749 | _ when Input.is_break c -> false 750 | _ -> true 751 752(** Scan plain scalar *) 753let scan_plain_scalar t = 754 let start = Input.mark t.input in 755 let in_flow = t.flow_level > 0 in 756 let indent = current_indent t in 757 (* In flow context, scalars must be indented more than the current block indent. 758 This ensures that content at block indent or less ends the flow context. *) 759 if in_flow && column t - 1 < indent then 760 Error.raise_at start Invalid_flow_indentation; 761 let buf = Buffer.create 64 in 762 let spaces = Buffer.create 16 in 763 let whitespace = Buffer.create 16 in 764 (* Track whitespace within a line *) 765 let leading_blanks = ref false in 766 767 let rec scan_line () = 768 match Input.peek t.input with 769 | None -> () 770 | Some c when Input.is_blank c && can_continue_plain t c ~in_flow -> 771 (* Blank character within a line - save to whitespace buffer *) 772 Buffer.add_char whitespace c; 773 ignore (Input.next t.input); 774 scan_line () 775 | Some c when can_continue_plain t c ~in_flow -> 776 (* Non-blank character - process any pending breaks/whitespace first *) 777 begin 778 if Buffer.length spaces > 0 then begin 779 if !leading_blanks then begin 780 (* Fold line break *) 781 if Buffer.contents spaces = "\n" then Buffer.add_char buf ' ' 782 else begin 783 (* Multiple breaks - preserve all but first *) 784 let s = Buffer.contents spaces in 785 Buffer.add_substring buf s 1 (String.length s - 1) 786 end 787 end 788 else Buffer.add_buffer buf spaces; 789 Buffer.clear spaces 790 end; 791 (* Add any pending whitespace from within the line *) 792 if Buffer.length whitespace > 0 then begin 793 Buffer.add_buffer buf whitespace; 794 Buffer.clear whitespace 795 end; 796 (* Add the character *) 797 Buffer.add_char buf c; 798 ignore (Input.next t.input); 799 leading_blanks := false; 800 scan_line () 801 end 802 | _ -> () 803 in 804 805 let rec scan_lines () = 806 scan_line (); 807 (* Check for line continuation *) 808 if Input.next_is_break t.input then begin 809 (* Discard any trailing whitespace from the current line *) 810 Buffer.clear whitespace; 811 (* Save the line break *) 812 if !leading_blanks then begin 813 (* We already had a break - this is an additional break (empty line) *) 814 Buffer.add_char spaces '\n' 815 end 816 else begin 817 (* First line break *) 818 Buffer.clear spaces; 819 Buffer.add_char spaces '\n'; 820 leading_blanks := true 821 end; 822 Input.consume_break t.input; 823 (* Note: We do NOT set allow_simple_key here during plain scalar scanning. 824 Setting it here would incorrectly allow ':' that appears on a continuation 825 line to become a mapping indicator. The flag will be set properly after 826 the scalar ends and skip_to_next_token processes line breaks. *) 827 (* Skip leading blanks on the next line *) 828 while Input.next_is_blank t.input do 829 ignore (Input.next t.input) 830 done; 831 let col = (Input.position t.input).column in 832 (* Check indentation - stop if we're at or before the containing block's indent *) 833 (* However, allow empty lines (line breaks) to continue even if dedented *) 834 if Input.next_is_break t.input then 835 scan_lines () (* Empty line - continue *) 836 else if (not in_flow) && col <= indent then () 837 (* Stop - dedented or at parent level in block context *) 838 else if Input.at_document_boundary t.input then () 839 (* Stop - document boundary *) 840 else scan_lines () 841 end 842 in 843 844 scan_lines (); 845 let value = Buffer.contents buf in 846 (* Trim trailing whitespace (spaces and tabs) *) 847 let value = 848 let len = String.length value in 849 let rec find_end i = 850 if i < 0 then 0 851 else match value.[i] with ' ' | '\t' -> find_end (i - 1) | _ -> i + 1 852 in 853 let end_pos = find_end (len - 1) in 854 String.sub value 0 end_pos 855 in 856 let span = Span.make ~start ~stop:(Input.mark t.input) in 857 (* Return value, span, and whether we ended with leading blanks (crossed a line break) *) 858 (value, span, !leading_blanks) 859 860(** Scan block scalar (literal | or folded >) *) 861let scan_block_scalar t literal = 862 let start = Input.mark t.input in 863 ignore (Input.next t.input); 864 865 (* consume | or > *) 866 867 (* Parse header: optional indentation indicator and chomping *) 868 let explicit_indent = ref None in 869 let chomping = ref Chomping.Clip in 870 871 (* First character of header *) 872 (match Input.peek t.input with 873 | Some c when Input.is_digit c && c <> '0' -> 874 explicit_indent := Some (Char.code c - Char.code '0'); 875 ignore (Input.next t.input) 876 | Some '-' -> 877 chomping := Chomping.Strip; 878 ignore (Input.next t.input) 879 | Some '+' -> 880 chomping := Chomping.Keep; 881 ignore (Input.next t.input) 882 | _ -> ()); 883 884 (* Second character of header *) 885 (match Input.peek t.input with 886 | Some c when Input.is_digit c && c <> '0' && !explicit_indent = None -> 887 explicit_indent := Some (Char.code c - Char.code '0'); 888 ignore (Input.next t.input) 889 | Some '-' when !chomping = Chomping.Clip -> 890 chomping := Chomping.Strip; 891 ignore (Input.next t.input) 892 | Some '+' when !chomping = Chomping.Clip -> 893 chomping := Chomping.Keep; 894 ignore (Input.next t.input) 895 | _ -> ()); 896 897 (* Skip whitespace and optional comment *) 898 skip_whitespace_and_comment t; 899 900 (* Consume line break *) 901 if Input.next_is_break t.input then Input.consume_break t.input 902 else if not (Input.is_eof t.input) then 903 Error.raise_at (Input.mark t.input) 904 (Invalid_block_scalar_header "expected newline after header"); 905 906 let base_indent = current_indent t in 907 (* base_indent is the indent level from the stack, -1 if empty. 908 It's used directly for comparisons in implicit indent case. *) 909 let content_indent = 910 ref 911 (match !explicit_indent with 912 | Some n -> 913 (* Explicit indent: base_indent is 1-indexed column, convert to 0-indexed. 914 content_indent = (base_indent - 1) + n, but at least n for document level. *) 915 let base_level = max 0 (base_indent - 1) in 916 base_level + n 917 | None -> 0 (* Will be determined by first non-empty line *)) 918 in 919 920 let buf = Buffer.create 256 in 921 let trailing_breaks = Buffer.create 16 in 922 let leading_blank = ref false in 923 (* Was the previous line "more indented"? *) 924 let max_empty_line_indent = ref 0 in 925 (* Track max indent of empty lines before first content *) 926 927 (* Skip to content indentation, skipping empty lines. 928 Returns the number of spaces actually skipped (important for detecting dedentation). *) 929 let rec skip_to_content_indent () = 930 if !content_indent > 0 then begin 931 (* Explicit indent - skip up to content_indent spaces *) 932 let spaces_skipped = ref 0 in 933 while 934 !spaces_skipped < !content_indent && Input.next_is (( = ) ' ') t.input 935 do 936 incr spaces_skipped; 937 ignore (Input.next t.input) 938 done; 939 940 (* Check if this line is empty (only spaces/tabs until break/eof) *) 941 if Input.next_is_break t.input then begin 942 (* Empty line - record the break and continue *) 943 Buffer.add_char trailing_breaks '\n'; 944 Input.consume_break t.input; 945 skip_to_content_indent () 946 end 947 else if !spaces_skipped < !content_indent then begin 948 (* Line starts with fewer spaces than content_indent - dedented *) 949 !spaces_skipped 950 end 951 else if Input.next_is_blank t.input then begin 952 (* Line has spaces/tabs beyond content_indent - could be whitespace content or empty line. 953 For literal scalars, whitespace-only lines ARE content (not empty). 954 For folded scalars, whitespace-only lines that are "more indented" are preserved. *) 955 if literal then 956 (* Literal: whitespace beyond content_indent is content, let read_lines handle it *) 957 !content_indent 958 else begin 959 (* Folded: check if rest is only blanks *) 960 let idx = ref 0 in 961 while 962 match Input.peek_nth t.input !idx with 963 | Some c when Input.is_blank c -> 964 incr idx; 965 true 966 | _ -> false 967 do 968 () 969 done; 970 match Input.peek_nth t.input !idx with 971 | None | Some '\n' | Some '\r' -> 972 (* Empty/whitespace-only line in folded - skip spaces *) 973 while Input.next_is_blank t.input do 974 ignore (Input.next t.input) 975 done; 976 Buffer.add_char trailing_breaks '\n'; 977 Input.consume_break t.input; 978 skip_to_content_indent () 979 | _ -> 980 (* Has non-whitespace content *) 981 !content_indent 982 end 983 end 984 else !content_indent 985 end 986 else begin 987 (* Implicit indent - skip empty lines without consuming spaces. 988 Note: Only SPACES count as indentation. Tabs are content, not indentation. 989 So we only check for spaces when determining if a line is "empty". *) 990 if Input.next_is_break t.input then begin 991 Buffer.add_char trailing_breaks '\n'; 992 Input.consume_break t.input; 993 skip_to_content_indent () 994 end 995 else if Input.next_is (( = ) ' ') t.input then begin 996 (* Check if line is empty (only spaces before break) *) 997 let idx = ref 0 in 998 while 999 match Input.peek_nth t.input !idx with 1000 | Some ' ' -> 1001 incr idx; 1002 true 1003 | _ -> false 1004 do 1005 () 1006 done; 1007 match Input.peek_nth t.input !idx with 1008 | None | Some '\n' | Some '\r' -> 1009 (* Line has only spaces - empty line *) 1010 (* Track max indent of empty lines for later validation *) 1011 if !idx > !max_empty_line_indent then max_empty_line_indent := !idx; 1012 while Input.next_is (( = ) ' ') t.input do 1013 ignore (Input.next t.input) 1014 done; 1015 Buffer.add_char trailing_breaks '\n'; 1016 Input.consume_break t.input; 1017 skip_to_content_indent () 1018 | _ -> 1019 (* Has content (including tabs which are content, not indentation) *) 1020 0 1021 end 1022 else if Input.next_is (( = ) '\t') t.input then begin 1023 (* Tab at start of line in implicit indent mode - this is an error (Y79Y) 1024 because tabs cannot be used as indentation in YAML *) 1025 Error.raise_at (Input.mark t.input) Tab_in_indentation 1026 end 1027 else 1028 (* Not at break or space - other content character *) 1029 0 1030 end 1031 in 1032 1033 (* Read content *) 1034 let rec read_lines () = 1035 let spaces_skipped = skip_to_content_indent () in 1036 1037 (* Check if we're at content *) 1038 if Input.is_eof t.input then () 1039 else if Input.at_document_boundary t.input then () 1040 else begin 1041 (* Count additional leading spaces beyond what was skipped *) 1042 let extra_spaces = ref 0 in 1043 while Input.next_is (( = ) ' ') t.input do 1044 incr extra_spaces; 1045 ignore (Input.next t.input) 1046 done; 1047 1048 (* Calculate actual line indentation *) 1049 let line_indent = spaces_skipped + !extra_spaces in 1050 1051 (* Determine content indent from first content line (implicit case) *) 1052 let first_line = !content_indent = 0 in 1053 (* base_indent is 1-indexed column, convert to 0-indexed for comparison with line_indent. 1054 If base_indent = -1 (empty stack), then base_level = -1 means col 0 is valid. *) 1055 let base_level = base_indent - 1 in 1056 let should_process = 1057 if !content_indent = 0 then begin 1058 (* For implicit indent, content must be more indented than base_level. *) 1059 if line_indent <= base_level then false 1060 (* No content - first line not indented enough *) 1061 else begin 1062 (* Validate: first content line must be indented at least as much as 1063 the maximum indent seen on empty lines before it (5LLU, S98Z, W9L4) *) 1064 if line_indent < !max_empty_line_indent && line_indent > base_level 1065 then 1066 Error.raise_at (Input.mark t.input) 1067 (Invalid_block_scalar_header 1068 "wrongly indented line in block scalar"); 1069 content_indent := line_indent; 1070 true 1071 end 1072 end 1073 else if line_indent < !content_indent then false 1074 (* Dedented - done with content *) 1075 else true 1076 in 1077 1078 if should_process then begin 1079 (* Check if current line is "more indented" (has extra indent or starts with whitespace). 1080 For folded scalars, lines that start with any whitespace (space or tab) after the 1081 content indentation are "more indented" and preserve breaks. 1082 Note: we check Input.next_is_blank BEFORE reading content to see if content starts with whitespace. *) 1083 let trailing_blank = 1084 line_indent > !content_indent || Input.next_is_blank t.input 1085 in 1086 1087 (* Add trailing breaks to buffer *) 1088 if Buffer.length buf > 0 then begin 1089 if Buffer.length trailing_breaks > 0 then begin 1090 if literal then Buffer.add_buffer buf trailing_breaks 1091 else begin 1092 (* Folded scalar: fold only if both previous and current lines are not more-indented *) 1093 if (not !leading_blank) && not trailing_blank then begin 1094 let breaks = Buffer.contents trailing_breaks in 1095 if String.length breaks = 1 then Buffer.add_char buf ' ' 1096 else Buffer.add_substring buf breaks 1 (String.length breaks - 1) 1097 end 1098 else begin 1099 (* Preserve breaks for more-indented lines *) 1100 Buffer.add_buffer buf trailing_breaks 1101 end 1102 end 1103 end 1104 else if not literal then Buffer.add_char buf ' ' 1105 end 1106 else Buffer.add_buffer buf trailing_breaks; 1107 Buffer.clear trailing_breaks; 1108 1109 (* Add extra indentation for literal or more-indented folded lines *) 1110 (* On the first line (when determining content_indent), we've already consumed all spaces, 1111 so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *) 1112 if (not first_line) && (literal || (!extra_spaces > 0 && not literal)) 1113 then begin 1114 for _ = 1 to !extra_spaces do 1115 Buffer.add_char buf ' ' 1116 done 1117 end; 1118 1119 (* Read line content *) 1120 while 1121 (not (Input.is_eof t.input)) && not (Input.next_is_break t.input) 1122 do 1123 Buffer.add_char buf (Input.next_exn t.input) 1124 done; 1125 1126 (* Record trailing break *) 1127 if Input.next_is_break t.input then begin 1128 Buffer.add_char trailing_breaks '\n'; 1129 Input.consume_break t.input 1130 end; 1131 1132 (* Update leading_blank for next iteration *) 1133 leading_blank := trailing_blank; 1134 1135 read_lines () 1136 end 1137 end 1138 in 1139 1140 read_lines (); 1141 1142 (* Apply chomping *) 1143 let value = 1144 let content = Buffer.contents buf in 1145 match !chomping with 1146 | Chomping.Strip -> content 1147 | Chomping.Clip -> 1148 if String.length content > 0 then content ^ "\n" else content 1149 | Chomping.Keep -> content ^ Buffer.contents trailing_breaks 1150 in 1151 1152 let span = Span.make ~start ~stop:(Input.mark t.input) in 1153 let style = if literal then `Literal else `Folded in 1154 (value, style, span) 1155 1156(** Scan directive (after %) *) 1157let scan_directive t = 1158 let start = Input.mark t.input in 1159 ignore (Input.next t.input); 1160 1161 (* consume % *) 1162 1163 (* Read directive name *) 1164 let name_buf = Buffer.create 16 in 1165 while 1166 match Input.peek t.input with 1167 | Some c when Input.is_alnum c || c = '-' -> 1168 Buffer.add_char name_buf c; 1169 ignore (Input.next t.input); 1170 true 1171 | _ -> false 1172 do 1173 () 1174 done; 1175 let name = Buffer.contents name_buf in 1176 1177 (* Skip blanks *) 1178 while Input.next_is_blank t.input do 1179 ignore (Input.next t.input) 1180 done; 1181 1182 match name with 1183 | "YAML" -> 1184 (* Version directive: %YAML 1.2 *) 1185 let major = ref 0 in 1186 let minor = ref 0 in 1187 (* Read major version *) 1188 while Input.next_is_digit t.input do 1189 major := 1190 (!major * 10) + (Char.code (Input.next_exn t.input) - Char.code '0') 1191 done; 1192 (* Expect . *) 1193 (match Input.peek t.input with 1194 | Some '.' -> ignore (Input.next t.input) 1195 | _ -> 1196 Error.raise_at (Input.mark t.input) 1197 (Invalid_yaml_version "expected '.'")); 1198 (* Read minor version *) 1199 while Input.next_is_digit t.input do 1200 minor := 1201 (!minor * 10) + (Char.code (Input.next_exn t.input) - Char.code '0') 1202 done; 1203 (* Validate: only whitespace and comments allowed before line break (MUS6) *) 1204 skip_whitespace_and_comment t; 1205 if (not (Input.next_is_break t.input)) && not (Input.is_eof t.input) then 1206 Error.raise_at (Input.mark t.input) 1207 (Invalid_directive "expected comment or line break after version"); 1208 let span = Span.make ~start ~stop:(Input.mark t.input) in 1209 (Token.Version_directive { major = !major; minor = !minor }, span) 1210 | "TAG" -> 1211 (* Tag directive: %TAG !foo! tag:example.com,2000: *) 1212 let handle = scan_tag_handle t in 1213 (* Skip blanks *) 1214 while Input.next_is_blank t.input do 1215 ignore (Input.next t.input) 1216 done; 1217 (* Read prefix *) 1218 let prefix_buf = Buffer.create 32 in 1219 while 1220 match Input.peek t.input with 1221 | Some c when not (Input.is_whitespace c) -> 1222 Buffer.add_char prefix_buf c; 1223 ignore (Input.next t.input); 1224 true 1225 | _ -> false 1226 do 1227 () 1228 done; 1229 let prefix = Buffer.contents prefix_buf in 1230 let span = Span.make ~start ~stop:(Input.mark t.input) in 1231 (Token.Tag_directive { handle; prefix }, span) 1232 | _ -> 1233 (* Reserved/Unknown directive - skip to end of line and ignore *) 1234 (* Per YAML spec, reserved directives should be ignored with a warning *) 1235 while (not (Input.is_eof t.input)) && not (Input.next_is_break t.input) do 1236 ignore (Input.next t.input) 1237 done; 1238 let span = Span.make ~start ~stop:(Input.mark t.input) in 1239 (* Return an empty tag directive token to indicate directive was processed but ignored *) 1240 (Token.Tag_directive { handle = ""; prefix = "" }, span) 1241 1242(** Fetch the next token(s) into the queue *) 1243let rec fetch_next_token t = 1244 skip_to_next_token t; 1245 stale_simple_keys t; 1246 let col = column t in 1247 (* Unroll indents that are deeper than current column. 1248 Note: we use col, not col-1, to allow entries at the same level. *) 1249 unroll_indent t col; 1250 1251 (* We're about to process actual content, not leading whitespace *) 1252 t.leading_whitespace <- false; 1253 1254 if Input.is_eof t.input then fetch_stream_end t 1255 else if Input.at_document_boundary t.input then fetch_document_indicator t 1256 else begin 1257 match Input.peek t.input with 1258 | None -> fetch_stream_end t 1259 | Some '%' when (Input.position t.input).column = 1 -> fetch_directive t 1260 | Some '[' -> fetch_flow_collection_start t Token.Flow_sequence_start 1261 | Some '{' -> fetch_flow_collection_start t Token.Flow_mapping_start 1262 | Some ']' -> fetch_flow_collection_end t Token.Flow_sequence_end 1263 | Some '}' -> fetch_flow_collection_end t Token.Flow_mapping_end 1264 | Some ',' -> fetch_flow_entry t 1265 | Some '-' when t.flow_level = 0 && check_block_entry t -> 1266 fetch_block_entry t 1267 | Some '?' when check_key t -> fetch_key t 1268 | Some ':' when check_value t -> fetch_value t 1269 | Some '*' -> fetch_alias t 1270 | Some '&' -> fetch_anchor t 1271 | Some '!' -> fetch_tag t 1272 | Some '|' when t.flow_level = 0 -> fetch_block_scalar t true 1273 | Some '>' when t.flow_level = 0 -> fetch_block_scalar t false 1274 | Some '\'' -> fetch_single_quoted t 1275 | Some '"' -> fetch_double_quoted t 1276 | Some '-' when can_start_plain t -> fetch_plain_scalar t 1277 | Some '?' when can_start_plain t -> fetch_plain_scalar t 1278 | Some ':' when can_start_plain t -> fetch_plain_scalar t 1279 | Some c when can_start_plain_char c t -> fetch_plain_scalar t 1280 | Some c -> Error.raise_at (Input.mark t.input) (Unexpected_character c) 1281 end 1282 1283and fetch_stream_end t = 1284 if not t.stream_ended then begin 1285 unroll_indent t (-1); 1286 remove_simple_key t; 1287 t.allow_simple_key <- false; 1288 t.stream_ended <- true; 1289 let span = Span.point (Input.mark t.input) in 1290 emit t span Token.Stream_end 1291 end 1292 1293and fetch_document_indicator t = 1294 unroll_indent t (-1); 1295 remove_simple_key t; 1296 t.allow_simple_key <- false; 1297 let start = Input.mark t.input in 1298 let indicator = Input.peek_string t.input 3 in 1299 Input.skip t.input 3; 1300 let span = Span.make ~start ~stop:(Input.mark t.input) in 1301 let token = 1302 if indicator = "---" then Token.Document_start else Token.Document_end 1303 in 1304 (* Reset document content flag after document end marker *) 1305 if indicator = "..." then begin 1306 t.document_has_content <- false; 1307 (* After document end marker, skip whitespace and check for end of line or comment *) 1308 while Input.next_is_blank t.input do 1309 ignore (Input.next t.input) 1310 done; 1311 match Input.peek t.input with 1312 | None -> () (* EOF is ok *) 1313 | Some c when Input.is_break c -> () 1314 | Some '#' -> () (* Comment is ok *) 1315 | _ -> 1316 Error.raise_at start 1317 (Invalid_directive 1318 "content not allowed after document end marker on same line") 1319 end; 1320 emit t span token 1321 1322and fetch_directive t = 1323 (* Directives can only appear: 1324 1. At stream start (before any document content) 1325 2. After a document end marker (...) 1326 If we've emitted content in the current document, we need a document end marker first *) 1327 if t.document_has_content then 1328 Error.raise_at (Input.mark t.input) 1329 (Unexpected_token 1330 "directives must be separated from document content by document end \ 1331 marker (...)"); 1332 unroll_indent t (-1); 1333 remove_simple_key t; 1334 t.allow_simple_key <- false; 1335 let token, span = scan_directive t in 1336 emit t span token 1337 1338and fetch_flow_collection_start t token_type = 1339 save_simple_key t; 1340 (* Record indent of outermost flow collection *) 1341 if t.flow_level = 0 then t.flow_indent <- column t; 1342 t.flow_level <- t.flow_level + 1; 1343 (* Track whether this is a mapping or sequence *) 1344 let is_mapping = token_type = Token.Flow_mapping_start in 1345 t.flow_mapping_stack <- is_mapping :: t.flow_mapping_stack; 1346 t.allow_simple_key <- true; 1347 t.simple_keys <- None :: t.simple_keys; 1348 t.document_has_content <- true; 1349 let start = Input.mark t.input in 1350 ignore (Input.next t.input); 1351 let span = Span.make ~start ~stop:(Input.mark t.input) in 1352 emit t span token_type 1353 1354and fetch_flow_collection_end t token_type = 1355 remove_simple_key t; 1356 t.flow_level <- t.flow_level - 1; 1357 t.flow_mapping_stack <- 1358 (match t.flow_mapping_stack with _ :: rest -> rest | [] -> []); 1359 t.simple_keys <- (match t.simple_keys with _ :: rest -> rest | [] -> []); 1360 t.allow_simple_key <- false; 1361 let start = Input.mark t.input in 1362 ignore (Input.next t.input); 1363 (* Allow adjacent values after flow collection ends *) 1364 if t.flow_level > 0 then 1365 t.adjacent_value_allowed_at <- Some (Input.position t.input); 1366 let span = Span.make ~start ~stop:(Input.mark t.input) in 1367 emit t span token_type 1368 1369and fetch_flow_entry t = 1370 remove_simple_key t; 1371 t.allow_simple_key <- true; 1372 let start = Input.mark t.input in 1373 ignore (Input.next t.input); 1374 let span = Span.make ~start ~stop:(Input.mark t.input) in 1375 emit t span Token.Flow_entry 1376 1377and check_block_entry t = 1378 (* - followed by whitespace or EOF *) 1379 match Input.peek_nth t.input 1 with 1380 | None -> true 1381 | Some c -> Input.is_whitespace c 1382 1383and fetch_block_entry t = 1384 if t.flow_level = 0 then begin 1385 (* Block entries require allow_simple_key to be true. 1386 This prevents block sequences on the same line as a mapping value, 1387 e.g., "key: - a" is invalid. *) 1388 if not t.allow_simple_key then 1389 Error.raise_at (Input.mark t.input) Block_sequence_disallowed; 1390 let col = column t in 1391 if roll_indent t col then begin 1392 let span = Span.point (Input.mark t.input) in 1393 emit t span Token.Block_sequence_start 1394 end 1395 end; 1396 remove_simple_key t; 1397 t.allow_simple_key <- true; 1398 t.document_has_content <- true; 1399 let start = Input.mark t.input in 1400 ignore (Input.next t.input); 1401 1402 (* Check for tabs after - : pattern like -\t- is invalid *) 1403 let found_tabs, _found_spaces = skip_blanks_check_tabs t in 1404 if found_tabs then begin 1405 (* If we found tabs and next char is - followed by whitespace, error *) 1406 match Input.peek t.input with 1407 | Some '-' -> ( 1408 match Input.peek_nth t.input 1 with 1409 | None -> Error.raise_at start Tab_in_indentation 1410 | Some c when Input.is_whitespace c -> 1411 Error.raise_at start Tab_in_indentation 1412 | Some _ -> ()) 1413 | _ -> () 1414 end; 1415 1416 let span = Span.make ~start ~stop:(Input.mark t.input) in 1417 emit t span Token.Block_entry 1418 1419and check_key t = 1420 (* ? followed by whitespace or flow indicator in both block and flow *) 1421 match Input.peek_nth t.input 1 with 1422 | None -> true 1423 | Some c -> 1424 Input.is_whitespace c || (t.flow_level > 0 && Input.is_flow_indicator c) 1425 1426and fetch_key t = 1427 if t.flow_level = 0 then begin 1428 if not t.allow_simple_key then 1429 Error.raise_at (Input.mark t.input) Expected_key; 1430 let col = column t in 1431 if roll_indent t col then begin 1432 let span = Span.point (Input.mark t.input) in 1433 emit t span Token.Block_mapping_start 1434 end 1435 end; 1436 remove_simple_key t; 1437 t.allow_simple_key <- t.flow_level = 0; 1438 t.document_has_content <- true; 1439 let start = Input.mark t.input in 1440 ignore (Input.next t.input); 1441 1442 (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *) 1443 let found_tabs, _found_spaces = skip_blanks_check_tabs t in 1444 if found_tabs && t.flow_level = 0 then begin 1445 (* In block context, tabs after ? are not allowed *) 1446 Error.raise_at start Tab_in_indentation 1447 end; 1448 1449 let span = Span.make ~start ~stop:(Input.mark t.input) in 1450 emit t span Token.Key 1451 1452and check_value t = 1453 (* : followed by whitespace in block, or whitespace/flow indicator in flow, or adjacent value *) 1454 match Input.peek_nth t.input 1 with 1455 | None -> true 1456 | Some c -> ( 1457 Input.is_whitespace c 1458 || (t.flow_level > 0 && Input.is_flow_indicator c) 1459 || 1460 (* Allow adjacent values in flow context at designated positions *) 1461 t.flow_level > 0 1462 && 1463 match t.adjacent_value_allowed_at with 1464 | Some pos -> 1465 pos.Position.line = (Input.position t.input).Position.line 1466 && pos.Position.column = (Input.position t.input).Position.column 1467 | None -> false) 1468 1469and fetch_value t = 1470 let start = Input.mark t.input in 1471 (* Check for simple key *) 1472 let used_simple_key = 1473 match t.simple_keys with 1474 | Some sk :: _ when sk.sk_possible -> 1475 (* In implicit flow mapping (inside a flow sequence), key and : must be on the same line. 1476 In explicit flow mapping { }, key and : can span lines. *) 1477 let is_implicit_flow_mapping = 1478 match t.flow_mapping_stack with 1479 | false :: _ -> 1480 true (* false = we're in a sequence, so any mapping is implicit *) 1481 | _ -> false 1482 in 1483 if 1484 is_implicit_flow_mapping 1485 && sk.sk_position.line < (Input.position t.input).line 1486 then Error.raise_at start Illegal_flow_key_line; 1487 (* Insert KEY token before the simple key value *) 1488 let key_span = Span.point sk.sk_position in 1489 let key_token = { Token.token = Token.Key; span = key_span } in 1490 (* We need to insert at the right position *) 1491 let tokens = Queue.to_seq t.tokens |> Array.of_seq in 1492 Queue.clear t.tokens; 1493 let insert_pos = sk.sk_token_number - t.tokens_taken in 1494 Array.iteri 1495 (fun i tok -> 1496 if i = insert_pos then Queue.add key_token t.tokens; 1497 Queue.add tok t.tokens) 1498 tokens; 1499 if insert_pos >= Array.length tokens then Queue.add key_token t.tokens; 1500 t.token_number <- t.token_number + 1; 1501 (* Roll indent for implicit block mapping *) 1502 if t.flow_level = 0 then begin 1503 let col = sk.sk_position.column in 1504 if roll_indent t col then begin 1505 let span = Span.point sk.sk_position in 1506 (* Insert block mapping start before key *) 1507 let bm_token = { Token.token = Token.Block_mapping_start; span } in 1508 let tokens = Queue.to_seq t.tokens |> Array.of_seq in 1509 Queue.clear t.tokens; 1510 Array.iteri 1511 (fun i tok -> 1512 if i = insert_pos then Queue.add bm_token t.tokens; 1513 Queue.add tok t.tokens) 1514 tokens; 1515 if insert_pos >= Array.length tokens then 1516 Queue.add bm_token t.tokens; 1517 t.token_number <- t.token_number + 1 1518 end 1519 end; 1520 t.simple_keys <- None :: List.tl t.simple_keys; 1521 true 1522 | _ -> 1523 (* No simple key - this is a complex value (or empty key) *) 1524 if t.flow_level = 0 then begin 1525 if not t.allow_simple_key then 1526 Error.raise_at (Input.mark t.input) Expected_key; 1527 let col = column t in 1528 if roll_indent t col then begin 1529 let span = Span.point (Input.mark t.input) in 1530 emit t span Token.Block_mapping_start 1531 end 1532 (* Note: We don't emit KEY here. Empty key handling is done by the parser, 1533 which emits empty scalar when it sees VALUE without preceding KEY. *) 1534 end; 1535 false 1536 in 1537 remove_simple_key t; 1538 (* In block context without simple key, allow simple keys for compact mappings like ": moon: white" 1539 In flow context or after using a simple key, disallow simple keys *) 1540 t.allow_simple_key <- (not used_simple_key) && t.flow_level = 0; 1541 t.document_has_content <- true; 1542 let start = Input.mark t.input in 1543 ignore (Input.next t.input); 1544 1545 (* Check for tabs after : : patterns like :\t- or :\tkey: are invalid in block context (Y79Y/09) 1546 However, :\t bar (tab followed by space then content) is valid (6BCT) *) 1547 let found_tabs, found_spaces = skip_blanks_check_tabs t in 1548 if found_tabs && (not found_spaces) && t.flow_level = 0 then begin 1549 (* In block context, tabs-only after : followed by indicator or alphanumeric are not allowed *) 1550 match Input.peek t.input with 1551 | Some ('-' | '?') -> Error.raise_at start Tab_in_indentation 1552 | Some c 1553 when (c >= 'a' && c <= 'z') 1554 || (c >= 'A' && c <= 'Z') 1555 || (c >= '0' && c <= '9') -> 1556 (* Tab-only followed by alphanumeric - likely a key, which is invalid *) 1557 Error.raise_at start Tab_in_indentation 1558 | _ -> () 1559 end; 1560 1561 (* Skip any comment that may follow the colon and whitespace *) 1562 skip_whitespace_and_comment t; 1563 1564 let span = Span.make ~start ~stop:(Input.mark t.input) in 1565 emit t span Token.Value 1566 1567and fetch_anchor_or_alias t ~is_alias = 1568 save_simple_key t; 1569 t.allow_simple_key <- false; 1570 t.document_has_content <- true; 1571 let start = Input.mark t.input in 1572 ignore (Input.next t.input); 1573 (* consume * or & *) 1574 let name, span = scan_anchor_alias t in 1575 let span = Span.make ~start ~stop:span.stop in 1576 let token = if is_alias then Token.Alias name else Token.Anchor name in 1577 emit t span token 1578 1579and fetch_alias t = fetch_anchor_or_alias t ~is_alias:true 1580and fetch_anchor t = fetch_anchor_or_alias t ~is_alias:false 1581 1582and fetch_tag t = 1583 save_simple_key t; 1584 t.allow_simple_key <- false; 1585 t.document_has_content <- true; 1586 let handle, suffix, span = scan_tag t in 1587 emit t span (Token.Tag { handle; suffix }) 1588 1589and fetch_block_scalar t literal = 1590 remove_simple_key t; 1591 t.allow_simple_key <- true; 1592 t.document_has_content <- true; 1593 let value, style, span = scan_block_scalar t literal in 1594 emit t span (Token.Scalar { style; value }) 1595 1596and fetch_quoted t ~double = 1597 save_simple_key t; 1598 t.allow_simple_key <- false; 1599 t.document_has_content <- true; 1600 let value, span = 1601 if double then scan_double_quoted t else scan_single_quoted t 1602 in 1603 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *) 1604 skip_to_next_token t; 1605 if t.flow_level > 0 then 1606 t.adjacent_value_allowed_at <- Some (Input.position t.input); 1607 let style = if double then `Double_quoted else `Single_quoted in 1608 emit t span (Token.Scalar { style; value }) 1609 1610and fetch_single_quoted t = fetch_quoted t ~double:false 1611and fetch_double_quoted t = fetch_quoted t ~double:true 1612 1613and can_start_plain t = 1614 (* Check if - ? : can start a plain scalar *) 1615 match Input.peek_nth t.input 1 with 1616 | None -> false 1617 | Some c -> 1618 (not (Input.is_whitespace c)) 1619 && (t.flow_level = 0 || not (Input.is_flow_indicator c)) 1620 1621and can_start_plain_char c _t = 1622 (* Characters that can start a plain scalar *) 1623 if Input.is_whitespace c then false 1624 else if Input.is_indicator c then false 1625 else true 1626 1627and fetch_plain_scalar t = 1628 save_simple_key t; 1629 t.allow_simple_key <- false; 1630 t.document_has_content <- true; 1631 let value, span, ended_with_linebreak = scan_plain_scalar t in 1632 (* If the plain scalar ended after crossing a line break (leading_blanks = true), 1633 allow simple keys. This is important because the scanner already consumed the 1634 line break and leading whitespace when checking for continuation. *) 1635 if ended_with_linebreak then t.allow_simple_key <- true; 1636 emit t span (Token.Scalar { style = `Plain; value }) 1637 1638(** Check if we need more tokens to resolve simple keys *) 1639let need_more_tokens t = 1640 if t.stream_ended then false 1641 else if Queue.is_empty t.tokens then true 1642 else 1643 (* Check if any simple key could affect the first queued token *) 1644 List.exists 1645 (function 1646 | Some sk when sk.sk_possible -> sk.sk_token_number >= t.tokens_taken 1647 | _ -> false) 1648 t.simple_keys 1649 1650(** Ensure we have enough tokens to return one safely *) 1651let ensure_tokens t = 1652 if not t.stream_started then begin 1653 t.stream_started <- true; 1654 let span = Span.point (Input.position t.input) in 1655 let encoding, _ = Encoding.detect (Input.source t.input) in 1656 emit t span (Token.Stream_start encoding) 1657 end; 1658 while need_more_tokens t do 1659 fetch_next_token t 1660 done 1661 1662(** Get next token *) 1663let next t = 1664 ensure_tokens t; 1665 if Queue.is_empty t.tokens then None 1666 else begin 1667 t.tokens_taken <- t.tokens_taken + 1; 1668 Some (Queue.pop t.tokens) 1669 end 1670 1671(** Peek at next token *) 1672let peek t = 1673 ensure_tokens t; 1674 Queue.peek_opt t.tokens 1675 1676(** Iterate over all tokens *) 1677let iter f t = 1678 let rec loop () = 1679 match next t with 1680 | None -> () 1681 | Some tok -> 1682 f tok; 1683 loop () 1684 in 1685 loop () 1686 1687(** Fold over all tokens *) 1688let fold f init t = 1689 let rec loop acc = 1690 match next t with None -> acc | Some tok -> loop (f acc tok) 1691 in 1692 loop init 1693 1694(** Convert to list *) 1695let to_list t = fold (fun acc tok -> tok :: acc) [] t |> List.rev