Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(** YAML tokenizer/scanner with lookahead for ambiguity resolution *) 7 8(** Simple key tracking for mapping key disambiguation *) 9type simple_key = { 10 sk_possible : bool; 11 sk_required : bool; 12 sk_token_number : int; 13 sk_position : Position.t; 14} 15 16(** Indent level tracking *) 17type indent = { 18 indent : int; 19 needs_block_end : bool; 20 sequence : bool; (** true if this is a sequence indent *) 21} 22 23type t = { 24 input : Input.t; 25 mutable tokens : Token.spanned Queue.t; 26 mutable token_number : int; 27 mutable tokens_taken : int; 28 mutable stream_started : bool; 29 mutable stream_ended : bool; 30 mutable indent_stack : indent list; (** Stack of indentation levels *) 31 mutable flow_level : int; (** Nesting depth in \[\] or \{\} *) 32 mutable flow_indent : int; (** Column where outermost flow collection started *) 33 mutable simple_keys : simple_key option list; (** Per flow-level simple key tracking *) 34 mutable allow_simple_key : bool; 35 mutable leading_whitespace : bool; (** True when at start of line (only whitespace seen) *) 36 mutable document_has_content : bool; (** True if we've emitted content tokens in current document *) 37 mutable adjacent_value_allowed_at : Position.t option; (** Position where adjacent : is allowed *) 38 mutable pending_value : bool; (** True if we've emitted a KEY and are waiting for VALUE *) 39 mutable flow_mapping_stack : bool list; (** Stack of whether each flow level is a mapping *) 40} 41 42let create input = 43 { 44 input; 45 tokens = Queue.create (); 46 token_number = 0; 47 tokens_taken = 0; 48 stream_started = false; 49 stream_ended = false; 50 indent_stack = []; 51 flow_level = 0; 52 flow_indent = 0; 53 simple_keys = [None]; (* One entry for the base level *) 54 allow_simple_key = true; 55 leading_whitespace = true; (* Start at beginning of stream *) 56 document_has_content = false; 57 adjacent_value_allowed_at = None; 58 pending_value = false; 59 flow_mapping_stack = []; 60 } 61 62let of_string s = create (Input.of_string s) 63let of_input = create 64let of_reader r = create (Input.of_reader r) 65 66let position t = Input.position t.input 67 68(** Add a token to the queue *) 69let emit t span token = 70 Queue.add { Token.token; span } t.tokens; 71 t.token_number <- t.token_number + 1 72 73(** Get current column (1-indexed) *) 74let column t = (Input.position t.input).column 75 76(** Get current indent level *) 77let current_indent t = 78 match t.indent_stack with 79 | [] -> -1 80 | { indent; _ } :: _ -> indent 81 82(** Skip whitespace to end of line, checking for valid comments. 83 Returns true if any whitespace (including tabs) was found before a comment. *) 84let skip_whitespace_and_comment t = 85 let has_whitespace = ref false in 86 (* Skip blanks (spaces and tabs) *) 87 while Input.next_is_blank t.input do 88 has_whitespace := true; 89 ignore (Input.next t.input) 90 done; 91 (* Check for comment *) 92 if Input.next_is (( = ) '#') t.input then begin 93 (* Validate: comment must be preceded by whitespace or be at start of line *) 94 if not !has_whitespace then begin 95 (* Check if we're at the start of input or after whitespace (blank or line break) *) 96 match Input.peek_back t.input with 97 | None -> () (* Start of input - OK *) 98 | Some c when Input.is_whitespace c -> () (* After whitespace - OK *) 99 | _ -> 100 (* Comment not preceded by whitespace - ERROR *) 101 Error.raise_at (Input.mark t.input) Invalid_comment 102 end; 103 (* Skip to end of line *) 104 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do 105 ignore (Input.next t.input) 106 done 107 end 108 109(** Skip blanks (spaces/tabs) and return (found_tabs, found_spaces) *) 110let skip_blanks_check_tabs t = 111 let found_tab = ref false in 112 let found_space = ref false in 113 while Input.next_is_blank t.input do 114 (match Input.peek t.input with 115 | Some '\t' -> found_tab := true 116 | Some ' ' -> found_space := true 117 | _ -> ()); 118 ignore (Input.next t.input) 119 done; 120 (!found_tab, !found_space) 121 122(** Skip whitespace and comments, return true if at newline *) 123let rec skip_to_next_token t = 124 (* Check for tabs used as indentation in block context *) 125 (match Input.peek t.input with 126 | Some '\t' when t.flow_level = 0 && t.leading_whitespace && 127 (column t - 1) < current_indent t -> 128 (* Tab found in indentation zone - this is invalid *) 129 (* Skip to end of line to check if line has content *) 130 let start_pos = Input.mark t.input in 131 while Input.next_is_blank t.input do 132 ignore (Input.next t.input) 133 done; 134 (* If we have content on this line with a tab, raise error *) 135 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then 136 Error.raise_at start_pos Tab_in_indentation 137 | _ -> ()); 138 139 (* Skip blanks and validate comments *) 140 skip_whitespace_and_comment t; 141 (* Skip line break in block context *) 142 if t.flow_level = 0 && Input.next_is_break t.input then begin 143 Input.consume_break t.input; 144 t.allow_simple_key <- true; 145 t.leading_whitespace <- true; 146 skip_to_next_token t 147 end 148 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin 149 (* In flow context, skip all whitespace including line breaks *) 150 if Input.next_is_break t.input then begin 151 Input.consume_break t.input; 152 (* Allow simple keys after line breaks in flow context *) 153 t.allow_simple_key <- true; 154 (* After line break in flow, check for tabs at start of line (Y79Y/03) 155 Tabs are not allowed as indentation - if tab is first char and results 156 in a column less than flow_indent, it's an error *) 157 if Input.next_is (( = ) '\t') t.input then begin 158 (* Tab at start of line in flow context - skip tabs and check position *) 159 let start_mark = Input.mark t.input in 160 while Input.next_is (( = ) '\t') t.input do 161 ignore (Input.next t.input) 162 done; 163 (* If only tabs were used (no spaces) and column < flow_indent, error *) 164 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) && 165 column t < t.flow_indent then 166 Error.raise_at start_mark Invalid_flow_indentation 167 end; 168 skip_to_next_token t 169 end else begin 170 ignore (Input.next t.input); 171 skip_to_next_token t 172 end 173 end 174 175(** Roll the indentation level *) 176let roll_indent t col ~sequence = 177 if t.flow_level = 0 && col > current_indent t then begin 178 t.indent_stack <- { indent = col; needs_block_end = true; sequence } :: t.indent_stack; 179 true 180 end else 181 false 182 183(** Unroll indentation to given column *) 184let unroll_indent t col = 185 while t.flow_level = 0 && 186 match t.indent_stack with 187 | { indent; needs_block_end = true; _ } :: _ when indent > col -> true 188 | _ -> false 189 do 190 match t.indent_stack with 191 | { indent = _; needs_block_end = true; _ } :: rest -> 192 let pos = Input.position t.input in 193 let span = Span.point pos in 194 emit t span Token.Block_end; 195 t.indent_stack <- rest 196 | _ -> () 197 done 198 199(** Save a potential simple key *) 200let save_simple_key t = 201 if t.allow_simple_key then begin 202 (* A simple key is required only if we're in a block context, 203 at the current indentation level, AND the current indent needs a block end. 204 This matches saphyr's logic and prevents false positives for values. *) 205 let required = t.flow_level = 0 && 206 match t.indent_stack with 207 | { indent; needs_block_end = true; _ } :: _ -> 208 indent = column t 209 | _ -> false 210 in 211 let sk = { 212 sk_possible = true; 213 sk_required = required; 214 sk_token_number = t.token_number; 215 sk_position = Input.position t.input; 216 } in 217 (* Remove any existing simple key at current level *) 218 t.simple_keys <- ( 219 match t.simple_keys with 220 | _ :: rest -> Some sk :: rest 221 | [] -> [Some sk] 222 ) 223 end 224 225(** Remove simple key at current level *) 226let remove_simple_key t = 227 match t.simple_keys with 228 | Some sk :: _rest when sk.sk_required -> 229 Error.raise_at sk.sk_position Expected_key 230 | _ :: rest -> t.simple_keys <- None :: rest 231 | [] -> () 232 233(** Stale simple keys that span too many tokens *) 234let stale_simple_keys t = 235 t.simple_keys <- List.map (fun sk_opt -> 236 match sk_opt with 237 | Some sk when sk.sk_possible && 238 (Input.position t.input).line > sk.sk_position.line && 239 t.flow_level = 0 -> 240 if sk.sk_required then 241 Error.raise_at sk.sk_position Expected_key; 242 None 243 | _ -> sk_opt 244 ) t.simple_keys 245 246(** Read anchor or alias name *) 247let scan_anchor_alias t = 248 let start = Input.mark t.input in 249 let buf = Buffer.create 16 in 250 (* Per YAML 1.2 spec: anchor names can contain any character that is NOT: 251 - Whitespace (space, tab, line breaks) 252 - Flow indicators: []{} 253 - Comma (,) 254 This matches the saphyr implementation: is_yaml_non_space && !is_flow *) 255 while 256 match Input.peek t.input with 257 | Some c when not (Input.is_whitespace c) && 258 not (Input.is_flow_indicator c) && 259 c <> '\x00' -> 260 Buffer.add_char buf c; 261 ignore (Input.next t.input); 262 true 263 | _ -> false 264 do () done; 265 let name = Buffer.contents buf in 266 if String.length name = 0 then 267 Error.raise_at start (Invalid_anchor "empty anchor name"); 268 (name, Span.make ~start ~stop:(Input.mark t.input)) 269 270(** Scan tag handle *) 271let scan_tag_handle t = 272 let start = Input.mark t.input in 273 let buf = Buffer.create 16 in 274 (* Expect ! *) 275 (match Input.peek t.input with 276 | Some '!' -> 277 Buffer.add_char buf '!'; 278 ignore (Input.next t.input) 279 | _ -> Error.raise_at start (Invalid_tag "expected '!'")); 280 (* Read word chars *) 281 while 282 match Input.peek t.input with 283 | Some c when Input.is_alnum c || c = '-' -> 284 Buffer.add_char buf c; 285 ignore (Input.next t.input); 286 true 287 | _ -> false 288 do () done; 289 (* Check for secondary ! *) 290 (match Input.peek t.input with 291 | Some '!' -> 292 Buffer.add_char buf '!'; 293 ignore (Input.next t.input) 294 | _ -> ()); 295 Buffer.contents buf 296 297(** Scan tag suffix (after handle) *) 298let scan_tag_suffix t = 299 let is_hex_digit c = 300 (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f') 301 in 302 let hex_val c = 303 match c with 304 | '0'..'9' -> Char.code c - Char.code '0' 305 | 'A'..'F' -> Char.code c - Char.code 'A' + 10 306 | 'a'..'f' -> Char.code c - Char.code 'a' + 10 307 | _ -> 0 308 in 309 let buf = Buffer.create 32 in 310 while 311 match Input.peek t.input with 312 | Some '%' -> 313 (* Percent-encoded character *) 314 ignore (Input.next t.input); 315 (match Input.peek t.input, Input.peek_nth t.input 1 with 316 | Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 -> 317 ignore (Input.next t.input); 318 ignore (Input.next t.input); 319 let code = (hex_val c1) * 16 + (hex_val c2) in 320 Buffer.add_char buf (Char.chr code); 321 true 322 | _ -> 323 (* Invalid percent encoding - keep the % *) 324 Buffer.add_char buf '%'; 325 true) 326 | Some c when not (Input.is_whitespace c) && 327 not (Input.is_flow_indicator c) -> 328 Buffer.add_char buf c; 329 ignore (Input.next t.input); 330 true 331 | _ -> false 332 do () done; 333 Buffer.contents buf 334 335(** Scan a tag *) 336let scan_tag t = 337 let start = Input.mark t.input in 338 ignore (Input.next t.input); (* consume ! *) 339 let handle, suffix = 340 match Input.peek t.input with 341 | Some '<' -> 342 (* Verbatim tag: !<...> - handle is empty, suffix is full URI *) 343 ignore (Input.next t.input); 344 let buf = Buffer.create 32 in 345 while 346 match Input.peek t.input with 347 | Some '>' -> false 348 | Some c -> 349 Buffer.add_char buf c; 350 ignore (Input.next t.input); 351 true 352 | None -> Error.raise_at (Input.mark t.input) (Invalid_tag "unclosed verbatim tag") 353 do () done; 354 ignore (Input.next t.input); (* consume > *) 355 ("", Buffer.contents buf) 356 | Some c when Input.is_whitespace c || Input.is_flow_indicator c -> 357 (* Non-specific tag: ! *) 358 ("!", "") 359 | Some '!' -> 360 (* Secondary handle: !! *) 361 ignore (Input.next t.input); (* consume second ! *) 362 let suffix = scan_tag_suffix t in 363 ("!!", suffix) 364 | _ -> 365 (* Primary handle or just suffix: !foo or !e!foo *) 366 (* Read alphanumeric characters *) 367 let buf = Buffer.create 16 in 368 while 369 match Input.peek t.input with 370 | Some c when Input.is_alnum c || c = '-' -> 371 Buffer.add_char buf c; 372 ignore (Input.next t.input); 373 true 374 | _ -> false 375 do () done; 376 (* Check if next character is ! - if so, this is a named handle *) 377 (match Input.peek t.input with 378 | Some '!' -> 379 (* Named handle like !e! *) 380 ignore (Input.next t.input); 381 let handle_name = Buffer.contents buf in 382 let suffix = scan_tag_suffix t in 383 ("!" ^ handle_name ^ "!", suffix) 384 | _ -> 385 (* Just ! followed by suffix *) 386 ("!", Buffer.contents buf ^ scan_tag_suffix t)) 387 in 388 (* Validate that tag is followed by whitespace, break, or (in flow) flow indicator *) 389 (match Input.peek t.input with 390 | None -> () (* EOF is ok *) 391 | Some c when Input.is_whitespace c || Input.is_break c -> () 392 | Some c when t.flow_level > 0 && Input.is_flow_indicator c -> () 393 | _ -> Error.raise_at start (Invalid_tag "expected whitespace or line break after tag")); 394 let span = Span.make ~start ~stop:(Input.mark t.input) in 395 (handle, suffix, span) 396 397(** Scan single-quoted scalar *) 398let scan_single_quoted t = 399 let start = Input.mark t.input in 400 ignore (Input.next t.input); (* consume opening single-quote *) 401 let buf = Buffer.create 64 in 402 let whitespace = Buffer.create 16 in (* Track trailing whitespace *) 403 404 let flush_whitespace () = 405 if Buffer.length whitespace > 0 then begin 406 Buffer.add_buffer buf whitespace; 407 Buffer.clear whitespace 408 end 409 in 410 411 let rec loop () = 412 match Input.peek t.input with 413 | None -> Error.raise_at start Unclosed_single_quote 414 | Some '\'' -> 415 ignore (Input.next t.input); 416 (* Check for escaped quote ('') *) 417 (match Input.peek t.input with 418 | Some '\'' -> 419 flush_whitespace (); 420 Buffer.add_char buf '\''; 421 ignore (Input.next t.input); 422 loop () 423 | _ -> 424 (* End of string - flush any trailing whitespace *) 425 flush_whitespace ()) 426 | Some ' ' | Some '\t' -> 427 (* Track whitespace - don't add to buf yet *) 428 Buffer.add_char whitespace (Option.get (Input.peek t.input)); 429 ignore (Input.next t.input); 430 loop () 431 | Some '\n' | Some '\r' -> 432 (* Discard trailing whitespace before line break *) 433 Buffer.clear whitespace; 434 Input.consume_break t.input; 435 (* Skip leading whitespace on next line *) 436 while Input.next_is_blank t.input do 437 ignore (Input.next t.input) 438 done; 439 (* Check for document boundary *) 440 if Input.at_document_boundary t.input then 441 Error.raise_at start Unclosed_single_quote; 442 (* Check indentation: continuation must be > block indent (QB6E, DK95) *) 443 let col = column t in 444 let indent = current_indent t in 445 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then 446 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar"); 447 (* Count empty lines (consecutive line breaks) *) 448 let empty_lines = ref 0 in 449 while Input.next_is_break t.input do 450 incr empty_lines; 451 Input.consume_break t.input; 452 while Input.next_is_blank t.input do 453 ignore (Input.next t.input) 454 done; 455 if Input.at_document_boundary t.input then 456 Error.raise_at start Unclosed_single_quote; 457 (* Check indentation after each empty line too *) 458 let col = column t in 459 let indent = current_indent t in 460 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then 461 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar") 462 done; 463 (* Apply folding rules *) 464 if !empty_lines > 0 then begin 465 (* Empty lines: preserve as newlines *) 466 for _ = 1 to !empty_lines do 467 Buffer.add_char buf '\n' 468 done 469 end else 470 (* Single break: fold to space (even at start of string) *) 471 Buffer.add_char buf ' '; 472 loop () 473 | Some c -> 474 flush_whitespace (); 475 Buffer.add_char buf c; 476 ignore (Input.next t.input); 477 loop () 478 in 479 loop (); 480 let span = Span.make ~start ~stop:(Input.mark t.input) in 481 (Buffer.contents buf, span) 482 483(** Decode hex escape of given length *) 484let decode_hex t len = 485 let start = Input.mark t.input in 486 let buf = Buffer.create len in 487 for _ = 1 to len do 488 match Input.peek t.input with 489 | Some c when Input.is_hex c -> 490 Buffer.add_char buf c; 491 ignore (Input.next t.input) 492 | _ -> 493 Error.raise_at start (Invalid_hex_escape (Buffer.contents buf)) 494 done; 495 let code = int_of_string ("0x" ^ Buffer.contents buf) in 496 if code <= 0x7F then 497 String.make 1 (Char.chr code) 498 else if code <= 0x7FF then 499 let b1 = 0xC0 lor (code lsr 6) in 500 let b2 = 0x80 lor (code land 0x3F) in 501 String.init 2 (fun i -> Char.chr (if i = 0 then b1 else b2)) 502 else if code <= 0xFFFF then 503 let b1 = 0xE0 lor (code lsr 12) in 504 let b2 = 0x80 lor ((code lsr 6) land 0x3F) in 505 let b3 = 0x80 lor (code land 0x3F) in 506 String.init 3 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | _ -> b3)) 507 else 508 let b1 = 0xF0 lor (code lsr 18) in 509 let b2 = 0x80 lor ((code lsr 12) land 0x3F) in 510 let b3 = 0x80 lor ((code lsr 6) land 0x3F) in 511 let b4 = 0x80 lor (code land 0x3F) in 512 String.init 4 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | 2 -> b3 | _ -> b4)) 513 514(** Scan double-quoted scalar *) 515let scan_double_quoted t = 516 let start = Input.mark t.input in 517 ignore (Input.next t.input); (* consume opening double-quote *) 518 let buf = Buffer.create 64 in 519 let whitespace = Buffer.create 16 in (* Track pending whitespace *) 520 521 let flush_whitespace () = 522 if Buffer.length whitespace > 0 then begin 523 Buffer.add_buffer buf whitespace; 524 Buffer.clear whitespace 525 end 526 in 527 528 let rec loop () = 529 match Input.peek t.input with 530 | None -> Error.raise_at start Unclosed_double_quote 531 | Some '"' -> 532 (* Flush trailing whitespace before closing quote to preserve it *) 533 flush_whitespace (); 534 ignore (Input.next t.input) 535 | Some ' ' | Some '\t' as c_opt -> 536 (* Track whitespace - don't add to buf yet *) 537 let c = match c_opt with Some c -> c | None -> assert false in 538 Buffer.add_char whitespace c; 539 ignore (Input.next t.input); 540 loop () 541 | Some '\\' -> 542 (* Escape sequence - this is non-whitespace content *) 543 flush_whitespace (); (* Commit any pending whitespace *) 544 ignore (Input.next t.input); 545 (match Input.peek t.input with 546 | None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>") 547 | Some '0' -> Buffer.add_char buf '\x00'; ignore (Input.next t.input) 548 | Some 'a' -> Buffer.add_char buf '\x07'; ignore (Input.next t.input) 549 | Some 'b' -> Buffer.add_char buf '\x08'; ignore (Input.next t.input) 550 | Some 't' | Some '\t' -> Buffer.add_char buf '\t'; ignore (Input.next t.input) 551 | Some 'n' -> Buffer.add_char buf '\n'; ignore (Input.next t.input) 552 | Some 'v' -> Buffer.add_char buf '\x0B'; ignore (Input.next t.input) 553 | Some 'f' -> Buffer.add_char buf '\x0C'; ignore (Input.next t.input) 554 | Some 'r' -> Buffer.add_char buf '\r'; ignore (Input.next t.input) 555 | Some 'e' -> Buffer.add_char buf '\x1B'; ignore (Input.next t.input) 556 | Some ' ' -> Buffer.add_char buf ' '; ignore (Input.next t.input) 557 | Some '"' -> Buffer.add_char buf '"'; ignore (Input.next t.input) 558 | Some '/' -> Buffer.add_char buf '/'; ignore (Input.next t.input) 559 | Some '\\' -> Buffer.add_char buf '\\'; ignore (Input.next t.input) 560 | Some 'N' -> Buffer.add_string buf "\xC2\x85"; ignore (Input.next t.input) (* NEL *) 561 | Some '_' -> Buffer.add_string buf "\xC2\xA0"; ignore (Input.next t.input) (* NBSP *) 562 | Some 'L' -> Buffer.add_string buf "\xE2\x80\xA8"; ignore (Input.next t.input) (* LS *) 563 | Some 'P' -> Buffer.add_string buf "\xE2\x80\xA9"; ignore (Input.next t.input) (* PS *) 564 | Some 'x' -> 565 ignore (Input.next t.input); 566 Buffer.add_string buf (decode_hex t 2) 567 | Some 'u' -> 568 ignore (Input.next t.input); 569 Buffer.add_string buf (decode_hex t 4) 570 | Some 'U' -> 571 ignore (Input.next t.input); 572 Buffer.add_string buf (decode_hex t 8) 573 | Some '\n' | Some '\r' -> 574 (* Line continuation escape *) 575 Input.consume_break t.input; 576 while Input.next_is_blank t.input do 577 ignore (Input.next t.input) 578 done 579 | Some c -> 580 Error.raise_at (Input.mark t.input) 581 (Invalid_escape_sequence (Printf.sprintf "\\%c" c))); 582 loop () 583 | Some '\n' | Some '\r' -> 584 (* Line break: discard any pending trailing whitespace *) 585 Buffer.clear whitespace; 586 Input.consume_break t.input; 587 (* Count consecutive line breaks (empty lines) *) 588 let empty_lines = ref 0 in 589 let continue = ref true in 590 let started_with_tab = ref false in 591 while !continue do 592 (* Track if we start with a tab (for DK95/01 check) *) 593 if Input.next_is (( = ) '\t') t.input then started_with_tab := true; 594 (* Skip blanks (spaces/tabs) on the line *) 595 while Input.next_is_blank t.input do 596 ignore (Input.next t.input) 597 done; 598 (* Check if we hit another line break (empty line) *) 599 if Input.next_is_break t.input then begin 600 Input.consume_break t.input; 601 incr empty_lines; 602 started_with_tab := false (* Reset for next line *) 603 end else 604 continue := false 605 done; 606 (* Check for document boundary - this terminates the quoted string *) 607 if Input.at_document_boundary t.input then 608 Error.raise_at start Unclosed_double_quote; 609 (* Check indentation: continuation must be > block indent (QB6E, DK95) 610 Note: must be strictly greater than block indent, not just equal *) 611 let col = column t in 612 let indent = current_indent t in 613 let start_col = start.column in 614 (* DK95/01: if continuation started with tabs and column < start column, error *) 615 if not (Input.is_eof t.input) && !started_with_tab && col < start_col then 616 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar"); 617 if not (Input.is_eof t.input) && col <= indent && indent >= 0 then 618 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar"); 619 (* Per YAML spec: single break = space, break + empty lines = newlines *) 620 if !empty_lines > 0 then begin 621 (* Empty lines: output N newlines where N = number of empty lines *) 622 for _ = 1 to !empty_lines do 623 Buffer.add_char buf '\n' 624 done 625 end else 626 (* Single break folds to space *) 627 Buffer.add_char buf ' '; 628 loop () 629 | Some c -> 630 (* Non-whitespace character *) 631 flush_whitespace (); (* Commit any pending whitespace *) 632 Buffer.add_char buf c; 633 ignore (Input.next t.input); 634 loop () 635 in 636 loop (); 637 let span = Span.make ~start ~stop:(Input.mark t.input) in 638 (Buffer.contents buf, span) 639 640(** Check if character can appear in plain scalar at this position *) 641let can_continue_plain t c ~in_flow = 642 match c with 643 | ':' -> 644 (* : is OK if not followed by whitespace or flow indicator *) 645 (match Input.peek_nth t.input 1 with 646 | None -> true 647 | Some c2 when Input.is_whitespace c2 -> false 648 | Some c2 when in_flow && Input.is_flow_indicator c2 -> false 649 | _ -> true) 650 | '#' -> 651 (* # is a comment indicator only if preceded by whitespace *) 652 (* Check the previous character to determine if this is a comment *) 653 (match Input.peek_back t.input with 654 | None -> true (* At start - can't be comment indicator, allow it *) 655 | Some c when Input.is_whitespace c -> false (* Preceded by whitespace - comment *) 656 | Some c when Input.is_break c -> false (* At start of line - comment *) 657 | _ -> true) (* Not preceded by whitespace - part of scalar *) 658 | c when in_flow && Input.is_flow_indicator c -> false 659 | _ when Input.is_break c -> false 660 | _ -> true 661 662(** Scan plain scalar *) 663let scan_plain_scalar t = 664 let start = Input.mark t.input in 665 let in_flow = t.flow_level > 0 in 666 let indent = current_indent t in 667 (* In flow context, scalars must be indented more than the current block indent. 668 This ensures that content at block indent or less ends the flow context. *) 669 if in_flow && (column t - 1) < indent then 670 Error.raise_at start Invalid_flow_indentation; 671 let buf = Buffer.create 64 in 672 let spaces = Buffer.create 16 in 673 let whitespace = Buffer.create 16 in (* Track whitespace within a line *) 674 let leading_blanks = ref false in 675 676 let rec scan_line () = 677 match Input.peek t.input with 678 | None -> () 679 | Some c when Input.is_blank c && can_continue_plain t c ~in_flow -> 680 (* Blank character within a line - save to whitespace buffer *) 681 Buffer.add_char whitespace c; 682 ignore (Input.next t.input); 683 scan_line () 684 | Some c when can_continue_plain t c ~in_flow -> 685 (* Non-blank character - process any pending breaks/whitespace first *) 686 begin 687 if Buffer.length spaces > 0 then begin 688 if !leading_blanks then begin 689 (* Fold line break *) 690 if Buffer.contents spaces = "\n" then 691 Buffer.add_char buf ' ' 692 else begin 693 (* Multiple breaks - preserve all but first *) 694 let s = Buffer.contents spaces in 695 Buffer.add_substring buf s 1 (String.length s - 1) 696 end 697 end else 698 Buffer.add_buffer buf spaces; 699 Buffer.clear spaces 700 end; 701 (* Add any pending whitespace from within the line *) 702 if Buffer.length whitespace > 0 then begin 703 Buffer.add_buffer buf whitespace; 704 Buffer.clear whitespace 705 end; 706 (* Add the character *) 707 Buffer.add_char buf c; 708 ignore (Input.next t.input); 709 leading_blanks := false; 710 scan_line () 711 end 712 | _ -> () 713 in 714 715 let rec scan_lines () = 716 scan_line (); 717 (* Check for line continuation *) 718 if Input.next_is_break t.input then begin 719 (* Discard any trailing whitespace from the current line *) 720 Buffer.clear whitespace; 721 (* Save the line break *) 722 if !leading_blanks then begin 723 (* We already had a break - this is an additional break (empty line) *) 724 Buffer.add_char spaces '\n' 725 end else begin 726 (* First line break *) 727 Buffer.clear spaces; 728 Buffer.add_char spaces '\n'; 729 leading_blanks := true 730 end; 731 Input.consume_break t.input; 732 (* Note: We do NOT set allow_simple_key here during plain scalar scanning. 733 Setting it here would incorrectly allow ':' that appears on a continuation 734 line to become a mapping indicator. The flag will be set properly after 735 the scalar ends and skip_to_next_token processes line breaks. *) 736 (* Skip leading blanks on the next line *) 737 while Input.next_is_blank t.input do 738 ignore (Input.next t.input) 739 done; 740 let col = (Input.position t.input).column in 741 (* Check indentation - stop if we're at or before the containing block's indent *) 742 (* However, allow empty lines (line breaks) to continue even if dedented *) 743 if Input.next_is_break t.input then 744 scan_lines () (* Empty line - continue *) 745 else if not in_flow && col <= indent then 746 () (* Stop - dedented or at parent level in block context *) 747 else if Input.at_document_boundary t.input then 748 () (* Stop - document boundary *) 749 else 750 scan_lines () 751 end 752 in 753 754 scan_lines (); 755 let value = Buffer.contents buf in 756 (* Trim trailing whitespace (spaces and tabs) *) 757 let value = 758 let len = String.length value in 759 let rec find_end i = 760 if i < 0 then 0 761 else match value.[i] with 762 | ' ' | '\t' -> find_end (i - 1) 763 | _ -> i + 1 764 in 765 let end_pos = find_end (len - 1) in 766 String.sub value 0 end_pos 767 in 768 let span = Span.make ~start ~stop:(Input.mark t.input) in 769 (* Return value, span, and whether we ended with leading blanks (crossed a line break) *) 770 (value, span, !leading_blanks) 771 772(** Scan block scalar (literal | or folded >) *) 773let scan_block_scalar t literal = 774 let start = Input.mark t.input in 775 ignore (Input.next t.input); (* consume | or > *) 776 777 (* Parse header: optional indentation indicator and chomping *) 778 let explicit_indent = ref None in 779 let chomping = ref Chomping.Clip in 780 781 (* First character of header *) 782 (match Input.peek t.input with 783 | Some c when Input.is_digit c && c <> '0' -> 784 explicit_indent := Some (Char.code c - Char.code '0'); 785 ignore (Input.next t.input) 786 | Some '-' -> chomping := Chomping.Strip; ignore (Input.next t.input) 787 | Some '+' -> chomping := Chomping.Keep; ignore (Input.next t.input) 788 | _ -> ()); 789 790 (* Second character of header *) 791 (match Input.peek t.input with 792 | Some c when Input.is_digit c && c <> '0' && !explicit_indent = None -> 793 explicit_indent := Some (Char.code c - Char.code '0'); 794 ignore (Input.next t.input) 795 | Some '-' when !chomping = Chomping.Clip -> 796 chomping := Chomping.Strip; ignore (Input.next t.input) 797 | Some '+' when !chomping = Chomping.Clip -> 798 chomping := Chomping.Keep; ignore (Input.next t.input) 799 | _ -> ()); 800 801 (* Skip whitespace and optional comment *) 802 skip_whitespace_and_comment t; 803 804 (* Consume line break *) 805 if Input.next_is_break t.input then 806 Input.consume_break t.input 807 else if not (Input.is_eof t.input) then 808 Error.raise_at (Input.mark t.input) 809 (Invalid_block_scalar_header "expected newline after header"); 810 811 let base_indent = current_indent t in 812 (* base_indent is the indent level from the stack, -1 if empty. 813 It's used directly for comparisons in implicit indent case. *) 814 let content_indent = ref ( 815 match !explicit_indent with 816 | Some n -> 817 (* Explicit indent: base_indent is 1-indexed column, convert to 0-indexed. 818 content_indent = (base_indent - 1) + n, but at least n for document level. *) 819 let base_level = max 0 (base_indent - 1) in 820 base_level + n 821 | None -> 0 (* Will be determined by first non-empty line *) 822 ) in 823 824 let buf = Buffer.create 256 in 825 let trailing_breaks = Buffer.create 16 in 826 let leading_blank = ref false in (* Was the previous line "more indented"? *) 827 let max_empty_line_indent = ref 0 in (* Track max indent of empty lines before first content *) 828 829 (* Skip to content indentation, skipping empty lines. 830 Returns the number of spaces actually skipped (important for detecting dedentation). *) 831 let rec skip_to_content_indent () = 832 if !content_indent > 0 then begin 833 (* Explicit indent - skip up to content_indent spaces *) 834 let spaces_skipped = ref 0 in 835 while !spaces_skipped < !content_indent && Input.next_is (( = ) ' ') t.input do 836 incr spaces_skipped; 837 ignore (Input.next t.input) 838 done; 839 840 (* Check if this line is empty (only spaces/tabs until break/eof) *) 841 if Input.next_is_break t.input then begin 842 (* Empty line - record the break and continue *) 843 Buffer.add_char trailing_breaks '\n'; 844 Input.consume_break t.input; 845 skip_to_content_indent () 846 end else if !spaces_skipped < !content_indent then begin 847 (* Line starts with fewer spaces than content_indent - dedented *) 848 !spaces_skipped 849 end else if Input.next_is_blank t.input then begin 850 (* Line has spaces/tabs beyond content_indent - could be whitespace content or empty line. 851 For literal scalars, whitespace-only lines ARE content (not empty). 852 For folded scalars, whitespace-only lines that are "more indented" are preserved. *) 853 if literal then 854 (* Literal: whitespace beyond content_indent is content, let read_lines handle it *) 855 !content_indent 856 else begin 857 (* Folded: check if rest is only blanks *) 858 let idx = ref 0 in 859 while match Input.peek_nth t.input !idx with 860 | Some c when Input.is_blank c -> incr idx; true 861 | _ -> false 862 do () done; 863 match Input.peek_nth t.input (!idx) with 864 | None | Some '\n' | Some '\r' -> 865 (* Empty/whitespace-only line in folded - skip spaces *) 866 while Input.next_is_blank t.input do 867 ignore (Input.next t.input) 868 done; 869 Buffer.add_char trailing_breaks '\n'; 870 Input.consume_break t.input; 871 skip_to_content_indent () 872 | _ -> 873 (* Has non-whitespace content *) 874 !content_indent 875 end 876 end else 877 !content_indent 878 end else begin 879 (* Implicit indent - skip empty lines without consuming spaces. 880 Note: Only SPACES count as indentation. Tabs are content, not indentation. 881 So we only check for spaces when determining if a line is "empty". *) 882 if Input.next_is_break t.input then begin 883 Buffer.add_char trailing_breaks '\n'; 884 Input.consume_break t.input; 885 skip_to_content_indent () 886 end else if Input.next_is (( = ) ' ') t.input then begin 887 (* Check if line is empty (only spaces before break) *) 888 let idx = ref 0 in 889 while match Input.peek_nth t.input !idx with 890 | Some ' ' -> incr idx; true 891 | _ -> false 892 do () done; 893 match Input.peek_nth t.input (!idx) with 894 | None | Some '\n' | Some '\r' -> 895 (* Line has only spaces - empty line *) 896 (* Track max indent of empty lines for later validation *) 897 if !idx > !max_empty_line_indent then 898 max_empty_line_indent := !idx; 899 while Input.next_is (( = ) ' ') t.input do 900 ignore (Input.next t.input) 901 done; 902 Buffer.add_char trailing_breaks '\n'; 903 Input.consume_break t.input; 904 skip_to_content_indent () 905 | _ -> 906 (* Has content (including tabs which are content, not indentation) *) 907 0 908 end else if Input.next_is (( = ) '\t') t.input then begin 909 (* Tab at start of line in implicit indent mode - this is an error (Y79Y) 910 because tabs cannot be used as indentation in YAML *) 911 Error.raise_at (Input.mark t.input) Tab_in_indentation 912 end else 913 (* Not at break or space - other content character *) 914 0 915 end 916 in 917 918 (* Read content *) 919 let rec read_lines () = 920 let spaces_skipped = skip_to_content_indent () in 921 922 (* Check if we're at content *) 923 if Input.is_eof t.input then () 924 else if Input.at_document_boundary t.input then () 925 else begin 926 (* Count additional leading spaces beyond what was skipped *) 927 let extra_spaces = ref 0 in 928 while Input.next_is (( = ) ' ') t.input do 929 incr extra_spaces; 930 ignore (Input.next t.input) 931 done; 932 933 (* Calculate actual line indentation *) 934 let line_indent = spaces_skipped + !extra_spaces in 935 936 (* Determine content indent from first content line (implicit case) *) 937 let first_line = !content_indent = 0 in 938 (* base_indent is 1-indexed column, convert to 0-indexed for comparison with line_indent. 939 If base_indent = -1 (empty stack), then base_level = -1 means col 0 is valid. *) 940 let base_level = base_indent - 1 in 941 let should_process = 942 if !content_indent = 0 then begin 943 (* For implicit indent, content must be more indented than base_level. *) 944 if line_indent <= base_level then 945 false (* No content - first line not indented enough *) 946 else begin 947 (* Validate: first content line must be indented at least as much as 948 the maximum indent seen on empty lines before it (5LLU, S98Z, W9L4) *) 949 if line_indent < !max_empty_line_indent && line_indent > base_level then 950 Error.raise_at (Input.mark t.input) 951 (Invalid_block_scalar_header "wrongly indented line in block scalar"); 952 content_indent := line_indent; 953 true 954 end 955 end else if line_indent < !content_indent then 956 false (* Dedented - done with content *) 957 else 958 true 959 in 960 961 if should_process then begin 962 (* Check if current line is "more indented" (has extra indent or starts with whitespace). 963 For folded scalars, lines that start with any whitespace (space or tab) after the 964 content indentation are "more indented" and preserve breaks. 965 Note: we check Input.next_is_blank BEFORE reading content to see if content starts with whitespace. *) 966 let trailing_blank = line_indent > !content_indent || Input.next_is_blank t.input in 967 968 (* Add trailing breaks to buffer *) 969 if Buffer.length buf > 0 then begin 970 if Buffer.length trailing_breaks > 0 then begin 971 if literal then 972 Buffer.add_buffer buf trailing_breaks 973 else begin 974 (* Folded scalar: fold only if both previous and current lines are not more-indented *) 975 if not !leading_blank && not trailing_blank then begin 976 let breaks = Buffer.contents trailing_breaks in 977 if String.length breaks = 1 then 978 Buffer.add_char buf ' ' 979 else 980 Buffer.add_substring buf breaks 1 (String.length breaks - 1) 981 end else begin 982 (* Preserve breaks for more-indented lines *) 983 Buffer.add_buffer buf trailing_breaks 984 end 985 end 986 end else if not literal then 987 Buffer.add_char buf ' ' 988 end else 989 Buffer.add_buffer buf trailing_breaks; 990 Buffer.clear trailing_breaks; 991 992 (* Add extra indentation for literal or more-indented folded lines *) 993 (* On the first line (when determining content_indent), we've already consumed all spaces, 994 so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *) 995 if not first_line && (literal || (!extra_spaces > 0 && not literal)) then begin 996 for _ = 1 to !extra_spaces do 997 Buffer.add_char buf ' ' 998 done 999 end; 1000 1001 (* Read line content *) 1002 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do 1003 Buffer.add_char buf (Input.next_exn t.input) 1004 done; 1005 1006 (* Record trailing break *) 1007 if Input.next_is_break t.input then begin 1008 Buffer.add_char trailing_breaks '\n'; 1009 Input.consume_break t.input 1010 end; 1011 1012 (* Update leading_blank for next iteration *) 1013 leading_blank := trailing_blank; 1014 1015 read_lines () 1016 end 1017 end 1018 in 1019 1020 read_lines (); 1021 1022 (* Apply chomping *) 1023 let value = 1024 let content = Buffer.contents buf in 1025 match !chomping with 1026 | Chomping.Strip -> content 1027 | Chomping.Clip -> 1028 if String.length content > 0 then content ^ "\n" else content 1029 | Chomping.Keep -> 1030 content ^ Buffer.contents trailing_breaks 1031 in 1032 1033 let span = Span.make ~start ~stop:(Input.mark t.input) in 1034 let style = if literal then `Literal else `Folded in 1035 (value, style, span) 1036 1037(** Scan directive (after %) *) 1038let scan_directive t = 1039 let start = Input.mark t.input in 1040 ignore (Input.next t.input); (* consume % *) 1041 1042 (* Read directive name *) 1043 let name_buf = Buffer.create 16 in 1044 while 1045 match Input.peek t.input with 1046 | Some c when Input.is_alnum c || c = '-' -> 1047 Buffer.add_char name_buf c; 1048 ignore (Input.next t.input); 1049 true 1050 | _ -> false 1051 do () done; 1052 let name = Buffer.contents name_buf in 1053 1054 (* Skip blanks *) 1055 while Input.next_is_blank t.input do 1056 ignore (Input.next t.input) 1057 done; 1058 1059 match name with 1060 | "YAML" -> 1061 (* Version directive: %YAML 1.2 *) 1062 let major = ref 0 in 1063 let minor = ref 0 in 1064 (* Read major version *) 1065 while Input.next_is_digit t.input do 1066 major := !major * 10 + (Char.code (Input.next_exn t.input) - Char.code '0') 1067 done; 1068 (* Expect . *) 1069 (match Input.peek t.input with 1070 | Some '.' -> ignore (Input.next t.input) 1071 | _ -> Error.raise_at (Input.mark t.input) (Invalid_yaml_version "expected '.'")); 1072 (* Read minor version *) 1073 while Input.next_is_digit t.input do 1074 minor := !minor * 10 + (Char.code (Input.next_exn t.input) - Char.code '0') 1075 done; 1076 (* Validate: only whitespace and comments allowed before line break (MUS6) *) 1077 skip_whitespace_and_comment t; 1078 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then 1079 Error.raise_at (Input.mark t.input) (Invalid_directive "expected comment or line break after version"); 1080 let span = Span.make ~start ~stop:(Input.mark t.input) in 1081 Token.Version_directive { major = !major; minor = !minor }, span 1082 1083 | "TAG" -> 1084 (* Tag directive: %TAG !foo! tag:example.com,2000: *) 1085 let handle = scan_tag_handle t in 1086 (* Skip blanks *) 1087 while Input.next_is_blank t.input do 1088 ignore (Input.next t.input) 1089 done; 1090 (* Read prefix *) 1091 let prefix_buf = Buffer.create 32 in 1092 while 1093 match Input.peek t.input with 1094 | Some c when not (Input.is_whitespace c) -> 1095 Buffer.add_char prefix_buf c; 1096 ignore (Input.next t.input); 1097 true 1098 | _ -> false 1099 do () done; 1100 let prefix = Buffer.contents prefix_buf in 1101 let span = Span.make ~start ~stop:(Input.mark t.input) in 1102 Token.Tag_directive { handle; prefix }, span 1103 1104 | _ -> 1105 (* Reserved/Unknown directive - skip to end of line and ignore *) 1106 (* Per YAML spec, reserved directives should be ignored with a warning *) 1107 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do 1108 ignore (Input.next t.input) 1109 done; 1110 let span = Span.make ~start ~stop:(Input.mark t.input) in 1111 (* Return an empty tag directive token to indicate directive was processed but ignored *) 1112 Token.Tag_directive { handle = ""; prefix = "" }, span 1113 1114(** Fetch the next token(s) into the queue *) 1115let rec fetch_next_token t = 1116 skip_to_next_token t; 1117 stale_simple_keys t; 1118 let col = column t in 1119 (* Unroll indents that are deeper than current column. 1120 Note: we use col, not col-1, to allow entries at the same level. *) 1121 unroll_indent t col; 1122 1123 (* We're about to process actual content, not leading whitespace *) 1124 t.leading_whitespace <- false; 1125 1126 if Input.is_eof t.input then 1127 fetch_stream_end t 1128 else if Input.at_document_boundary t.input then 1129 fetch_document_indicator t 1130 else begin 1131 match Input.peek t.input with 1132 | None -> fetch_stream_end t 1133 | Some '%' when (Input.position t.input).column = 1 -> 1134 fetch_directive t 1135 | Some '[' -> fetch_flow_collection_start t Token.Flow_sequence_start 1136 | Some '{' -> fetch_flow_collection_start t Token.Flow_mapping_start 1137 | Some ']' -> fetch_flow_collection_end t Token.Flow_sequence_end 1138 | Some '}' -> fetch_flow_collection_end t Token.Flow_mapping_end 1139 | Some ',' -> fetch_flow_entry t 1140 | Some '-' when t.flow_level = 0 && check_block_entry t -> 1141 fetch_block_entry t 1142 | Some '?' when check_key t -> 1143 fetch_key t 1144 | Some ':' when check_value t -> 1145 fetch_value t 1146 | Some '*' -> fetch_alias t 1147 | Some '&' -> fetch_anchor t 1148 | Some '!' -> fetch_tag t 1149 | Some '|' when t.flow_level = 0 -> fetch_block_scalar t true 1150 | Some '>' when t.flow_level = 0 -> fetch_block_scalar t false 1151 | Some '\'' -> fetch_single_quoted t 1152 | Some '"' -> fetch_double_quoted t 1153 | Some '-' when can_start_plain t -> 1154 fetch_plain_scalar t 1155 | Some '?' when can_start_plain t -> 1156 fetch_plain_scalar t 1157 | Some ':' when can_start_plain t -> 1158 fetch_plain_scalar t 1159 | Some c when can_start_plain_char c t -> 1160 fetch_plain_scalar t 1161 | Some c -> 1162 Error.raise_at (Input.mark t.input) (Unexpected_character c) 1163 end 1164 1165and fetch_stream_end t = 1166 if not t.stream_ended then begin 1167 unroll_indent t (-1); 1168 remove_simple_key t; 1169 t.allow_simple_key <- false; 1170 t.stream_ended <- true; 1171 let span = Span.point (Input.mark t.input) in 1172 emit t span Token.Stream_end 1173 end 1174 1175and fetch_document_indicator t = 1176 unroll_indent t (-1); 1177 remove_simple_key t; 1178 t.allow_simple_key <- false; 1179 let start = Input.mark t.input in 1180 let indicator = Input.peek_string t.input 3 in 1181 Input.skip t.input 3; 1182 let span = Span.make ~start ~stop:(Input.mark t.input) in 1183 let token = if indicator = "---" then Token.Document_start else Token.Document_end in 1184 (* Reset document content flag after document end marker *) 1185 if indicator = "..." then begin 1186 t.document_has_content <- false; 1187 (* After document end marker, skip whitespace and check for end of line or comment *) 1188 while Input.next_is_blank t.input do ignore (Input.next t.input) done; 1189 (match Input.peek t.input with 1190 | None -> () (* EOF is ok *) 1191 | Some c when Input.is_break c -> () 1192 | Some '#' -> () (* Comment is ok *) 1193 | _ -> Error.raise_at start (Invalid_directive "content not allowed after document end marker on same line")) 1194 end; 1195 emit t span token 1196 1197and fetch_directive t = 1198 (* Directives can only appear: 1199 1. At stream start (before any document content) 1200 2. After a document end marker (...) 1201 If we've emitted content in the current document, we need a document end marker first *) 1202 if t.document_has_content then 1203 Error.raise_at (Input.mark t.input) 1204 (Unexpected_token "directives must be separated from document content by document end marker (...)"); 1205 unroll_indent t (-1); 1206 remove_simple_key t; 1207 t.allow_simple_key <- false; 1208 let token, span = scan_directive t in 1209 emit t span token 1210 1211and fetch_flow_collection_start t token_type = 1212 save_simple_key t; 1213 (* Record indent of outermost flow collection *) 1214 if t.flow_level = 0 then 1215 t.flow_indent <- column t; 1216 t.flow_level <- t.flow_level + 1; 1217 (* Track whether this is a mapping or sequence *) 1218 let is_mapping = (token_type = Token.Flow_mapping_start) in 1219 t.flow_mapping_stack <- is_mapping :: t.flow_mapping_stack; 1220 t.allow_simple_key <- true; 1221 t.simple_keys <- None :: t.simple_keys; 1222 t.document_has_content <- true; 1223 let start = Input.mark t.input in 1224 ignore (Input.next t.input); 1225 let span = Span.make ~start ~stop:(Input.mark t.input) in 1226 emit t span token_type 1227 1228and fetch_flow_collection_end t token_type = 1229 remove_simple_key t; 1230 t.flow_level <- t.flow_level - 1; 1231 t.flow_mapping_stack <- (match t.flow_mapping_stack with _ :: rest -> rest | [] -> []); 1232 t.simple_keys <- (match t.simple_keys with _ :: rest -> rest | [] -> []); 1233 t.allow_simple_key <- false; 1234 let start = Input.mark t.input in 1235 ignore (Input.next t.input); 1236 (* Allow adjacent values after flow collection ends *) 1237 if t.flow_level > 0 then 1238 t.adjacent_value_allowed_at <- Some (Input.position t.input); 1239 let span = Span.make ~start ~stop:(Input.mark t.input) in 1240 emit t span token_type 1241 1242and fetch_flow_entry t = 1243 remove_simple_key t; 1244 t.allow_simple_key <- true; 1245 let start = Input.mark t.input in 1246 ignore (Input.next t.input); 1247 let span = Span.make ~start ~stop:(Input.mark t.input) in 1248 emit t span Token.Flow_entry 1249 1250and check_block_entry t = 1251 (* - followed by whitespace or EOF *) 1252 match Input.peek_nth t.input 1 with 1253 | None -> true 1254 | Some c -> Input.is_whitespace c 1255 1256and fetch_block_entry t = 1257 if t.flow_level = 0 then begin 1258 (* Block entries require allow_simple_key to be true. 1259 This prevents block sequences on the same line as a mapping value, 1260 e.g., "key: - a" is invalid. *) 1261 if not t.allow_simple_key then 1262 Error.raise_at (Input.mark t.input) Block_sequence_disallowed; 1263 let col = column t in 1264 if roll_indent t col ~sequence:true then begin 1265 let span = Span.point (Input.mark t.input) in 1266 emit t span Token.Block_sequence_start 1267 end 1268 end; 1269 remove_simple_key t; 1270 t.allow_simple_key <- true; 1271 t.document_has_content <- true; 1272 let start = Input.mark t.input in 1273 ignore (Input.next t.input); 1274 1275 (* Check for tabs after - : pattern like -\t- is invalid *) 1276 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in 1277 if found_tabs then begin 1278 (* If we found tabs and next char is - followed by whitespace, error *) 1279 match Input.peek t.input with 1280 | Some '-' -> 1281 (match Input.peek_nth t.input 1 with 1282 | None -> Error.raise_at start Tab_in_indentation 1283 | Some c when Input.is_whitespace c -> 1284 Error.raise_at start Tab_in_indentation 1285 | Some _ -> ()) 1286 | _ -> () 1287 end; 1288 1289 let span = Span.make ~start ~stop:(Input.mark t.input) in 1290 emit t span Token.Block_entry 1291 1292and check_key t = 1293 (* ? followed by whitespace or flow indicator in both block and flow *) 1294 match Input.peek_nth t.input 1 with 1295 | None -> true 1296 | Some c -> 1297 Input.is_whitespace c || 1298 (t.flow_level > 0 && Input.is_flow_indicator c) 1299 1300and fetch_key t = 1301 if t.flow_level = 0 then begin 1302 if not t.allow_simple_key then 1303 Error.raise_at (Input.mark t.input) Expected_key; 1304 let col = column t in 1305 if roll_indent t col ~sequence:false then begin 1306 let span = Span.point (Input.mark t.input) in 1307 emit t span Token.Block_mapping_start 1308 end 1309 end; 1310 remove_simple_key t; 1311 t.allow_simple_key <- t.flow_level = 0; 1312 t.document_has_content <- true; 1313 let start = Input.mark t.input in 1314 ignore (Input.next t.input); 1315 1316 (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *) 1317 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in 1318 if found_tabs && t.flow_level = 0 then begin 1319 (* In block context, tabs after ? are not allowed *) 1320 Error.raise_at start Tab_in_indentation 1321 end; 1322 1323 let span = Span.make ~start ~stop:(Input.mark t.input) in 1324 emit t span Token.Key; 1325 t.pending_value <- true (* We've emitted a KEY, now waiting for VALUE *) 1326 1327and check_value t = 1328 (* : followed by whitespace in block, or whitespace/flow indicator in flow, or adjacent value *) 1329 match Input.peek_nth t.input 1 with 1330 | None -> true 1331 | Some c -> 1332 Input.is_whitespace c || 1333 (t.flow_level > 0 && Input.is_flow_indicator c) || 1334 (* Allow adjacent values in flow context at designated positions *) 1335 (t.flow_level > 0 && 1336 match t.adjacent_value_allowed_at with 1337 | Some pos -> pos.Position.line = (Input.position t.input).Position.line && 1338 pos.Position.column = (Input.position t.input).Position.column 1339 | None -> false) 1340 1341and fetch_value t = 1342 let start = Input.mark t.input in 1343 (* Check for simple key *) 1344 let used_simple_key = 1345 match t.simple_keys with 1346 | Some sk :: _ when sk.sk_possible -> 1347 (* In implicit flow mapping (inside a flow sequence), key and : must be on the same line. 1348 In explicit flow mapping { }, key and : can span lines. *) 1349 let is_implicit_flow_mapping = match t.flow_mapping_stack with 1350 | false :: _ -> true (* false = we're in a sequence, so any mapping is implicit *) 1351 | _ -> false 1352 in 1353 if is_implicit_flow_mapping && sk.sk_position.line < (Input.position t.input).line then 1354 Error.raise_at start Illegal_flow_key_line; 1355 (* Insert KEY token before the simple key value *) 1356 let key_span = Span.point sk.sk_position in 1357 let key_token = { Token.token = Token.Key; span = key_span } in 1358 (* We need to insert at the right position *) 1359 let tokens = Queue.to_seq t.tokens |> Array.of_seq in 1360 Queue.clear t.tokens; 1361 let insert_pos = sk.sk_token_number - t.tokens_taken in 1362 Array.iteri (fun i tok -> 1363 if i = insert_pos then Queue.add key_token t.tokens; 1364 Queue.add tok t.tokens 1365 ) tokens; 1366 if insert_pos >= Array.length tokens then 1367 Queue.add key_token t.tokens; 1368 t.token_number <- t.token_number + 1; 1369 t.pending_value <- true; (* We've inserted a KEY token, now waiting for VALUE *) 1370 (* Roll indent for implicit block mapping *) 1371 if t.flow_level = 0 then begin 1372 let col = sk.sk_position.column in 1373 if roll_indent t col ~sequence:false then begin 1374 let span = Span.point sk.sk_position in 1375 (* Insert block mapping start before key *) 1376 let bm_token = { Token.token = Token.Block_mapping_start; span } in 1377 let tokens = Queue.to_seq t.tokens |> Array.of_seq in 1378 Queue.clear t.tokens; 1379 Array.iteri (fun i tok -> 1380 if i = insert_pos then Queue.add bm_token t.tokens; 1381 Queue.add tok t.tokens 1382 ) tokens; 1383 if insert_pos >= Array.length tokens then 1384 Queue.add bm_token t.tokens; 1385 t.token_number <- t.token_number + 1 1386 end 1387 end; 1388 t.simple_keys <- None :: (List.tl t.simple_keys); 1389 true 1390 | _ -> 1391 (* No simple key - this is a complex value (or empty key) *) 1392 if t.flow_level = 0 then begin 1393 if not t.allow_simple_key then 1394 Error.raise_at (Input.mark t.input) Expected_key; 1395 let col = column t in 1396 if roll_indent t col ~sequence:false then begin 1397 let span = Span.point (Input.mark t.input) in 1398 emit t span Token.Block_mapping_start 1399 end 1400 (* Note: We don't emit KEY here. Empty key handling is done by the parser, 1401 which emits empty scalar when it sees VALUE without preceding KEY. *) 1402 end; 1403 false 1404 in 1405 remove_simple_key t; 1406 (* In block context without simple key, allow simple keys for compact mappings like ": moon: white" 1407 In flow context or after using a simple key, disallow simple keys *) 1408 t.allow_simple_key <- (not used_simple_key) && (t.flow_level = 0); 1409 t.document_has_content <- true; 1410 let start = Input.mark t.input in 1411 ignore (Input.next t.input); 1412 1413 (* Check for tabs after : : patterns like :\t- or :\tkey: are invalid in block context (Y79Y/09) 1414 However, :\t bar (tab followed by space then content) is valid (6BCT) *) 1415 let (found_tabs, found_spaces) = skip_blanks_check_tabs t in 1416 if found_tabs && not found_spaces && t.flow_level = 0 then begin 1417 (* In block context, tabs-only after : followed by indicator or alphanumeric are not allowed *) 1418 match Input.peek t.input with 1419 | Some ('-' | '?') -> 1420 Error.raise_at start Tab_in_indentation 1421 | Some c when (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') -> 1422 (* Tab-only followed by alphanumeric - likely a key, which is invalid *) 1423 Error.raise_at start Tab_in_indentation 1424 | _ -> () 1425 end; 1426 1427 (* Skip any comment that may follow the colon and whitespace *) 1428 skip_whitespace_and_comment t; 1429 1430 let span = Span.make ~start ~stop:(Input.mark t.input) in 1431 emit t span Token.Value; 1432 t.pending_value <- false (* We've emitted a VALUE, no longer pending *) 1433 1434and fetch_alias t = 1435 save_simple_key t; 1436 t.allow_simple_key <- false; 1437 t.document_has_content <- true; 1438 let start = Input.mark t.input in 1439 ignore (Input.next t.input); (* consume * *) 1440 let name, span = scan_anchor_alias t in 1441 let span = Span.make ~start ~stop:span.stop in 1442 emit t span (Token.Alias name) 1443 1444and fetch_anchor t = 1445 save_simple_key t; 1446 t.allow_simple_key <- false; 1447 t.document_has_content <- true; 1448 let start = Input.mark t.input in 1449 ignore (Input.next t.input); (* consume & *) 1450 let name, span = scan_anchor_alias t in 1451 let span = Span.make ~start ~stop:span.stop in 1452 emit t span (Token.Anchor name) 1453 1454and fetch_tag t = 1455 save_simple_key t; 1456 t.allow_simple_key <- false; 1457 t.document_has_content <- true; 1458 let handle, suffix, span = scan_tag t in 1459 emit t span (Token.Tag { handle; suffix }) 1460 1461and fetch_block_scalar t literal = 1462 remove_simple_key t; 1463 t.allow_simple_key <- true; 1464 t.document_has_content <- true; 1465 let value, style, span = scan_block_scalar t literal in 1466 emit t span (Token.Scalar { style; value }) 1467 1468and fetch_single_quoted t = 1469 save_simple_key t; 1470 t.allow_simple_key <- false; 1471 t.document_has_content <- true; 1472 let value, span = scan_single_quoted t in 1473 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *) 1474 skip_to_next_token t; 1475 if t.flow_level > 0 then 1476 t.adjacent_value_allowed_at <- Some (Input.position t.input); 1477 emit t span (Token.Scalar { style = `Single_quoted; value }) 1478 1479and fetch_double_quoted t = 1480 save_simple_key t; 1481 t.allow_simple_key <- false; 1482 t.document_has_content <- true; 1483 let value, span = scan_double_quoted t in 1484 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *) 1485 skip_to_next_token t; 1486 if t.flow_level > 0 then 1487 t.adjacent_value_allowed_at <- Some (Input.position t.input); 1488 emit t span (Token.Scalar { style = `Double_quoted; value }) 1489 1490and can_start_plain t = 1491 (* Check if - ? : can start a plain scalar *) 1492 match Input.peek_nth t.input 1 with 1493 | None -> false 1494 | Some c -> 1495 not (Input.is_whitespace c) && 1496 (t.flow_level = 0 || not (Input.is_flow_indicator c)) 1497 1498and can_start_plain_char c _t = 1499 (* Characters that can start a plain scalar *) 1500 if Input.is_whitespace c then false 1501 else if Input.is_indicator c then false 1502 else true 1503 1504and fetch_plain_scalar t = 1505 save_simple_key t; 1506 t.allow_simple_key <- false; 1507 t.document_has_content <- true; 1508 let value, span, ended_with_linebreak = scan_plain_scalar t in 1509 (* If the plain scalar ended after crossing a line break (leading_blanks = true), 1510 allow simple keys. This is important because the scanner already consumed the 1511 line break and leading whitespace when checking for continuation. *) 1512 if ended_with_linebreak then 1513 t.allow_simple_key <- true; 1514 emit t span (Token.Scalar { style = `Plain; value }) 1515 1516(** Check if we need more tokens to resolve simple keys *) 1517let need_more_tokens t = 1518 if t.stream_ended then false 1519 else if Queue.is_empty t.tokens then true 1520 else 1521 (* Check if any simple key could affect the first queued token *) 1522 List.exists (function 1523 | Some sk when sk.sk_possible -> 1524 sk.sk_token_number >= t.tokens_taken 1525 | _ -> false 1526 ) t.simple_keys 1527 1528(** Ensure we have enough tokens to return one safely *) 1529let ensure_tokens t = 1530 if not t.stream_started then begin 1531 t.stream_started <- true; 1532 let span = Span.point (Input.position t.input) in 1533 let encoding, _ = Encoding.detect (Input.source t.input) in 1534 emit t span (Token.Stream_start encoding) 1535 end; 1536 while need_more_tokens t do 1537 fetch_next_token t 1538 done 1539 1540(** Get next token *) 1541let next t = 1542 ensure_tokens t; 1543 if Queue.is_empty t.tokens then 1544 None 1545 else begin 1546 t.tokens_taken <- t.tokens_taken + 1; 1547 Some (Queue.pop t.tokens) 1548 end 1549 1550(** Peek at next token *) 1551let peek t = 1552 ensure_tokens t; 1553 Queue.peek_opt t.tokens 1554 1555(** Iterate over all tokens *) 1556let iter f t = 1557 let rec loop () = 1558 match next t with 1559 | None -> () 1560 | Some tok -> f tok; loop () 1561 in 1562 loop () 1563 1564(** Fold over all tokens *) 1565let fold f init t = 1566 let rec loop acc = 1567 match next t with 1568 | None -> acc 1569 | Some tok -> loop (f acc tok) 1570 in 1571 loop init 1572 1573(** Convert to list *) 1574let to_list t = 1575 fold (fun acc tok -> tok :: acc) [] t |> List.rev