Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(** YAML tokenizer/scanner with lookahead for ambiguity resolution *) 7 8(** Simple key tracking for mapping key disambiguation *) 9type simple_key = { 10 sk_possible : bool; 11 sk_required : bool; 12 sk_token_number : int; 13 sk_position : Position.t; 14} 15 16(** Indent level tracking *) 17type indent = { 18 indent : int; 19 needs_block_end : bool; 20} 21 22type t = { 23 input : Input.t; 24 tokens : Token.spanned Queue.t; 25 mutable token_number : int; 26 mutable tokens_taken : int; 27 mutable stream_started : bool; 28 mutable stream_ended : bool; 29 mutable indent_stack : indent list; (** Stack of indentation levels *) 30 mutable flow_level : int; (** Nesting depth in \[\] or \{\} *) 31 mutable flow_indent : int; (** Column where outermost flow collection started *) 32 mutable simple_keys : simple_key option list; (** Per flow-level simple key tracking *) 33 mutable allow_simple_key : bool; 34 mutable leading_whitespace : bool; (** True when at start of line (only whitespace seen) *) 35 mutable document_has_content : bool; (** True if we've emitted content tokens in current document *) 36 mutable adjacent_value_allowed_at : Position.t option; (** Position where adjacent : is allowed *) 37 mutable flow_mapping_stack : bool list; (** Stack of whether each flow level is a mapping *) 38} 39 40let create input = 41 { 42 input; 43 tokens = Queue.create (); 44 token_number = 0; 45 tokens_taken = 0; 46 stream_started = false; 47 stream_ended = false; 48 indent_stack = []; 49 flow_level = 0; 50 flow_indent = 0; 51 simple_keys = [None]; (* One entry for the base level *) 52 allow_simple_key = true; 53 leading_whitespace = true; (* Start at beginning of stream *) 54 document_has_content = false; 55 adjacent_value_allowed_at = None; 56 flow_mapping_stack = []; 57 } 58 59let of_string s = create (Input.of_string s) 60let of_input = create 61let of_reader r = create (Input.of_reader r) 62 63let position t = Input.position t.input 64 65(** Add a token to the queue *) 66let emit t span token = 67 Queue.add { Token.token; span } t.tokens; 68 t.token_number <- t.token_number + 1 69 70(** Get current column (1-indexed) *) 71let column t = (Input.position t.input).column 72 73(** Get current indent level *) 74let current_indent t = 75 match t.indent_stack with 76 | [] -> -1 77 | { indent; _ } :: _ -> indent 78 79(** Skip whitespace to end of line, checking for valid comments. 80 Returns true if any whitespace (including tabs) was found before a comment. *) 81let skip_whitespace_and_comment t = 82 let has_whitespace = ref false in 83 (* Skip blanks (spaces and tabs) *) 84 while Input.next_is_blank t.input do 85 has_whitespace := true; 86 ignore (Input.next t.input) 87 done; 88 (* Check for comment *) 89 if Input.next_is (( = ) '#') t.input then begin 90 (* Validate: comment must be preceded by whitespace or be at start of line *) 91 if not !has_whitespace then begin 92 (* Check if we're at the start of input or after whitespace (blank or line break) *) 93 match Input.peek_back t.input with 94 | None -> () (* Start of input - OK *) 95 | Some c when Input.is_whitespace c -> () (* After whitespace - OK *) 96 | _ -> 97 (* Comment not preceded by whitespace - ERROR *) 98 Error.raise_at (Input.mark t.input) Invalid_comment 99 end; 100 (* Skip to end of line *) 101 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do 102 ignore (Input.next t.input) 103 done 104 end 105 106(** Skip blanks (spaces/tabs) and return (found_tabs, found_spaces) *) 107let skip_blanks_check_tabs t = 108 let found_tab = ref false in 109 let found_space = ref false in 110 while Input.next_is_blank t.input do 111 (match Input.peek t.input with 112 | Some '\t' -> found_tab := true 113 | Some ' ' -> found_space := true 114 | _ -> ()); 115 ignore (Input.next t.input) 116 done; 117 (!found_tab, !found_space) 118 119(** Skip whitespace and comments, return true if at newline *) 120let rec skip_to_next_token t = 121 (* Check for tabs used as indentation in block context *) 122 (match Input.peek t.input with 123 | Some '\t' when t.flow_level = 0 && t.leading_whitespace && 124 (column t - 1) < current_indent t -> 125 (* Tab found in indentation zone - this is invalid *) 126 (* Skip to end of line to check if line has content *) 127 let start_pos = Input.mark t.input in 128 while Input.next_is_blank t.input do 129 ignore (Input.next t.input) 130 done; 131 (* If we have content on this line with a tab, raise error *) 132 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then 133 Error.raise_at start_pos Tab_in_indentation 134 | _ -> ()); 135 136 (* Skip blanks and validate comments *) 137 skip_whitespace_and_comment t; 138 (* Skip line break in block context *) 139 if t.flow_level = 0 && Input.next_is_break t.input then begin 140 Input.consume_break t.input; 141 t.allow_simple_key <- true; 142 t.leading_whitespace <- true; 143 skip_to_next_token t 144 end 145 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin 146 (* In flow context, skip all whitespace including line breaks *) 147 if Input.next_is_break t.input then begin 148 Input.consume_break t.input; 149 (* Allow simple keys after line breaks in flow context *) 150 t.allow_simple_key <- true; 151 (* After line break in flow, check for tabs at start of line (Y79Y/03) 152 Tabs are not allowed as indentation - if tab is first char and results 153 in a column less than flow_indent, it's an error *) 154 if Input.next_is (( = ) '\t') t.input then begin 155 (* Tab at start of line in flow context - skip tabs and check position *) 156 let start_mark = Input.mark t.input in 157 while Input.next_is (( = ) '\t') t.input do 158 ignore (Input.next t.input) 159 done; 160 (* If only tabs were used (no spaces) and column < flow_indent, error *) 161 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) && 162 column t < t.flow_indent then 163 Error.raise_at start_mark Invalid_flow_indentation 164 end; 165 skip_to_next_token t 166 end else begin 167 ignore (Input.next t.input); 168 skip_to_next_token t 169 end 170 end 171 172(** Roll the indentation level *) 173let roll_indent t col = 174 if t.flow_level = 0 && col > current_indent t then begin 175 t.indent_stack <- { indent = col; needs_block_end = true } :: t.indent_stack; 176 true 177 end else 178 false 179 180(** Unroll indentation to given column *) 181let unroll_indent t col = 182 while t.flow_level = 0 && 183 match t.indent_stack with 184 | { indent; needs_block_end = true; _ } :: _ when indent > col -> true 185 | _ -> false 186 do 187 match t.indent_stack with 188 | { indent = _; needs_block_end = true; _ } :: rest -> 189 let pos = Input.position t.input in 190 let span = Span.point pos in 191 emit t span Token.Block_end; 192 t.indent_stack <- rest 193 | _ -> () 194 done 195 196(** Save a potential simple key *) 197let save_simple_key t = 198 if t.allow_simple_key then begin 199 (* A simple key is required only if we're in a block context, 200 at the current indentation level, AND the current indent needs a block end. 201 This matches saphyr's logic and prevents false positives for values. *) 202 let required = t.flow_level = 0 && 203 match t.indent_stack with 204 | { indent; needs_block_end = true; _ } :: _ -> 205 indent = column t 206 | _ -> false 207 in 208 let sk = { 209 sk_possible = true; 210 sk_required = required; 211 sk_token_number = t.token_number; 212 sk_position = Input.position t.input; 213 } in 214 (* Remove any existing simple key at current level *) 215 t.simple_keys <- ( 216 match t.simple_keys with 217 | _ :: rest -> Some sk :: rest 218 | [] -> [Some sk] 219 ) 220 end 221 222(** Remove simple key at current level *) 223let remove_simple_key t = 224 match t.simple_keys with 225 | Some sk :: _rest when sk.sk_required -> 226 Error.raise_at sk.sk_position Expected_key 227 | _ :: rest -> t.simple_keys <- None :: rest 228 | [] -> () 229 230(** Stale simple keys that span too many tokens *) 231let stale_simple_keys t = 232 t.simple_keys <- List.map (fun sk_opt -> 233 match sk_opt with 234 | Some sk when sk.sk_possible && 235 (Input.position t.input).line > sk.sk_position.line && 236 t.flow_level = 0 -> 237 if sk.sk_required then 238 Error.raise_at sk.sk_position Expected_key; 239 None 240 | _ -> sk_opt 241 ) t.simple_keys 242 243(** Read anchor or alias name *) 244let scan_anchor_alias t = 245 let start = Input.mark t.input in 246 let buf = Buffer.create 16 in 247 (* Per YAML 1.2 spec: anchor names can contain any character that is NOT: 248 - Whitespace (space, tab, line breaks) 249 - Flow indicators: []{} 250 - Comma (,) 251 This matches the saphyr implementation: is_yaml_non_space && !is_flow *) 252 while 253 match Input.peek t.input with 254 | Some c when not (Input.is_whitespace c) && 255 not (Input.is_flow_indicator c) && 256 c <> '\x00' -> 257 Buffer.add_char buf c; 258 ignore (Input.next t.input); 259 true 260 | _ -> false 261 do () done; 262 let name = Buffer.contents buf in 263 if String.length name = 0 then 264 Error.raise_at start (Invalid_anchor "empty anchor name"); 265 (name, Span.make ~start ~stop:(Input.mark t.input)) 266 267(** Scan tag handle *) 268let scan_tag_handle t = 269 let start = Input.mark t.input in 270 let buf = Buffer.create 16 in 271 (* Expect ! *) 272 (match Input.peek t.input with 273 | Some '!' -> 274 Buffer.add_char buf '!'; 275 ignore (Input.next t.input) 276 | _ -> Error.raise_at start (Invalid_tag "expected '!'")); 277 (* Read word chars *) 278 while 279 match Input.peek t.input with 280 | Some c when Input.is_alnum c || c = '-' -> 281 Buffer.add_char buf c; 282 ignore (Input.next t.input); 283 true 284 | _ -> false 285 do () done; 286 (* Check for secondary ! *) 287 (match Input.peek t.input with 288 | Some '!' -> 289 Buffer.add_char buf '!'; 290 ignore (Input.next t.input) 291 | _ -> ()); 292 Buffer.contents buf 293 294(** Scan tag suffix (after handle) *) 295let scan_tag_suffix t = 296 let is_hex_digit c = 297 (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f') 298 in 299 let hex_val c = 300 match c with 301 | '0'..'9' -> Char.code c - Char.code '0' 302 | 'A'..'F' -> Char.code c - Char.code 'A' + 10 303 | 'a'..'f' -> Char.code c - Char.code 'a' + 10 304 | _ -> 0 305 in 306 let buf = Buffer.create 32 in 307 while 308 match Input.peek t.input with 309 | Some '%' -> 310 (* Percent-encoded character *) 311 ignore (Input.next t.input); 312 (match Input.peek t.input, Input.peek_nth t.input 1 with 313 | Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 -> 314 ignore (Input.next t.input); 315 ignore (Input.next t.input); 316 let code = (hex_val c1) * 16 + (hex_val c2) in 317 Buffer.add_char buf (Char.chr code); 318 true 319 | _ -> 320 (* Invalid percent encoding - keep the % *) 321 Buffer.add_char buf '%'; 322 true) 323 | Some c when not (Input.is_whitespace c) && 324 not (Input.is_flow_indicator c) -> 325 Buffer.add_char buf c; 326 ignore (Input.next t.input); 327 true 328 | _ -> false 329 do () done; 330 Buffer.contents buf 331 332(** Scan a tag *) 333let scan_tag t = 334 let start = Input.mark t.input in 335 ignore (Input.next t.input); (* consume ! *) 336 let handle, suffix = 337 match Input.peek t.input with 338 | Some '<' -> 339 (* Verbatim tag: !<...> - handle is empty, suffix is full URI *) 340 ignore (Input.next t.input); 341 let buf = Buffer.create 32 in 342 while 343 match Input.peek t.input with 344 | Some '>' -> false 345 | Some c -> 346 Buffer.add_char buf c; 347 ignore (Input.next t.input); 348 true 349 | None -> Error.raise_at (Input.mark t.input) (Invalid_tag "unclosed verbatim tag") 350 do () done; 351 ignore (Input.next t.input); (* consume > *) 352 ("", Buffer.contents buf) 353 | Some c when Input.is_whitespace c || Input.is_flow_indicator c -> 354 (* Non-specific tag: ! *) 355 ("!", "") 356 | Some '!' -> 357 (* Secondary handle: !! *) 358 ignore (Input.next t.input); (* consume second ! *) 359 let suffix = scan_tag_suffix t in 360 ("!!", suffix) 361 | _ -> 362 (* Primary handle or just suffix: !foo or !e!foo *) 363 (* Read alphanumeric characters *) 364 let buf = Buffer.create 16 in 365 while 366 match Input.peek t.input with 367 | Some c when Input.is_alnum c || c = '-' -> 368 Buffer.add_char buf c; 369 ignore (Input.next t.input); 370 true 371 | _ -> false 372 do () done; 373 (* Check if next character is ! - if so, this is a named handle *) 374 (match Input.peek t.input with 375 | Some '!' -> 376 (* Named handle like !e! *) 377 ignore (Input.next t.input); 378 let handle_name = Buffer.contents buf in 379 let suffix = scan_tag_suffix t in 380 ("!" ^ handle_name ^ "!", suffix) 381 | _ -> 382 (* Just ! followed by suffix *) 383 ("!", Buffer.contents buf ^ scan_tag_suffix t)) 384 in 385 (* Validate that tag is followed by whitespace, break, or (in flow) flow indicator *) 386 (match Input.peek t.input with 387 | None -> () (* EOF is ok *) 388 | Some c when Input.is_whitespace c || Input.is_break c -> () 389 | Some c when t.flow_level > 0 && Input.is_flow_indicator c -> () 390 | _ -> Error.raise_at start (Invalid_tag "expected whitespace or line break after tag")); 391 let span = Span.make ~start ~stop:(Input.mark t.input) in 392 (handle, suffix, span) 393 394(** Scan single-quoted scalar *) 395let scan_single_quoted t = 396 let start = Input.mark t.input in 397 ignore (Input.next t.input); (* consume opening single-quote *) 398 let buf = Buffer.create 64 in 399 let whitespace = Buffer.create 16 in (* Track trailing whitespace *) 400 401 let flush_whitespace () = 402 if Buffer.length whitespace > 0 then begin 403 Buffer.add_buffer buf whitespace; 404 Buffer.clear whitespace 405 end 406 in 407 408 let rec loop () = 409 match Input.peek t.input with 410 | None -> Error.raise_at start Unclosed_single_quote 411 | Some '\'' -> 412 ignore (Input.next t.input); 413 (* Check for escaped quote ('') *) 414 (match Input.peek t.input with 415 | Some '\'' -> 416 flush_whitespace (); 417 Buffer.add_char buf '\''; 418 ignore (Input.next t.input); 419 loop () 420 | _ -> 421 (* End of string - flush any trailing whitespace *) 422 flush_whitespace ()) 423 | Some ' ' | Some '\t' -> 424 (* Track whitespace - don't add to buf yet *) 425 Buffer.add_char whitespace (Option.get (Input.peek t.input)); 426 ignore (Input.next t.input); 427 loop () 428 | Some '\n' | Some '\r' -> 429 (* Discard trailing whitespace before line break *) 430 Buffer.clear whitespace; 431 Input.consume_break t.input; 432 (* Skip leading whitespace on next line *) 433 while Input.next_is_blank t.input do 434 ignore (Input.next t.input) 435 done; 436 (* Check for document boundary *) 437 if Input.at_document_boundary t.input then 438 Error.raise_at start Unclosed_single_quote; 439 (* Check indentation: continuation must be > block indent (QB6E, DK95) *) 440 let col = column t in 441 let indent = current_indent t in 442 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then 443 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar"); 444 (* Count empty lines (consecutive line breaks) *) 445 let empty_lines = ref 0 in 446 while Input.next_is_break t.input do 447 incr empty_lines; 448 Input.consume_break t.input; 449 while Input.next_is_blank t.input do 450 ignore (Input.next t.input) 451 done; 452 if Input.at_document_boundary t.input then 453 Error.raise_at start Unclosed_single_quote; 454 (* Check indentation after each empty line too *) 455 let col = column t in 456 let indent = current_indent t in 457 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then 458 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar") 459 done; 460 (* Apply folding rules *) 461 if !empty_lines > 0 then begin 462 (* Empty lines: preserve as newlines *) 463 for _ = 1 to !empty_lines do 464 Buffer.add_char buf '\n' 465 done 466 end else 467 (* Single break: fold to space (even at start of string) *) 468 Buffer.add_char buf ' '; 469 loop () 470 | Some c -> 471 flush_whitespace (); 472 Buffer.add_char buf c; 473 ignore (Input.next t.input); 474 loop () 475 in 476 loop (); 477 let span = Span.make ~start ~stop:(Input.mark t.input) in 478 (Buffer.contents buf, span) 479 480(** Decode hex escape of given length *) 481let decode_hex t len = 482 let start = Input.mark t.input in 483 let buf = Buffer.create len in 484 for _ = 1 to len do 485 match Input.peek t.input with 486 | Some c when Input.is_hex c -> 487 Buffer.add_char buf c; 488 ignore (Input.next t.input) 489 | _ -> 490 Error.raise_at start (Invalid_hex_escape (Buffer.contents buf)) 491 done; 492 let code = int_of_string ("0x" ^ Buffer.contents buf) in 493 if code <= 0x7F then 494 String.make 1 (Char.chr code) 495 else if code <= 0x7FF then 496 let b1 = 0xC0 lor (code lsr 6) in 497 let b2 = 0x80 lor (code land 0x3F) in 498 String.init 2 (fun i -> Char.chr (if i = 0 then b1 else b2)) 499 else if code <= 0xFFFF then 500 let b1 = 0xE0 lor (code lsr 12) in 501 let b2 = 0x80 lor ((code lsr 6) land 0x3F) in 502 let b3 = 0x80 lor (code land 0x3F) in 503 String.init 3 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | _ -> b3)) 504 else 505 let b1 = 0xF0 lor (code lsr 18) in 506 let b2 = 0x80 lor ((code lsr 12) land 0x3F) in 507 let b3 = 0x80 lor ((code lsr 6) land 0x3F) in 508 let b4 = 0x80 lor (code land 0x3F) in 509 String.init 4 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | 2 -> b3 | _ -> b4)) 510 511(** Scan double-quoted scalar *) 512let scan_double_quoted t = 513 let start = Input.mark t.input in 514 ignore (Input.next t.input); (* consume opening double-quote *) 515 let buf = Buffer.create 64 in 516 let whitespace = Buffer.create 16 in (* Track pending whitespace *) 517 518 let flush_whitespace () = 519 if Buffer.length whitespace > 0 then begin 520 Buffer.add_buffer buf whitespace; 521 Buffer.clear whitespace 522 end 523 in 524 525 let rec loop () = 526 match Input.peek t.input with 527 | None -> Error.raise_at start Unclosed_double_quote 528 | Some '"' -> 529 (* Flush trailing whitespace before closing quote to preserve it *) 530 flush_whitespace (); 531 ignore (Input.next t.input) 532 | Some ' ' | Some '\t' as c_opt -> 533 (* Track whitespace - don't add to buf yet *) 534 let c = match c_opt with Some c -> c | None -> assert false in 535 Buffer.add_char whitespace c; 536 ignore (Input.next t.input); 537 loop () 538 | Some '\\' -> 539 (* Escape sequence - this is non-whitespace content *) 540 flush_whitespace (); (* Commit any pending whitespace *) 541 ignore (Input.next t.input); 542 (match Input.peek t.input with 543 | None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>") 544 | Some '0' -> Buffer.add_char buf '\x00'; ignore (Input.next t.input) 545 | Some 'a' -> Buffer.add_char buf '\x07'; ignore (Input.next t.input) 546 | Some 'b' -> Buffer.add_char buf '\x08'; ignore (Input.next t.input) 547 | Some 't' | Some '\t' -> Buffer.add_char buf '\t'; ignore (Input.next t.input) 548 | Some 'n' -> Buffer.add_char buf '\n'; ignore (Input.next t.input) 549 | Some 'v' -> Buffer.add_char buf '\x0B'; ignore (Input.next t.input) 550 | Some 'f' -> Buffer.add_char buf '\x0C'; ignore (Input.next t.input) 551 | Some 'r' -> Buffer.add_char buf '\r'; ignore (Input.next t.input) 552 | Some 'e' -> Buffer.add_char buf '\x1B'; ignore (Input.next t.input) 553 | Some ' ' -> Buffer.add_char buf ' '; ignore (Input.next t.input) 554 | Some '"' -> Buffer.add_char buf '"'; ignore (Input.next t.input) 555 | Some '/' -> Buffer.add_char buf '/'; ignore (Input.next t.input) 556 | Some '\\' -> Buffer.add_char buf '\\'; ignore (Input.next t.input) 557 | Some 'N' -> Buffer.add_string buf "\xC2\x85"; ignore (Input.next t.input) (* NEL *) 558 | Some '_' -> Buffer.add_string buf "\xC2\xA0"; ignore (Input.next t.input) (* NBSP *) 559 | Some 'L' -> Buffer.add_string buf "\xE2\x80\xA8"; ignore (Input.next t.input) (* LS *) 560 | Some 'P' -> Buffer.add_string buf "\xE2\x80\xA9"; ignore (Input.next t.input) (* PS *) 561 | Some 'x' -> 562 ignore (Input.next t.input); 563 Buffer.add_string buf (decode_hex t 2) 564 | Some 'u' -> 565 ignore (Input.next t.input); 566 Buffer.add_string buf (decode_hex t 4) 567 | Some 'U' -> 568 ignore (Input.next t.input); 569 Buffer.add_string buf (decode_hex t 8) 570 | Some '\n' | Some '\r' -> 571 (* Line continuation escape *) 572 Input.consume_break t.input; 573 while Input.next_is_blank t.input do 574 ignore (Input.next t.input) 575 done 576 | Some c -> 577 Error.raise_at (Input.mark t.input) 578 (Invalid_escape_sequence (Printf.sprintf "\\%c" c))); 579 loop () 580 | Some '\n' | Some '\r' -> 581 (* Line break: discard any pending trailing whitespace *) 582 Buffer.clear whitespace; 583 Input.consume_break t.input; 584 (* Count consecutive line breaks (empty lines) *) 585 let empty_lines = ref 0 in 586 let continue = ref true in 587 let started_with_tab = ref false in 588 while !continue do 589 (* Track if we start with a tab (for DK95/01 check) *) 590 if Input.next_is (( = ) '\t') t.input then started_with_tab := true; 591 (* Skip blanks (spaces/tabs) on the line *) 592 while Input.next_is_blank t.input do 593 ignore (Input.next t.input) 594 done; 595 (* Check if we hit another line break (empty line) *) 596 if Input.next_is_break t.input then begin 597 Input.consume_break t.input; 598 incr empty_lines; 599 started_with_tab := false (* Reset for next line *) 600 end else 601 continue := false 602 done; 603 (* Check for document boundary - this terminates the quoted string *) 604 if Input.at_document_boundary t.input then 605 Error.raise_at start Unclosed_double_quote; 606 (* Check indentation: continuation must be > block indent (QB6E, DK95) 607 Note: must be strictly greater than block indent, not just equal *) 608 let col = column t in 609 let indent = current_indent t in 610 let start_col = start.column in 611 (* DK95/01: if continuation started with tabs and column < start column, error *) 612 if not (Input.is_eof t.input) && !started_with_tab && col < start_col then 613 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar"); 614 if not (Input.is_eof t.input) && col <= indent && indent >= 0 then 615 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar"); 616 (* Per YAML spec: single break = space, break + empty lines = newlines *) 617 if !empty_lines > 0 then begin 618 (* Empty lines: output N newlines where N = number of empty lines *) 619 for _ = 1 to !empty_lines do 620 Buffer.add_char buf '\n' 621 done 622 end else 623 (* Single break folds to space *) 624 Buffer.add_char buf ' '; 625 loop () 626 | Some c -> 627 (* Non-whitespace character *) 628 flush_whitespace (); (* Commit any pending whitespace *) 629 Buffer.add_char buf c; 630 ignore (Input.next t.input); 631 loop () 632 in 633 loop (); 634 let span = Span.make ~start ~stop:(Input.mark t.input) in 635 (Buffer.contents buf, span) 636 637(** Check if character can appear in plain scalar at this position *) 638let can_continue_plain t c ~in_flow = 639 match c with 640 | ':' -> 641 (* : is OK if not followed by whitespace or flow indicator *) 642 (match Input.peek_nth t.input 1 with 643 | None -> true 644 | Some c2 when Input.is_whitespace c2 -> false 645 | Some c2 when in_flow && Input.is_flow_indicator c2 -> false 646 | _ -> true) 647 | '#' -> 648 (* # is a comment indicator only if preceded by whitespace *) 649 (* Check the previous character to determine if this is a comment *) 650 (match Input.peek_back t.input with 651 | None -> true (* At start - can't be comment indicator, allow it *) 652 | Some c when Input.is_whitespace c -> false (* Preceded by whitespace - comment *) 653 | Some c when Input.is_break c -> false (* At start of line - comment *) 654 | _ -> true) (* Not preceded by whitespace - part of scalar *) 655 | c when in_flow && Input.is_flow_indicator c -> false 656 | _ when Input.is_break c -> false 657 | _ -> true 658 659(** Scan plain scalar *) 660let scan_plain_scalar t = 661 let start = Input.mark t.input in 662 let in_flow = t.flow_level > 0 in 663 let indent = current_indent t in 664 (* In flow context, scalars must be indented more than the current block indent. 665 This ensures that content at block indent or less ends the flow context. *) 666 if in_flow && (column t - 1) < indent then 667 Error.raise_at start Invalid_flow_indentation; 668 let buf = Buffer.create 64 in 669 let spaces = Buffer.create 16 in 670 let whitespace = Buffer.create 16 in (* Track whitespace within a line *) 671 let leading_blanks = ref false in 672 673 let rec scan_line () = 674 match Input.peek t.input with 675 | None -> () 676 | Some c when Input.is_blank c && can_continue_plain t c ~in_flow -> 677 (* Blank character within a line - save to whitespace buffer *) 678 Buffer.add_char whitespace c; 679 ignore (Input.next t.input); 680 scan_line () 681 | Some c when can_continue_plain t c ~in_flow -> 682 (* Non-blank character - process any pending breaks/whitespace first *) 683 begin 684 if Buffer.length spaces > 0 then begin 685 if !leading_blanks then begin 686 (* Fold line break *) 687 if Buffer.contents spaces = "\n" then 688 Buffer.add_char buf ' ' 689 else begin 690 (* Multiple breaks - preserve all but first *) 691 let s = Buffer.contents spaces in 692 Buffer.add_substring buf s 1 (String.length s - 1) 693 end 694 end else 695 Buffer.add_buffer buf spaces; 696 Buffer.clear spaces 697 end; 698 (* Add any pending whitespace from within the line *) 699 if Buffer.length whitespace > 0 then begin 700 Buffer.add_buffer buf whitespace; 701 Buffer.clear whitespace 702 end; 703 (* Add the character *) 704 Buffer.add_char buf c; 705 ignore (Input.next t.input); 706 leading_blanks := false; 707 scan_line () 708 end 709 | _ -> () 710 in 711 712 let rec scan_lines () = 713 scan_line (); 714 (* Check for line continuation *) 715 if Input.next_is_break t.input then begin 716 (* Discard any trailing whitespace from the current line *) 717 Buffer.clear whitespace; 718 (* Save the line break *) 719 if !leading_blanks then begin 720 (* We already had a break - this is an additional break (empty line) *) 721 Buffer.add_char spaces '\n' 722 end else begin 723 (* First line break *) 724 Buffer.clear spaces; 725 Buffer.add_char spaces '\n'; 726 leading_blanks := true 727 end; 728 Input.consume_break t.input; 729 (* Note: We do NOT set allow_simple_key here during plain scalar scanning. 730 Setting it here would incorrectly allow ':' that appears on a continuation 731 line to become a mapping indicator. The flag will be set properly after 732 the scalar ends and skip_to_next_token processes line breaks. *) 733 (* Skip leading blanks on the next line *) 734 while Input.next_is_blank t.input do 735 ignore (Input.next t.input) 736 done; 737 let col = (Input.position t.input).column in 738 (* Check indentation - stop if we're at or before the containing block's indent *) 739 (* However, allow empty lines (line breaks) to continue even if dedented *) 740 if Input.next_is_break t.input then 741 scan_lines () (* Empty line - continue *) 742 else if not in_flow && col <= indent then 743 () (* Stop - dedented or at parent level in block context *) 744 else if Input.at_document_boundary t.input then 745 () (* Stop - document boundary *) 746 else 747 scan_lines () 748 end 749 in 750 751 scan_lines (); 752 let value = Buffer.contents buf in 753 (* Trim trailing whitespace (spaces and tabs) *) 754 let value = 755 let len = String.length value in 756 let rec find_end i = 757 if i < 0 then 0 758 else match value.[i] with 759 | ' ' | '\t' -> find_end (i - 1) 760 | _ -> i + 1 761 in 762 let end_pos = find_end (len - 1) in 763 String.sub value 0 end_pos 764 in 765 let span = Span.make ~start ~stop:(Input.mark t.input) in 766 (* Return value, span, and whether we ended with leading blanks (crossed a line break) *) 767 (value, span, !leading_blanks) 768 769(** Scan block scalar (literal | or folded >) *) 770let scan_block_scalar t literal = 771 let start = Input.mark t.input in 772 ignore (Input.next t.input); (* consume | or > *) 773 774 (* Parse header: optional indentation indicator and chomping *) 775 let explicit_indent = ref None in 776 let chomping = ref Chomping.Clip in 777 778 (* First character of header *) 779 (match Input.peek t.input with 780 | Some c when Input.is_digit c && c <> '0' -> 781 explicit_indent := Some (Char.code c - Char.code '0'); 782 ignore (Input.next t.input) 783 | Some '-' -> chomping := Chomping.Strip; ignore (Input.next t.input) 784 | Some '+' -> chomping := Chomping.Keep; ignore (Input.next t.input) 785 | _ -> ()); 786 787 (* Second character of header *) 788 (match Input.peek t.input with 789 | Some c when Input.is_digit c && c <> '0' && !explicit_indent = None -> 790 explicit_indent := Some (Char.code c - Char.code '0'); 791 ignore (Input.next t.input) 792 | Some '-' when !chomping = Chomping.Clip -> 793 chomping := Chomping.Strip; ignore (Input.next t.input) 794 | Some '+' when !chomping = Chomping.Clip -> 795 chomping := Chomping.Keep; ignore (Input.next t.input) 796 | _ -> ()); 797 798 (* Skip whitespace and optional comment *) 799 skip_whitespace_and_comment t; 800 801 (* Consume line break *) 802 if Input.next_is_break t.input then 803 Input.consume_break t.input 804 else if not (Input.is_eof t.input) then 805 Error.raise_at (Input.mark t.input) 806 (Invalid_block_scalar_header "expected newline after header"); 807 808 let base_indent = current_indent t in 809 (* base_indent is the indent level from the stack, -1 if empty. 810 It's used directly for comparisons in implicit indent case. *) 811 let content_indent = ref ( 812 match !explicit_indent with 813 | Some n -> 814 (* Explicit indent: base_indent is 1-indexed column, convert to 0-indexed. 815 content_indent = (base_indent - 1) + n, but at least n for document level. *) 816 let base_level = max 0 (base_indent - 1) in 817 base_level + n 818 | None -> 0 (* Will be determined by first non-empty line *) 819 ) in 820 821 let buf = Buffer.create 256 in 822 let trailing_breaks = Buffer.create 16 in 823 let leading_blank = ref false in (* Was the previous line "more indented"? *) 824 let max_empty_line_indent = ref 0 in (* Track max indent of empty lines before first content *) 825 826 (* Skip to content indentation, skipping empty lines. 827 Returns the number of spaces actually skipped (important for detecting dedentation). *) 828 let rec skip_to_content_indent () = 829 if !content_indent > 0 then begin 830 (* Explicit indent - skip up to content_indent spaces *) 831 let spaces_skipped = ref 0 in 832 while !spaces_skipped < !content_indent && Input.next_is (( = ) ' ') t.input do 833 incr spaces_skipped; 834 ignore (Input.next t.input) 835 done; 836 837 (* Check if this line is empty (only spaces/tabs until break/eof) *) 838 if Input.next_is_break t.input then begin 839 (* Empty line - record the break and continue *) 840 Buffer.add_char trailing_breaks '\n'; 841 Input.consume_break t.input; 842 skip_to_content_indent () 843 end else if !spaces_skipped < !content_indent then begin 844 (* Line starts with fewer spaces than content_indent - dedented *) 845 !spaces_skipped 846 end else if Input.next_is_blank t.input then begin 847 (* Line has spaces/tabs beyond content_indent - could be whitespace content or empty line. 848 For literal scalars, whitespace-only lines ARE content (not empty). 849 For folded scalars, whitespace-only lines that are "more indented" are preserved. *) 850 if literal then 851 (* Literal: whitespace beyond content_indent is content, let read_lines handle it *) 852 !content_indent 853 else begin 854 (* Folded: check if rest is only blanks *) 855 let idx = ref 0 in 856 while match Input.peek_nth t.input !idx with 857 | Some c when Input.is_blank c -> incr idx; true 858 | _ -> false 859 do () done; 860 match Input.peek_nth t.input (!idx) with 861 | None | Some '\n' | Some '\r' -> 862 (* Empty/whitespace-only line in folded - skip spaces *) 863 while Input.next_is_blank t.input do 864 ignore (Input.next t.input) 865 done; 866 Buffer.add_char trailing_breaks '\n'; 867 Input.consume_break t.input; 868 skip_to_content_indent () 869 | _ -> 870 (* Has non-whitespace content *) 871 !content_indent 872 end 873 end else 874 !content_indent 875 end else begin 876 (* Implicit indent - skip empty lines without consuming spaces. 877 Note: Only SPACES count as indentation. Tabs are content, not indentation. 878 So we only check for spaces when determining if a line is "empty". *) 879 if Input.next_is_break t.input then begin 880 Buffer.add_char trailing_breaks '\n'; 881 Input.consume_break t.input; 882 skip_to_content_indent () 883 end else if Input.next_is (( = ) ' ') t.input then begin 884 (* Check if line is empty (only spaces before break) *) 885 let idx = ref 0 in 886 while match Input.peek_nth t.input !idx with 887 | Some ' ' -> incr idx; true 888 | _ -> false 889 do () done; 890 match Input.peek_nth t.input (!idx) with 891 | None | Some '\n' | Some '\r' -> 892 (* Line has only spaces - empty line *) 893 (* Track max indent of empty lines for later validation *) 894 if !idx > !max_empty_line_indent then 895 max_empty_line_indent := !idx; 896 while Input.next_is (( = ) ' ') t.input do 897 ignore (Input.next t.input) 898 done; 899 Buffer.add_char trailing_breaks '\n'; 900 Input.consume_break t.input; 901 skip_to_content_indent () 902 | _ -> 903 (* Has content (including tabs which are content, not indentation) *) 904 0 905 end else if Input.next_is (( = ) '\t') t.input then begin 906 (* Tab at start of line in implicit indent mode - this is an error (Y79Y) 907 because tabs cannot be used as indentation in YAML *) 908 Error.raise_at (Input.mark t.input) Tab_in_indentation 909 end else 910 (* Not at break or space - other content character *) 911 0 912 end 913 in 914 915 (* Read content *) 916 let rec read_lines () = 917 let spaces_skipped = skip_to_content_indent () in 918 919 (* Check if we're at content *) 920 if Input.is_eof t.input then () 921 else if Input.at_document_boundary t.input then () 922 else begin 923 (* Count additional leading spaces beyond what was skipped *) 924 let extra_spaces = ref 0 in 925 while Input.next_is (( = ) ' ') t.input do 926 incr extra_spaces; 927 ignore (Input.next t.input) 928 done; 929 930 (* Calculate actual line indentation *) 931 let line_indent = spaces_skipped + !extra_spaces in 932 933 (* Determine content indent from first content line (implicit case) *) 934 let first_line = !content_indent = 0 in 935 (* base_indent is 1-indexed column, convert to 0-indexed for comparison with line_indent. 936 If base_indent = -1 (empty stack), then base_level = -1 means col 0 is valid. *) 937 let base_level = base_indent - 1 in 938 let should_process = 939 if !content_indent = 0 then begin 940 (* For implicit indent, content must be more indented than base_level. *) 941 if line_indent <= base_level then 942 false (* No content - first line not indented enough *) 943 else begin 944 (* Validate: first content line must be indented at least as much as 945 the maximum indent seen on empty lines before it (5LLU, S98Z, W9L4) *) 946 if line_indent < !max_empty_line_indent && line_indent > base_level then 947 Error.raise_at (Input.mark t.input) 948 (Invalid_block_scalar_header "wrongly indented line in block scalar"); 949 content_indent := line_indent; 950 true 951 end 952 end else if line_indent < !content_indent then 953 false (* Dedented - done with content *) 954 else 955 true 956 in 957 958 if should_process then begin 959 (* Check if current line is "more indented" (has extra indent or starts with whitespace). 960 For folded scalars, lines that start with any whitespace (space or tab) after the 961 content indentation are "more indented" and preserve breaks. 962 Note: we check Input.next_is_blank BEFORE reading content to see if content starts with whitespace. *) 963 let trailing_blank = line_indent > !content_indent || Input.next_is_blank t.input in 964 965 (* Add trailing breaks to buffer *) 966 if Buffer.length buf > 0 then begin 967 if Buffer.length trailing_breaks > 0 then begin 968 if literal then 969 Buffer.add_buffer buf trailing_breaks 970 else begin 971 (* Folded scalar: fold only if both previous and current lines are not more-indented *) 972 if not !leading_blank && not trailing_blank then begin 973 let breaks = Buffer.contents trailing_breaks in 974 if String.length breaks = 1 then 975 Buffer.add_char buf ' ' 976 else 977 Buffer.add_substring buf breaks 1 (String.length breaks - 1) 978 end else begin 979 (* Preserve breaks for more-indented lines *) 980 Buffer.add_buffer buf trailing_breaks 981 end 982 end 983 end else if not literal then 984 Buffer.add_char buf ' ' 985 end else 986 Buffer.add_buffer buf trailing_breaks; 987 Buffer.clear trailing_breaks; 988 989 (* Add extra indentation for literal or more-indented folded lines *) 990 (* On the first line (when determining content_indent), we've already consumed all spaces, 991 so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *) 992 if not first_line && (literal || (!extra_spaces > 0 && not literal)) then begin 993 for _ = 1 to !extra_spaces do 994 Buffer.add_char buf ' ' 995 done 996 end; 997 998 (* Read line content *) 999 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do 1000 Buffer.add_char buf (Input.next_exn t.input) 1001 done; 1002 1003 (* Record trailing break *) 1004 if Input.next_is_break t.input then begin 1005 Buffer.add_char trailing_breaks '\n'; 1006 Input.consume_break t.input 1007 end; 1008 1009 (* Update leading_blank for next iteration *) 1010 leading_blank := trailing_blank; 1011 1012 read_lines () 1013 end 1014 end 1015 in 1016 1017 read_lines (); 1018 1019 (* Apply chomping *) 1020 let value = 1021 let content = Buffer.contents buf in 1022 match !chomping with 1023 | Chomping.Strip -> content 1024 | Chomping.Clip -> 1025 if String.length content > 0 then content ^ "\n" else content 1026 | Chomping.Keep -> 1027 content ^ Buffer.contents trailing_breaks 1028 in 1029 1030 let span = Span.make ~start ~stop:(Input.mark t.input) in 1031 let style = if literal then `Literal else `Folded in 1032 (value, style, span) 1033 1034(** Scan directive (after %) *) 1035let scan_directive t = 1036 let start = Input.mark t.input in 1037 ignore (Input.next t.input); (* consume % *) 1038 1039 (* Read directive name *) 1040 let name_buf = Buffer.create 16 in 1041 while 1042 match Input.peek t.input with 1043 | Some c when Input.is_alnum c || c = '-' -> 1044 Buffer.add_char name_buf c; 1045 ignore (Input.next t.input); 1046 true 1047 | _ -> false 1048 do () done; 1049 let name = Buffer.contents name_buf in 1050 1051 (* Skip blanks *) 1052 while Input.next_is_blank t.input do 1053 ignore (Input.next t.input) 1054 done; 1055 1056 match name with 1057 | "YAML" -> 1058 (* Version directive: %YAML 1.2 *) 1059 let major = ref 0 in 1060 let minor = ref 0 in 1061 (* Read major version *) 1062 while Input.next_is_digit t.input do 1063 major := !major * 10 + (Char.code (Input.next_exn t.input) - Char.code '0') 1064 done; 1065 (* Expect . *) 1066 (match Input.peek t.input with 1067 | Some '.' -> ignore (Input.next t.input) 1068 | _ -> Error.raise_at (Input.mark t.input) (Invalid_yaml_version "expected '.'")); 1069 (* Read minor version *) 1070 while Input.next_is_digit t.input do 1071 minor := !minor * 10 + (Char.code (Input.next_exn t.input) - Char.code '0') 1072 done; 1073 (* Validate: only whitespace and comments allowed before line break (MUS6) *) 1074 skip_whitespace_and_comment t; 1075 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then 1076 Error.raise_at (Input.mark t.input) (Invalid_directive "expected comment or line break after version"); 1077 let span = Span.make ~start ~stop:(Input.mark t.input) in 1078 Token.Version_directive { major = !major; minor = !minor }, span 1079 1080 | "TAG" -> 1081 (* Tag directive: %TAG !foo! tag:example.com,2000: *) 1082 let handle = scan_tag_handle t in 1083 (* Skip blanks *) 1084 while Input.next_is_blank t.input do 1085 ignore (Input.next t.input) 1086 done; 1087 (* Read prefix *) 1088 let prefix_buf = Buffer.create 32 in 1089 while 1090 match Input.peek t.input with 1091 | Some c when not (Input.is_whitespace c) -> 1092 Buffer.add_char prefix_buf c; 1093 ignore (Input.next t.input); 1094 true 1095 | _ -> false 1096 do () done; 1097 let prefix = Buffer.contents prefix_buf in 1098 let span = Span.make ~start ~stop:(Input.mark t.input) in 1099 Token.Tag_directive { handle; prefix }, span 1100 1101 | _ -> 1102 (* Reserved/Unknown directive - skip to end of line and ignore *) 1103 (* Per YAML spec, reserved directives should be ignored with a warning *) 1104 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do 1105 ignore (Input.next t.input) 1106 done; 1107 let span = Span.make ~start ~stop:(Input.mark t.input) in 1108 (* Return an empty tag directive token to indicate directive was processed but ignored *) 1109 Token.Tag_directive { handle = ""; prefix = "" }, span 1110 1111(** Fetch the next token(s) into the queue *) 1112let rec fetch_next_token t = 1113 skip_to_next_token t; 1114 stale_simple_keys t; 1115 let col = column t in 1116 (* Unroll indents that are deeper than current column. 1117 Note: we use col, not col-1, to allow entries at the same level. *) 1118 unroll_indent t col; 1119 1120 (* We're about to process actual content, not leading whitespace *) 1121 t.leading_whitespace <- false; 1122 1123 if Input.is_eof t.input then 1124 fetch_stream_end t 1125 else if Input.at_document_boundary t.input then 1126 fetch_document_indicator t 1127 else begin 1128 match Input.peek t.input with 1129 | None -> fetch_stream_end t 1130 | Some '%' when (Input.position t.input).column = 1 -> 1131 fetch_directive t 1132 | Some '[' -> fetch_flow_collection_start t Token.Flow_sequence_start 1133 | Some '{' -> fetch_flow_collection_start t Token.Flow_mapping_start 1134 | Some ']' -> fetch_flow_collection_end t Token.Flow_sequence_end 1135 | Some '}' -> fetch_flow_collection_end t Token.Flow_mapping_end 1136 | Some ',' -> fetch_flow_entry t 1137 | Some '-' when t.flow_level = 0 && check_block_entry t -> 1138 fetch_block_entry t 1139 | Some '?' when check_key t -> 1140 fetch_key t 1141 | Some ':' when check_value t -> 1142 fetch_value t 1143 | Some '*' -> fetch_alias t 1144 | Some '&' -> fetch_anchor t 1145 | Some '!' -> fetch_tag t 1146 | Some '|' when t.flow_level = 0 -> fetch_block_scalar t true 1147 | Some '>' when t.flow_level = 0 -> fetch_block_scalar t false 1148 | Some '\'' -> fetch_single_quoted t 1149 | Some '"' -> fetch_double_quoted t 1150 | Some '-' when can_start_plain t -> 1151 fetch_plain_scalar t 1152 | Some '?' when can_start_plain t -> 1153 fetch_plain_scalar t 1154 | Some ':' when can_start_plain t -> 1155 fetch_plain_scalar t 1156 | Some c when can_start_plain_char c t -> 1157 fetch_plain_scalar t 1158 | Some c -> 1159 Error.raise_at (Input.mark t.input) (Unexpected_character c) 1160 end 1161 1162and fetch_stream_end t = 1163 if not t.stream_ended then begin 1164 unroll_indent t (-1); 1165 remove_simple_key t; 1166 t.allow_simple_key <- false; 1167 t.stream_ended <- true; 1168 let span = Span.point (Input.mark t.input) in 1169 emit t span Token.Stream_end 1170 end 1171 1172and fetch_document_indicator t = 1173 unroll_indent t (-1); 1174 remove_simple_key t; 1175 t.allow_simple_key <- false; 1176 let start = Input.mark t.input in 1177 let indicator = Input.peek_string t.input 3 in 1178 Input.skip t.input 3; 1179 let span = Span.make ~start ~stop:(Input.mark t.input) in 1180 let token = if indicator = "---" then Token.Document_start else Token.Document_end in 1181 (* Reset document content flag after document end marker *) 1182 if indicator = "..." then begin 1183 t.document_has_content <- false; 1184 (* After document end marker, skip whitespace and check for end of line or comment *) 1185 while Input.next_is_blank t.input do ignore (Input.next t.input) done; 1186 (match Input.peek t.input with 1187 | None -> () (* EOF is ok *) 1188 | Some c when Input.is_break c -> () 1189 | Some '#' -> () (* Comment is ok *) 1190 | _ -> Error.raise_at start (Invalid_directive "content not allowed after document end marker on same line")) 1191 end; 1192 emit t span token 1193 1194and fetch_directive t = 1195 (* Directives can only appear: 1196 1. At stream start (before any document content) 1197 2. After a document end marker (...) 1198 If we've emitted content in the current document, we need a document end marker first *) 1199 if t.document_has_content then 1200 Error.raise_at (Input.mark t.input) 1201 (Unexpected_token "directives must be separated from document content by document end marker (...)"); 1202 unroll_indent t (-1); 1203 remove_simple_key t; 1204 t.allow_simple_key <- false; 1205 let token, span = scan_directive t in 1206 emit t span token 1207 1208and fetch_flow_collection_start t token_type = 1209 save_simple_key t; 1210 (* Record indent of outermost flow collection *) 1211 if t.flow_level = 0 then 1212 t.flow_indent <- column t; 1213 t.flow_level <- t.flow_level + 1; 1214 (* Track whether this is a mapping or sequence *) 1215 let is_mapping = (token_type = Token.Flow_mapping_start) in 1216 t.flow_mapping_stack <- is_mapping :: t.flow_mapping_stack; 1217 t.allow_simple_key <- true; 1218 t.simple_keys <- None :: t.simple_keys; 1219 t.document_has_content <- true; 1220 let start = Input.mark t.input in 1221 ignore (Input.next t.input); 1222 let span = Span.make ~start ~stop:(Input.mark t.input) in 1223 emit t span token_type 1224 1225and fetch_flow_collection_end t token_type = 1226 remove_simple_key t; 1227 t.flow_level <- t.flow_level - 1; 1228 t.flow_mapping_stack <- (match t.flow_mapping_stack with _ :: rest -> rest | [] -> []); 1229 t.simple_keys <- (match t.simple_keys with _ :: rest -> rest | [] -> []); 1230 t.allow_simple_key <- false; 1231 let start = Input.mark t.input in 1232 ignore (Input.next t.input); 1233 (* Allow adjacent values after flow collection ends *) 1234 if t.flow_level > 0 then 1235 t.adjacent_value_allowed_at <- Some (Input.position t.input); 1236 let span = Span.make ~start ~stop:(Input.mark t.input) in 1237 emit t span token_type 1238 1239and fetch_flow_entry t = 1240 remove_simple_key t; 1241 t.allow_simple_key <- true; 1242 let start = Input.mark t.input in 1243 ignore (Input.next t.input); 1244 let span = Span.make ~start ~stop:(Input.mark t.input) in 1245 emit t span Token.Flow_entry 1246 1247and check_block_entry t = 1248 (* - followed by whitespace or EOF *) 1249 match Input.peek_nth t.input 1 with 1250 | None -> true 1251 | Some c -> Input.is_whitespace c 1252 1253and fetch_block_entry t = 1254 if t.flow_level = 0 then begin 1255 (* Block entries require allow_simple_key to be true. 1256 This prevents block sequences on the same line as a mapping value, 1257 e.g., "key: - a" is invalid. *) 1258 if not t.allow_simple_key then 1259 Error.raise_at (Input.mark t.input) Block_sequence_disallowed; 1260 let col = column t in 1261 if roll_indent t col then begin 1262 let span = Span.point (Input.mark t.input) in 1263 emit t span Token.Block_sequence_start 1264 end 1265 end; 1266 remove_simple_key t; 1267 t.allow_simple_key <- true; 1268 t.document_has_content <- true; 1269 let start = Input.mark t.input in 1270 ignore (Input.next t.input); 1271 1272 (* Check for tabs after - : pattern like -\t- is invalid *) 1273 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in 1274 if found_tabs then begin 1275 (* If we found tabs and next char is - followed by whitespace, error *) 1276 match Input.peek t.input with 1277 | Some '-' -> 1278 (match Input.peek_nth t.input 1 with 1279 | None -> Error.raise_at start Tab_in_indentation 1280 | Some c when Input.is_whitespace c -> 1281 Error.raise_at start Tab_in_indentation 1282 | Some _ -> ()) 1283 | _ -> () 1284 end; 1285 1286 let span = Span.make ~start ~stop:(Input.mark t.input) in 1287 emit t span Token.Block_entry 1288 1289and check_key t = 1290 (* ? followed by whitespace or flow indicator in both block and flow *) 1291 match Input.peek_nth t.input 1 with 1292 | None -> true 1293 | Some c -> 1294 Input.is_whitespace c || 1295 (t.flow_level > 0 && Input.is_flow_indicator c) 1296 1297and fetch_key t = 1298 if t.flow_level = 0 then begin 1299 if not t.allow_simple_key then 1300 Error.raise_at (Input.mark t.input) Expected_key; 1301 let col = column t in 1302 if roll_indent t col then begin 1303 let span = Span.point (Input.mark t.input) in 1304 emit t span Token.Block_mapping_start 1305 end 1306 end; 1307 remove_simple_key t; 1308 t.allow_simple_key <- t.flow_level = 0; 1309 t.document_has_content <- true; 1310 let start = Input.mark t.input in 1311 ignore (Input.next t.input); 1312 1313 (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *) 1314 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in 1315 if found_tabs && t.flow_level = 0 then begin 1316 (* In block context, tabs after ? are not allowed *) 1317 Error.raise_at start Tab_in_indentation 1318 end; 1319 1320 let span = Span.make ~start ~stop:(Input.mark t.input) in 1321 emit t span Token.Key 1322 1323and check_value t = 1324 (* : followed by whitespace in block, or whitespace/flow indicator in flow, or adjacent value *) 1325 match Input.peek_nth t.input 1 with 1326 | None -> true 1327 | Some c -> 1328 Input.is_whitespace c || 1329 (t.flow_level > 0 && Input.is_flow_indicator c) || 1330 (* Allow adjacent values in flow context at designated positions *) 1331 (t.flow_level > 0 && 1332 match t.adjacent_value_allowed_at with 1333 | Some pos -> pos.Position.line = (Input.position t.input).Position.line && 1334 pos.Position.column = (Input.position t.input).Position.column 1335 | None -> false) 1336 1337and fetch_value t = 1338 let start = Input.mark t.input in 1339 (* Check for simple key *) 1340 let used_simple_key = 1341 match t.simple_keys with 1342 | Some sk :: _ when sk.sk_possible -> 1343 (* In implicit flow mapping (inside a flow sequence), key and : must be on the same line. 1344 In explicit flow mapping { }, key and : can span lines. *) 1345 let is_implicit_flow_mapping = match t.flow_mapping_stack with 1346 | false :: _ -> true (* false = we're in a sequence, so any mapping is implicit *) 1347 | _ -> false 1348 in 1349 if is_implicit_flow_mapping && sk.sk_position.line < (Input.position t.input).line then 1350 Error.raise_at start Illegal_flow_key_line; 1351 (* Insert KEY token before the simple key value *) 1352 let key_span = Span.point sk.sk_position in 1353 let key_token = { Token.token = Token.Key; span = key_span } in 1354 (* We need to insert at the right position *) 1355 let tokens = Queue.to_seq t.tokens |> Array.of_seq in 1356 Queue.clear t.tokens; 1357 let insert_pos = sk.sk_token_number - t.tokens_taken in 1358 Array.iteri (fun i tok -> 1359 if i = insert_pos then Queue.add key_token t.tokens; 1360 Queue.add tok t.tokens 1361 ) tokens; 1362 if insert_pos >= Array.length tokens then 1363 Queue.add key_token t.tokens; 1364 t.token_number <- t.token_number + 1; 1365 (* Roll indent for implicit block mapping *) 1366 if t.flow_level = 0 then begin 1367 let col = sk.sk_position.column in 1368 if roll_indent t col then begin 1369 let span = Span.point sk.sk_position in 1370 (* Insert block mapping start before key *) 1371 let bm_token = { Token.token = Token.Block_mapping_start; span } in 1372 let tokens = Queue.to_seq t.tokens |> Array.of_seq in 1373 Queue.clear t.tokens; 1374 Array.iteri (fun i tok -> 1375 if i = insert_pos then Queue.add bm_token t.tokens; 1376 Queue.add tok t.tokens 1377 ) tokens; 1378 if insert_pos >= Array.length tokens then 1379 Queue.add bm_token t.tokens; 1380 t.token_number <- t.token_number + 1 1381 end 1382 end; 1383 t.simple_keys <- None :: (List.tl t.simple_keys); 1384 true 1385 | _ -> 1386 (* No simple key - this is a complex value (or empty key) *) 1387 if t.flow_level = 0 then begin 1388 if not t.allow_simple_key then 1389 Error.raise_at (Input.mark t.input) Expected_key; 1390 let col = column t in 1391 if roll_indent t col then begin 1392 let span = Span.point (Input.mark t.input) in 1393 emit t span Token.Block_mapping_start 1394 end 1395 (* Note: We don't emit KEY here. Empty key handling is done by the parser, 1396 which emits empty scalar when it sees VALUE without preceding KEY. *) 1397 end; 1398 false 1399 in 1400 remove_simple_key t; 1401 (* In block context without simple key, allow simple keys for compact mappings like ": moon: white" 1402 In flow context or after using a simple key, disallow simple keys *) 1403 t.allow_simple_key <- (not used_simple_key) && (t.flow_level = 0); 1404 t.document_has_content <- true; 1405 let start = Input.mark t.input in 1406 ignore (Input.next t.input); 1407 1408 (* Check for tabs after : : patterns like :\t- or :\tkey: are invalid in block context (Y79Y/09) 1409 However, :\t bar (tab followed by space then content) is valid (6BCT) *) 1410 let (found_tabs, found_spaces) = skip_blanks_check_tabs t in 1411 if found_tabs && not found_spaces && t.flow_level = 0 then begin 1412 (* In block context, tabs-only after : followed by indicator or alphanumeric are not allowed *) 1413 match Input.peek t.input with 1414 | Some ('-' | '?') -> 1415 Error.raise_at start Tab_in_indentation 1416 | Some c when (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') -> 1417 (* Tab-only followed by alphanumeric - likely a key, which is invalid *) 1418 Error.raise_at start Tab_in_indentation 1419 | _ -> () 1420 end; 1421 1422 (* Skip any comment that may follow the colon and whitespace *) 1423 skip_whitespace_and_comment t; 1424 1425 let span = Span.make ~start ~stop:(Input.mark t.input) in 1426 emit t span Token.Value 1427 1428and fetch_anchor_or_alias t ~is_alias = 1429 save_simple_key t; 1430 t.allow_simple_key <- false; 1431 t.document_has_content <- true; 1432 let start = Input.mark t.input in 1433 ignore (Input.next t.input); (* consume * or & *) 1434 let name, span = scan_anchor_alias t in 1435 let span = Span.make ~start ~stop:span.stop in 1436 let token = if is_alias then Token.Alias name else Token.Anchor name in 1437 emit t span token 1438 1439and fetch_alias t = fetch_anchor_or_alias t ~is_alias:true 1440and fetch_anchor t = fetch_anchor_or_alias t ~is_alias:false 1441 1442and fetch_tag t = 1443 save_simple_key t; 1444 t.allow_simple_key <- false; 1445 t.document_has_content <- true; 1446 let handle, suffix, span = scan_tag t in 1447 emit t span (Token.Tag { handle; suffix }) 1448 1449and fetch_block_scalar t literal = 1450 remove_simple_key t; 1451 t.allow_simple_key <- true; 1452 t.document_has_content <- true; 1453 let value, style, span = scan_block_scalar t literal in 1454 emit t span (Token.Scalar { style; value }) 1455 1456and fetch_quoted t ~double = 1457 save_simple_key t; 1458 t.allow_simple_key <- false; 1459 t.document_has_content <- true; 1460 let value, span = 1461 if double then scan_double_quoted t else scan_single_quoted t 1462 in 1463 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *) 1464 skip_to_next_token t; 1465 if t.flow_level > 0 then 1466 t.adjacent_value_allowed_at <- Some (Input.position t.input); 1467 let style = if double then `Double_quoted else `Single_quoted in 1468 emit t span (Token.Scalar { style; value }) 1469 1470and fetch_single_quoted t = fetch_quoted t ~double:false 1471and fetch_double_quoted t = fetch_quoted t ~double:true 1472 1473and can_start_plain t = 1474 (* Check if - ? : can start a plain scalar *) 1475 match Input.peek_nth t.input 1 with 1476 | None -> false 1477 | Some c -> 1478 not (Input.is_whitespace c) && 1479 (t.flow_level = 0 || not (Input.is_flow_indicator c)) 1480 1481and can_start_plain_char c _t = 1482 (* Characters that can start a plain scalar *) 1483 if Input.is_whitespace c then false 1484 else if Input.is_indicator c then false 1485 else true 1486 1487and fetch_plain_scalar t = 1488 save_simple_key t; 1489 t.allow_simple_key <- false; 1490 t.document_has_content <- true; 1491 let value, span, ended_with_linebreak = scan_plain_scalar t in 1492 (* If the plain scalar ended after crossing a line break (leading_blanks = true), 1493 allow simple keys. This is important because the scanner already consumed the 1494 line break and leading whitespace when checking for continuation. *) 1495 if ended_with_linebreak then 1496 t.allow_simple_key <- true; 1497 emit t span (Token.Scalar { style = `Plain; value }) 1498 1499(** Check if we need more tokens to resolve simple keys *) 1500let need_more_tokens t = 1501 if t.stream_ended then false 1502 else if Queue.is_empty t.tokens then true 1503 else 1504 (* Check if any simple key could affect the first queued token *) 1505 List.exists (function 1506 | Some sk when sk.sk_possible -> 1507 sk.sk_token_number >= t.tokens_taken 1508 | _ -> false 1509 ) t.simple_keys 1510 1511(** Ensure we have enough tokens to return one safely *) 1512let ensure_tokens t = 1513 if not t.stream_started then begin 1514 t.stream_started <- true; 1515 let span = Span.point (Input.position t.input) in 1516 let encoding, _ = Encoding.detect (Input.source t.input) in 1517 emit t span (Token.Stream_start encoding) 1518 end; 1519 while need_more_tokens t do 1520 fetch_next_token t 1521 done 1522 1523(** Get next token *) 1524let next t = 1525 ensure_tokens t; 1526 if Queue.is_empty t.tokens then 1527 None 1528 else begin 1529 t.tokens_taken <- t.tokens_taken + 1; 1530 Some (Queue.pop t.tokens) 1531 end 1532 1533(** Peek at next token *) 1534let peek t = 1535 ensure_tokens t; 1536 Queue.peek_opt t.tokens 1537 1538(** Iterate over all tokens *) 1539let iter f t = 1540 let rec loop () = 1541 match next t with 1542 | None -> () 1543 | Some tok -> f tok; loop () 1544 in 1545 loop () 1546 1547(** Fold over all tokens *) 1548let fold f init t = 1549 let rec loop acc = 1550 match next t with 1551 | None -> acc 1552 | Some tok -> loop (f acc tok) 1553 in 1554 loop init 1555 1556(** Convert to list *) 1557let to_list t = 1558 fold (fun acc tok -> tok :: acc) [] t |> List.rev