···
mutable allow_simple_key : bool;
mutable leading_whitespace : bool; (** True when at start of line (only whitespace seen) *)
mutable document_has_content : bool; (** True if we've emitted content tokens in current document *)
+
mutable adjacent_value_allowed_at : Position.t option; (** Position where adjacent : is allowed *)
+
mutable pending_value : bool; (** True if we've emitted a KEY and are waiting for VALUE *)
···
leading_whitespace = true; (* Start at beginning of stream *)
document_has_content = false;
+
adjacent_value_allowed_at = None;
let of_string s = create (Input.of_string s)
···
(** Get current indent level *)
match t.indent_stack with
| { indent; _ } :: _ -> indent
(** Skip whitespace to end of line, checking for valid comments.
···
if Input.next_is (( = ) '#') t.input then begin
(* Validate: comment must be preceded by whitespace or be at start of line *)
if not !has_whitespace then begin
+
(* Check if we're at the start of input or after whitespace (blank or line break) *)
match Input.peek_back t.input with
| None -> () (* Start of input - OK *)
+
| Some c when Input.is_whitespace c -> () (* After whitespace - OK *)
(* Comment not preceded by whitespace - ERROR *)
Error.raise_at (Input.mark t.input) Invalid_comment
···
(* Check for tabs used as indentation in block context *)
(match Input.peek t.input with
| Some '\t' when t.flow_level = 0 && t.leading_whitespace &&
+
(column t - 1) < current_indent t ->
(* Tab found in indentation zone - this is invalid *)
(* Skip to end of line to check if line has content *)
let start_pos = Input.mark t.input in
···
else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin
+
(* In flow context, skip all whitespace including line breaks *)
+
if Input.next_is_break t.input then begin
+
Input.consume_break t.input;
+
(* Allow simple keys after line breaks in flow context *)
+
t.allow_simple_key <- true;
+
ignore (Input.next t.input);
(** Roll the indentation level *)
···
(** Scan tag suffix (after handle) *)
+
(c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')
+
| '0'..'9' -> Char.code c - Char.code '0'
+
| 'A'..'F' -> Char.code c - Char.code 'A' + 10
+
| 'a'..'f' -> Char.code c - Char.code 'a' + 10
let buf = Buffer.create 32 in
match Input.peek t.input with
+
(* Percent-encoded character *)
+
ignore (Input.next t.input);
+
(match Input.peek t.input, Input.peek_nth t.input 1 with
+
| Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 ->
+
ignore (Input.next t.input);
+
ignore (Input.next t.input);
+
let code = (hex_val c1) * 16 + (hex_val c2) in
+
Buffer.add_char buf (Char.chr code);
+
(* Invalid percent encoding - keep the % *)
+
Buffer.add_char buf '%';
| Some c when not (Input.is_whitespace c) &&
not (Input.is_flow_indicator c) ->
···
let start = Input.mark t.input in
ignore (Input.next t.input); (* consume opening double-quote *)
let buf = Buffer.create 64 in
+
let whitespace = Buffer.create 16 in (* Track pending whitespace *)
+
let flush_whitespace () =
+
if Buffer.length whitespace > 0 then begin
+
Buffer.add_buffer buf whitespace;
+
Buffer.clear whitespace
match Input.peek t.input with
| None -> Error.raise_at start Unclosed_double_quote
+
(* Flush trailing whitespace before closing quote to preserve it *)
ignore (Input.next t.input)
+
| Some ' ' | Some '\t' as c_opt ->
+
(* Track whitespace - don't add to buf yet *)
+
let c = match c_opt with Some c -> c | None -> assert false in
+
Buffer.add_char whitespace c;
+
ignore (Input.next t.input);
+
(* Escape sequence - this is non-whitespace content *)
+
flush_whitespace (); (* Commit any pending whitespace *)
ignore (Input.next t.input);
(match Input.peek t.input with
| None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>")
···
ignore (Input.next t.input);
Buffer.add_string buf (decode_hex t 8)
| Some '\n' | Some '\r' ->
+
(* Line continuation escape *)
Input.consume_break t.input;
while Input.next_is_blank t.input do
ignore (Input.next t.input)
···
(Invalid_escape_sequence (Printf.sprintf "\\%c" c)));
| Some '\n' | Some '\r' ->
+
(* Line break: discard any pending trailing whitespace *)
+
Buffer.clear whitespace;
Input.consume_break t.input;
(* Count consecutive line breaks (empty lines) *)
let empty_lines = ref 0 in
···
+
(* Non-whitespace character *)
+
flush_whitespace (); (* Commit any pending whitespace *)
ignore (Input.next t.input);
···
let start = Input.mark t.input in
let in_flow = t.flow_level > 0 in
let indent = current_indent t in
+
(* In flow context, scalars must be indented more than the current block indent.
+
This ensures that content at block indent or less ends the flow context. *)
+
if in_flow && (column t - 1) < indent then
Error.raise_at start Invalid_flow_indentation;
let buf = Buffer.create 64 in
let spaces = Buffer.create 16 in
+
let whitespace = Buffer.create 16 in (* Track whitespace within a line *)
let leading_blanks = ref false in
match Input.peek t.input with
+
| Some c when Input.is_blank c && can_continue_plain t c ~in_flow ->
+
(* Blank character within a line - save to whitespace buffer *)
+
Buffer.add_char whitespace c;
+
ignore (Input.next t.input);
| Some c when can_continue_plain t c ~in_flow ->
+
(* Non-blank character - process any pending breaks/whitespace first *)
if Buffer.length spaces > 0 then begin
if !leading_blanks then begin
···
Buffer.add_buffer buf spaces;
+
(* Add any pending whitespace from within the line *)
+
if Buffer.length whitespace > 0 then begin
+
Buffer.add_buffer buf whitespace;
+
Buffer.clear whitespace
+
(* Add the character *)
ignore (Input.next t.input);
···
(* Check for line continuation *)
+
if Input.next_is_break t.input then begin
+
(* Discard any trailing whitespace from the current line *)
+
Buffer.clear whitespace;
(* Save the line break *)
if !leading_blanks then begin
(* We already had a break - this is an additional break (empty line) *)
···
Input.consume_break t.input;
+
(* Line break allows simple key in both block and flow contexts *)
+
t.allow_simple_key <- true;
+
t.allow_simple_key <- true;
(* Skip leading blanks on the next line *)
while Input.next_is_blank t.input do
ignore (Input.next t.input)
let col = (Input.position t.input).column in
(* Check indentation - stop if we're at or before the containing block's indent *)
+
(* However, allow empty lines (line breaks) to continue even if dedented *)
+
if Input.next_is_break t.input then
+
scan_lines () (* Empty line - continue *)
+
else if not in_flow && col <= indent then
+
() (* Stop - dedented or at parent level in block context *)
else if Input.at_document_boundary t.input then
() (* Stop - document boundary *)
···
let buf = Buffer.create 256 in
let trailing_breaks = Buffer.create 16 in
+
let leading_blank = ref false in (* Was the previous line "more indented"? *)
(* Skip to content indentation, skipping empty lines.
Returns the number of spaces actually skipped (important for detecting dedentation). *)
···
(* Check if we found a break (empty line) or content *)
(match Input.peek_nth t.input (!idx) with
| None | Some '\n' | Some '\r' ->
+
(* Empty line - preserve spaces for literal scalars *)
+
while Input.next_is_blank t.input do
+
Buffer.add_char trailing_breaks ' ';
+
ignore (Input.next t.input)
+
while Input.next_is_blank t.input do
+
ignore (Input.next t.input)
Buffer.add_char trailing_breaks '\n';
Input.consume_break t.input;
skip_to_content_indent ()
···
let line_indent = spaces_skipped + !extra_spaces in
(* Determine content indent from first content line (implicit case) *)
+
let first_line = !content_indent = 0 in
if !content_indent = 0 then begin
if line_indent <= base_indent then begin
(* No content - restore position conceptually *)
···
(* Dedented - done with content *)
+
(* Check if current line is "more indented" (has extra indent beyond content_indent) *)
+
let trailing_blank = line_indent > !content_indent in
(* Add trailing breaks to buffer *)
if Buffer.length buf > 0 then begin
if Buffer.length trailing_breaks > 0 then begin
Buffer.add_buffer buf trailing_breaks
+
(* Folded scalar: fold only if both previous and current lines are not more-indented *)
+
if not !leading_blank && not trailing_blank then begin
+
let breaks = Buffer.contents trailing_breaks in
+
if String.length breaks = 1 then
+
Buffer.add_char buf ' '
+
Buffer.add_substring buf breaks 1 (String.length breaks - 1)
+
(* Preserve breaks for more-indented lines *)
+
Buffer.add_buffer buf trailing_breaks
end else if not literal then
···
Buffer.add_buffer buf trailing_breaks;
Buffer.clear trailing_breaks;
+
(* Add extra indentation for literal or more-indented folded lines *)
+
(* On the first line (when determining content_indent), we've already consumed all spaces,
+
so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *)
+
if not first_line && (literal || (!extra_spaces > 0 && not literal)) then begin
+
for _ = 1 to !extra_spaces do
···
Buffer.add_char trailing_breaks '\n';
Input.consume_break t.input
+
(* Update leading_blank for next iteration *)
+
leading_blank := trailing_blank;
···
ignore (Input.next t.input)
(* Version directive: %YAML 1.2 *)
···
let span = Span.make ~start ~stop:(Input.mark t.input) in
Token.Tag_directive { handle; prefix }, span
+
(* Reserved/Unknown directive - skip to end of line and ignore *)
+
(* Per YAML spec, reserved directives should be ignored with a warning *)
while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
ignore (Input.next t.input)
+
let span = Span.make ~start ~stop:(Input.mark t.input) in
+
(* Return an empty tag directive token to indicate directive was processed but ignored *)
+
Token.Tag_directive { handle = ""; prefix = "" }, span
(** Fetch the next token(s) into the queue *)
let rec fetch_next_token t =
···
| Some ',' -> fetch_flow_entry t
| Some '-' when t.flow_level = 0 && check_block_entry t ->
+
| Some '?' when check_key t ->
| Some ':' when check_value t ->
···
t.allow_simple_key <- false;
let start = Input.mark t.input in
ignore (Input.next t.input);
+
(* Allow adjacent values after flow collection ends *)
+
if t.flow_level > 0 then
+
t.adjacent_value_allowed_at <- Some (Input.position t.input);
let span = Span.make ~start ~stop:(Input.mark t.input) in
···
and fetch_block_entry t =
if t.flow_level = 0 then begin
+
(* Block entries don't require allow_simple_key to be true, because:
+
1. They're not simple keys themselves
+
2. They can appear after : on the same line (e.g., ": - a")
+
So we only check allow_simple_key in contexts where it's truly required.
+
For now, we allow block entries in block context. *)
if roll_indent t col ~sequence:true then begin
let span = Span.point (Input.mark t.input) in
···
emit t span Token.Block_entry
+
(* ? followed by whitespace or flow indicator in both block and flow *)
+
match Input.peek_nth t.input 1 with
+
Input.is_whitespace c ||
+
(t.flow_level > 0 && Input.is_flow_indicator c)
if t.flow_level = 0 then begin
···
let span = Span.make ~start ~stop:(Input.mark t.input) in
+
t.pending_value <- true (* We've emitted a KEY, now waiting for VALUE *)
+
(* : followed by whitespace in block, or whitespace/flow indicator in flow, or adjacent value *)
+
match Input.peek_nth t.input 1 with
+
Input.is_whitespace c ||
+
(t.flow_level > 0 && Input.is_flow_indicator c) ||
+
(* Allow adjacent values in flow context at designated positions *)
+
match t.adjacent_value_allowed_at with
+
| Some pos -> pos.Position.line = (Input.position t.input).Position.line &&
+
pos.Position.column = (Input.position t.input).Position.column
(* Check for simple key *)
···
if insert_pos >= Array.length tokens then
Queue.add key_token t.tokens;
t.token_number <- t.token_number + 1;
+
t.pending_value <- true; (* We've inserted a KEY token, now waiting for VALUE *)
(* Roll indent for implicit block mapping *)
if t.flow_level = 0 then begin
let col = sk.sk_position.column in
···
t.simple_keys <- None :: (List.tl t.simple_keys)
+
(* No simple key - this is a complex value (or empty key) *)
if t.flow_level = 0 then begin
if not t.allow_simple_key then
Error.raise_at (Input.mark t.input) Expected_key;
···
if roll_indent t col ~sequence:false then begin
let span = Span.point (Input.mark t.input) in
emit t span Token.Block_mapping_start
+
(* Emit KEY token for empty key case (e.g., ": value") only if we don't already have a pending KEY *)
+
if not t.pending_value then begin
+
let span = Span.point (Input.mark t.input) in
+
t.pending_value <- true
···
+
(* Skip any comment that may follow the colon and whitespace *)
+
skip_whitespace_and_comment t;
let span = Span.make ~start ~stop:(Input.mark t.input) in
+
emit t span Token.Value;
+
t.pending_value <- false (* We've emitted a VALUE, no longer pending *)
···
t.allow_simple_key <- false;
t.document_has_content <- true;
let value, span = scan_single_quoted t in
+
(* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
+
if t.flow_level > 0 then
+
t.adjacent_value_allowed_at <- Some (Input.position t.input);
emit t span (Token.Scalar { style = Scalar_style.Single_quoted; value })
and fetch_double_quoted t =
···
t.allow_simple_key <- false;
t.document_has_content <- true;
let value, span = scan_double_quoted t in
+
(* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
+
if t.flow_level > 0 then
+
t.adjacent_value_allowed_at <- Some (Input.position t.input);
emit t span (Token.Scalar { style = Scalar_style.Double_quoted; value })