Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** YAML tokenizer/scanner with lookahead for ambiguity resolution *)
7
8(** Simple key tracking for mapping key disambiguation *)
9type simple_key = {
10 sk_possible : bool;
11 sk_required : bool;
12 sk_token_number : int;
13 sk_position : Position.t;
14}
15
16(** Indent level tracking *)
17type indent = {
18 indent : int;
19 needs_block_end : bool;
20 sequence : bool; (** true if this is a sequence indent *)
21}
22
23type t = {
24 input : Input.t;
25 mutable tokens : Token.spanned Queue.t;
26 mutable token_number : int;
27 mutable tokens_taken : int;
28 mutable stream_started : bool;
29 mutable stream_ended : bool;
30 mutable indent_stack : indent list; (** Stack of indentation levels *)
31 mutable flow_level : int; (** Nesting depth in \[\] or \{\} *)
32 mutable flow_indent : int; (** Column where outermost flow collection started *)
33 mutable simple_keys : simple_key option list; (** Per flow-level simple key tracking *)
34 mutable allow_simple_key : bool;
35 mutable leading_whitespace : bool; (** True when at start of line (only whitespace seen) *)
36 mutable document_has_content : bool; (** True if we've emitted content tokens in current document *)
37 mutable adjacent_value_allowed_at : Position.t option; (** Position where adjacent : is allowed *)
38 mutable pending_value : bool; (** True if we've emitted a KEY and are waiting for VALUE *)
39 mutable flow_mapping_stack : bool list; (** Stack of whether each flow level is a mapping *)
40}
41
42let create input =
43 {
44 input;
45 tokens = Queue.create ();
46 token_number = 0;
47 tokens_taken = 0;
48 stream_started = false;
49 stream_ended = false;
50 indent_stack = [];
51 flow_level = 0;
52 flow_indent = 0;
53 simple_keys = [None]; (* One entry for the base level *)
54 allow_simple_key = true;
55 leading_whitespace = true; (* Start at beginning of stream *)
56 document_has_content = false;
57 adjacent_value_allowed_at = None;
58 pending_value = false;
59 flow_mapping_stack = [];
60 }
61
62let of_string s = create (Input.of_string s)
63let of_input = create
64let of_reader r = create (Input.of_reader r)
65
66let position t = Input.position t.input
67
68(** Add a token to the queue *)
69let emit t span token =
70 Queue.add { Token.token; span } t.tokens;
71 t.token_number <- t.token_number + 1
72
73(** Get current column (1-indexed) *)
74let column t = (Input.position t.input).column
75
76(** Get current indent level *)
77let current_indent t =
78 match t.indent_stack with
79 | [] -> -1
80 | { indent; _ } :: _ -> indent
81
82(** Skip whitespace to end of line, checking for valid comments.
83 Returns true if any whitespace (including tabs) was found before a comment. *)
84let skip_whitespace_and_comment t =
85 let has_whitespace = ref false in
86 (* Skip blanks (spaces and tabs) *)
87 while Input.next_is_blank t.input do
88 has_whitespace := true;
89 ignore (Input.next t.input)
90 done;
91 (* Check for comment *)
92 if Input.next_is (( = ) '#') t.input then begin
93 (* Validate: comment must be preceded by whitespace or be at start of line *)
94 if not !has_whitespace then begin
95 (* Check if we're at the start of input or after whitespace (blank or line break) *)
96 match Input.peek_back t.input with
97 | None -> () (* Start of input - OK *)
98 | Some c when Input.is_whitespace c -> () (* After whitespace - OK *)
99 | _ ->
100 (* Comment not preceded by whitespace - ERROR *)
101 Error.raise_at (Input.mark t.input) Invalid_comment
102 end;
103 (* Skip to end of line *)
104 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
105 ignore (Input.next t.input)
106 done
107 end
108
109(** Skip blanks (spaces/tabs) and return (found_tabs, found_spaces) *)
110let skip_blanks_check_tabs t =
111 let found_tab = ref false in
112 let found_space = ref false in
113 while Input.next_is_blank t.input do
114 (match Input.peek t.input with
115 | Some '\t' -> found_tab := true
116 | Some ' ' -> found_space := true
117 | _ -> ());
118 ignore (Input.next t.input)
119 done;
120 (!found_tab, !found_space)
121
122(** Skip whitespace and comments, return true if at newline *)
123let rec skip_to_next_token t =
124 (* Check for tabs used as indentation in block context *)
125 (match Input.peek t.input with
126 | Some '\t' when t.flow_level = 0 && t.leading_whitespace &&
127 (column t - 1) < current_indent t ->
128 (* Tab found in indentation zone - this is invalid *)
129 (* Skip to end of line to check if line has content *)
130 let start_pos = Input.mark t.input in
131 while Input.next_is_blank t.input do
132 ignore (Input.next t.input)
133 done;
134 (* If we have content on this line with a tab, raise error *)
135 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then
136 Error.raise_at start_pos Tab_in_indentation
137 | _ -> ());
138
139 (* Skip blanks and validate comments *)
140 skip_whitespace_and_comment t;
141 (* Skip line break in block context *)
142 if t.flow_level = 0 && Input.next_is_break t.input then begin
143 Input.consume_break t.input;
144 t.allow_simple_key <- true;
145 t.leading_whitespace <- true;
146 skip_to_next_token t
147 end
148 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin
149 (* In flow context, skip all whitespace including line breaks *)
150 if Input.next_is_break t.input then begin
151 Input.consume_break t.input;
152 (* Allow simple keys after line breaks in flow context *)
153 t.allow_simple_key <- true;
154 (* After line break in flow, check for tabs at start of line (Y79Y/03)
155 Tabs are not allowed as indentation - if tab is first char and results
156 in a column less than flow_indent, it's an error *)
157 if Input.next_is (( = ) '\t') t.input then begin
158 (* Tab at start of line in flow context - skip tabs and check position *)
159 let start_mark = Input.mark t.input in
160 while Input.next_is (( = ) '\t') t.input do
161 ignore (Input.next t.input)
162 done;
163 (* If only tabs were used (no spaces) and column < flow_indent, error *)
164 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) &&
165 column t < t.flow_indent then
166 Error.raise_at start_mark Invalid_flow_indentation
167 end;
168 skip_to_next_token t
169 end else begin
170 ignore (Input.next t.input);
171 skip_to_next_token t
172 end
173 end
174
175(** Roll the indentation level *)
176let roll_indent t col ~sequence =
177 if t.flow_level = 0 && col > current_indent t then begin
178 t.indent_stack <- { indent = col; needs_block_end = true; sequence } :: t.indent_stack;
179 true
180 end else
181 false
182
183(** Unroll indentation to given column *)
184let unroll_indent t col =
185 while t.flow_level = 0 &&
186 match t.indent_stack with
187 | { indent; needs_block_end = true; _ } :: _ when indent > col -> true
188 | _ -> false
189 do
190 match t.indent_stack with
191 | { indent = _; needs_block_end = true; _ } :: rest ->
192 let pos = Input.position t.input in
193 let span = Span.point pos in
194 emit t span Token.Block_end;
195 t.indent_stack <- rest
196 | _ -> ()
197 done
198
199(** Save a potential simple key *)
200let save_simple_key t =
201 if t.allow_simple_key then begin
202 (* A simple key is required only if we're in a block context,
203 at the current indentation level, AND the current indent needs a block end.
204 This matches saphyr's logic and prevents false positives for values. *)
205 let required = t.flow_level = 0 &&
206 match t.indent_stack with
207 | { indent; needs_block_end = true; _ } :: _ ->
208 indent = column t
209 | _ -> false
210 in
211 let sk = {
212 sk_possible = true;
213 sk_required = required;
214 sk_token_number = t.token_number;
215 sk_position = Input.position t.input;
216 } in
217 (* Remove any existing simple key at current level *)
218 t.simple_keys <- (
219 match t.simple_keys with
220 | _ :: rest -> Some sk :: rest
221 | [] -> [Some sk]
222 )
223 end
224
225(** Remove simple key at current level *)
226let remove_simple_key t =
227 match t.simple_keys with
228 | Some sk :: _rest when sk.sk_required ->
229 Error.raise_at sk.sk_position Expected_key
230 | _ :: rest -> t.simple_keys <- None :: rest
231 | [] -> ()
232
233(** Stale simple keys that span too many tokens *)
234let stale_simple_keys t =
235 t.simple_keys <- List.map (fun sk_opt ->
236 match sk_opt with
237 | Some sk when sk.sk_possible &&
238 (Input.position t.input).line > sk.sk_position.line &&
239 t.flow_level = 0 ->
240 if sk.sk_required then
241 Error.raise_at sk.sk_position Expected_key;
242 None
243 | _ -> sk_opt
244 ) t.simple_keys
245
246(** Read anchor or alias name *)
247let scan_anchor_alias t =
248 let start = Input.mark t.input in
249 let buf = Buffer.create 16 in
250 (* Per YAML 1.2 spec: anchor names can contain any character that is NOT:
251 - Whitespace (space, tab, line breaks)
252 - Flow indicators: []{}
253 - Comma (,)
254 This matches the saphyr implementation: is_yaml_non_space && !is_flow *)
255 while
256 match Input.peek t.input with
257 | Some c when not (Input.is_whitespace c) &&
258 not (Input.is_flow_indicator c) &&
259 c <> '\x00' ->
260 Buffer.add_char buf c;
261 ignore (Input.next t.input);
262 true
263 | _ -> false
264 do () done;
265 let name = Buffer.contents buf in
266 if String.length name = 0 then
267 Error.raise_at start (Invalid_anchor "empty anchor name");
268 (name, Span.make ~start ~stop:(Input.mark t.input))
269
270(** Scan tag handle *)
271let scan_tag_handle t =
272 let start = Input.mark t.input in
273 let buf = Buffer.create 16 in
274 (* Expect ! *)
275 (match Input.peek t.input with
276 | Some '!' ->
277 Buffer.add_char buf '!';
278 ignore (Input.next t.input)
279 | _ -> Error.raise_at start (Invalid_tag "expected '!'"));
280 (* Read word chars *)
281 while
282 match Input.peek t.input with
283 | Some c when Input.is_alnum c || c = '-' ->
284 Buffer.add_char buf c;
285 ignore (Input.next t.input);
286 true
287 | _ -> false
288 do () done;
289 (* Check for secondary ! *)
290 (match Input.peek t.input with
291 | Some '!' ->
292 Buffer.add_char buf '!';
293 ignore (Input.next t.input)
294 | _ -> ());
295 Buffer.contents buf
296
297(** Scan tag suffix (after handle) *)
298let scan_tag_suffix t =
299 let is_hex_digit c =
300 (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')
301 in
302 let hex_val c =
303 match c with
304 | '0'..'9' -> Char.code c - Char.code '0'
305 | 'A'..'F' -> Char.code c - Char.code 'A' + 10
306 | 'a'..'f' -> Char.code c - Char.code 'a' + 10
307 | _ -> 0
308 in
309 let buf = Buffer.create 32 in
310 while
311 match Input.peek t.input with
312 | Some '%' ->
313 (* Percent-encoded character *)
314 ignore (Input.next t.input);
315 (match Input.peek t.input, Input.peek_nth t.input 1 with
316 | Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 ->
317 ignore (Input.next t.input);
318 ignore (Input.next t.input);
319 let code = (hex_val c1) * 16 + (hex_val c2) in
320 Buffer.add_char buf (Char.chr code);
321 true
322 | _ ->
323 (* Invalid percent encoding - keep the % *)
324 Buffer.add_char buf '%';
325 true)
326 | Some c when not (Input.is_whitespace c) &&
327 not (Input.is_flow_indicator c) ->
328 Buffer.add_char buf c;
329 ignore (Input.next t.input);
330 true
331 | _ -> false
332 do () done;
333 Buffer.contents buf
334
335(** Scan a tag *)
336let scan_tag t =
337 let start = Input.mark t.input in
338 ignore (Input.next t.input); (* consume ! *)
339 let handle, suffix =
340 match Input.peek t.input with
341 | Some '<' ->
342 (* Verbatim tag: !<...> - handle is empty, suffix is full URI *)
343 ignore (Input.next t.input);
344 let buf = Buffer.create 32 in
345 while
346 match Input.peek t.input with
347 | Some '>' -> false
348 | Some c ->
349 Buffer.add_char buf c;
350 ignore (Input.next t.input);
351 true
352 | None -> Error.raise_at (Input.mark t.input) (Invalid_tag "unclosed verbatim tag")
353 do () done;
354 ignore (Input.next t.input); (* consume > *)
355 ("", Buffer.contents buf)
356 | Some c when Input.is_whitespace c || Input.is_flow_indicator c ->
357 (* Non-specific tag: ! *)
358 ("!", "")
359 | Some '!' ->
360 (* Secondary handle: !! *)
361 ignore (Input.next t.input); (* consume second ! *)
362 let suffix = scan_tag_suffix t in
363 ("!!", suffix)
364 | _ ->
365 (* Primary handle or just suffix: !foo or !e!foo *)
366 (* Read alphanumeric characters *)
367 let buf = Buffer.create 16 in
368 while
369 match Input.peek t.input with
370 | Some c when Input.is_alnum c || c = '-' ->
371 Buffer.add_char buf c;
372 ignore (Input.next t.input);
373 true
374 | _ -> false
375 do () done;
376 (* Check if next character is ! - if so, this is a named handle *)
377 (match Input.peek t.input with
378 | Some '!' ->
379 (* Named handle like !e! *)
380 ignore (Input.next t.input);
381 let handle_name = Buffer.contents buf in
382 let suffix = scan_tag_suffix t in
383 ("!" ^ handle_name ^ "!", suffix)
384 | _ ->
385 (* Just ! followed by suffix *)
386 ("!", Buffer.contents buf ^ scan_tag_suffix t))
387 in
388 (* Validate that tag is followed by whitespace, break, or (in flow) flow indicator *)
389 (match Input.peek t.input with
390 | None -> () (* EOF is ok *)
391 | Some c when Input.is_whitespace c || Input.is_break c -> ()
392 | Some c when t.flow_level > 0 && Input.is_flow_indicator c -> ()
393 | _ -> Error.raise_at start (Invalid_tag "expected whitespace or line break after tag"));
394 let span = Span.make ~start ~stop:(Input.mark t.input) in
395 (handle, suffix, span)
396
397(** Scan single-quoted scalar *)
398let scan_single_quoted t =
399 let start = Input.mark t.input in
400 ignore (Input.next t.input); (* consume opening single-quote *)
401 let buf = Buffer.create 64 in
402 let whitespace = Buffer.create 16 in (* Track trailing whitespace *)
403
404 let flush_whitespace () =
405 if Buffer.length whitespace > 0 then begin
406 Buffer.add_buffer buf whitespace;
407 Buffer.clear whitespace
408 end
409 in
410
411 let rec loop () =
412 match Input.peek t.input with
413 | None -> Error.raise_at start Unclosed_single_quote
414 | Some '\'' ->
415 ignore (Input.next t.input);
416 (* Check for escaped quote ('') *)
417 (match Input.peek t.input with
418 | Some '\'' ->
419 flush_whitespace ();
420 Buffer.add_char buf '\'';
421 ignore (Input.next t.input);
422 loop ()
423 | _ ->
424 (* End of string - flush any trailing whitespace *)
425 flush_whitespace ())
426 | Some ' ' | Some '\t' ->
427 (* Track whitespace - don't add to buf yet *)
428 Buffer.add_char whitespace (Option.get (Input.peek t.input));
429 ignore (Input.next t.input);
430 loop ()
431 | Some '\n' | Some '\r' ->
432 (* Discard trailing whitespace before line break *)
433 Buffer.clear whitespace;
434 Input.consume_break t.input;
435 (* Skip leading whitespace on next line *)
436 while Input.next_is_blank t.input do
437 ignore (Input.next t.input)
438 done;
439 (* Check for document boundary *)
440 if Input.at_document_boundary t.input then
441 Error.raise_at start Unclosed_single_quote;
442 (* Check indentation: continuation must be > block indent (QB6E, DK95) *)
443 let col = column t in
444 let indent = current_indent t in
445 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then
446 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
447 (* Count empty lines (consecutive line breaks) *)
448 let empty_lines = ref 0 in
449 while Input.next_is_break t.input do
450 incr empty_lines;
451 Input.consume_break t.input;
452 while Input.next_is_blank t.input do
453 ignore (Input.next t.input)
454 done;
455 if Input.at_document_boundary t.input then
456 Error.raise_at start Unclosed_single_quote;
457 (* Check indentation after each empty line too *)
458 let col = column t in
459 let indent = current_indent t in
460 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then
461 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar")
462 done;
463 (* Apply folding rules *)
464 if !empty_lines > 0 then begin
465 (* Empty lines: preserve as newlines *)
466 for _ = 1 to !empty_lines do
467 Buffer.add_char buf '\n'
468 done
469 end else
470 (* Single break: fold to space (even at start of string) *)
471 Buffer.add_char buf ' ';
472 loop ()
473 | Some c ->
474 flush_whitespace ();
475 Buffer.add_char buf c;
476 ignore (Input.next t.input);
477 loop ()
478 in
479 loop ();
480 let span = Span.make ~start ~stop:(Input.mark t.input) in
481 (Buffer.contents buf, span)
482
483(** Decode hex escape of given length *)
484let decode_hex t len =
485 let start = Input.mark t.input in
486 let buf = Buffer.create len in
487 for _ = 1 to len do
488 match Input.peek t.input with
489 | Some c when Input.is_hex c ->
490 Buffer.add_char buf c;
491 ignore (Input.next t.input)
492 | _ ->
493 Error.raise_at start (Invalid_hex_escape (Buffer.contents buf))
494 done;
495 let code = int_of_string ("0x" ^ Buffer.contents buf) in
496 if code <= 0x7F then
497 String.make 1 (Char.chr code)
498 else if code <= 0x7FF then
499 let b1 = 0xC0 lor (code lsr 6) in
500 let b2 = 0x80 lor (code land 0x3F) in
501 String.init 2 (fun i -> Char.chr (if i = 0 then b1 else b2))
502 else if code <= 0xFFFF then
503 let b1 = 0xE0 lor (code lsr 12) in
504 let b2 = 0x80 lor ((code lsr 6) land 0x3F) in
505 let b3 = 0x80 lor (code land 0x3F) in
506 String.init 3 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | _ -> b3))
507 else
508 let b1 = 0xF0 lor (code lsr 18) in
509 let b2 = 0x80 lor ((code lsr 12) land 0x3F) in
510 let b3 = 0x80 lor ((code lsr 6) land 0x3F) in
511 let b4 = 0x80 lor (code land 0x3F) in
512 String.init 4 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | 2 -> b3 | _ -> b4))
513
514(** Scan double-quoted scalar *)
515let scan_double_quoted t =
516 let start = Input.mark t.input in
517 ignore (Input.next t.input); (* consume opening double-quote *)
518 let buf = Buffer.create 64 in
519 let whitespace = Buffer.create 16 in (* Track pending whitespace *)
520
521 let flush_whitespace () =
522 if Buffer.length whitespace > 0 then begin
523 Buffer.add_buffer buf whitespace;
524 Buffer.clear whitespace
525 end
526 in
527
528 let rec loop () =
529 match Input.peek t.input with
530 | None -> Error.raise_at start Unclosed_double_quote
531 | Some '"' ->
532 (* Flush trailing whitespace before closing quote to preserve it *)
533 flush_whitespace ();
534 ignore (Input.next t.input)
535 | Some ' ' | Some '\t' as c_opt ->
536 (* Track whitespace - don't add to buf yet *)
537 let c = match c_opt with Some c -> c | None -> assert false in
538 Buffer.add_char whitespace c;
539 ignore (Input.next t.input);
540 loop ()
541 | Some '\\' ->
542 (* Escape sequence - this is non-whitespace content *)
543 flush_whitespace (); (* Commit any pending whitespace *)
544 ignore (Input.next t.input);
545 (match Input.peek t.input with
546 | None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>")
547 | Some '0' -> Buffer.add_char buf '\x00'; ignore (Input.next t.input)
548 | Some 'a' -> Buffer.add_char buf '\x07'; ignore (Input.next t.input)
549 | Some 'b' -> Buffer.add_char buf '\x08'; ignore (Input.next t.input)
550 | Some 't' | Some '\t' -> Buffer.add_char buf '\t'; ignore (Input.next t.input)
551 | Some 'n' -> Buffer.add_char buf '\n'; ignore (Input.next t.input)
552 | Some 'v' -> Buffer.add_char buf '\x0B'; ignore (Input.next t.input)
553 | Some 'f' -> Buffer.add_char buf '\x0C'; ignore (Input.next t.input)
554 | Some 'r' -> Buffer.add_char buf '\r'; ignore (Input.next t.input)
555 | Some 'e' -> Buffer.add_char buf '\x1B'; ignore (Input.next t.input)
556 | Some ' ' -> Buffer.add_char buf ' '; ignore (Input.next t.input)
557 | Some '"' -> Buffer.add_char buf '"'; ignore (Input.next t.input)
558 | Some '/' -> Buffer.add_char buf '/'; ignore (Input.next t.input)
559 | Some '\\' -> Buffer.add_char buf '\\'; ignore (Input.next t.input)
560 | Some 'N' -> Buffer.add_string buf "\xC2\x85"; ignore (Input.next t.input) (* NEL *)
561 | Some '_' -> Buffer.add_string buf "\xC2\xA0"; ignore (Input.next t.input) (* NBSP *)
562 | Some 'L' -> Buffer.add_string buf "\xE2\x80\xA8"; ignore (Input.next t.input) (* LS *)
563 | Some 'P' -> Buffer.add_string buf "\xE2\x80\xA9"; ignore (Input.next t.input) (* PS *)
564 | Some 'x' ->
565 ignore (Input.next t.input);
566 Buffer.add_string buf (decode_hex t 2)
567 | Some 'u' ->
568 ignore (Input.next t.input);
569 Buffer.add_string buf (decode_hex t 4)
570 | Some 'U' ->
571 ignore (Input.next t.input);
572 Buffer.add_string buf (decode_hex t 8)
573 | Some '\n' | Some '\r' ->
574 (* Line continuation escape *)
575 Input.consume_break t.input;
576 while Input.next_is_blank t.input do
577 ignore (Input.next t.input)
578 done
579 | Some c ->
580 Error.raise_at (Input.mark t.input)
581 (Invalid_escape_sequence (Printf.sprintf "\\%c" c)));
582 loop ()
583 | Some '\n' | Some '\r' ->
584 (* Line break: discard any pending trailing whitespace *)
585 Buffer.clear whitespace;
586 Input.consume_break t.input;
587 (* Count consecutive line breaks (empty lines) *)
588 let empty_lines = ref 0 in
589 let continue = ref true in
590 let started_with_tab = ref false in
591 while !continue do
592 (* Track if we start with a tab (for DK95/01 check) *)
593 if Input.next_is (( = ) '\t') t.input then started_with_tab := true;
594 (* Skip blanks (spaces/tabs) on the line *)
595 while Input.next_is_blank t.input do
596 ignore (Input.next t.input)
597 done;
598 (* Check if we hit another line break (empty line) *)
599 if Input.next_is_break t.input then begin
600 Input.consume_break t.input;
601 incr empty_lines;
602 started_with_tab := false (* Reset for next line *)
603 end else
604 continue := false
605 done;
606 (* Check for document boundary - this terminates the quoted string *)
607 if Input.at_document_boundary t.input then
608 Error.raise_at start Unclosed_double_quote;
609 (* Check indentation: continuation must be > block indent (QB6E, DK95)
610 Note: must be strictly greater than block indent, not just equal *)
611 let col = column t in
612 let indent = current_indent t in
613 let start_col = start.column in
614 (* DK95/01: if continuation started with tabs and column < start column, error *)
615 if not (Input.is_eof t.input) && !started_with_tab && col < start_col then
616 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
617 if not (Input.is_eof t.input) && col <= indent && indent >= 0 then
618 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
619 (* Per YAML spec: single break = space, break + empty lines = newlines *)
620 if !empty_lines > 0 then begin
621 (* Empty lines: output N newlines where N = number of empty lines *)
622 for _ = 1 to !empty_lines do
623 Buffer.add_char buf '\n'
624 done
625 end else
626 (* Single break folds to space *)
627 Buffer.add_char buf ' ';
628 loop ()
629 | Some c ->
630 (* Non-whitespace character *)
631 flush_whitespace (); (* Commit any pending whitespace *)
632 Buffer.add_char buf c;
633 ignore (Input.next t.input);
634 loop ()
635 in
636 loop ();
637 let span = Span.make ~start ~stop:(Input.mark t.input) in
638 (Buffer.contents buf, span)
639
640(** Check if character can appear in plain scalar at this position *)
641let can_continue_plain t c ~in_flow =
642 match c with
643 | ':' ->
644 (* : is OK if not followed by whitespace or flow indicator *)
645 (match Input.peek_nth t.input 1 with
646 | None -> true
647 | Some c2 when Input.is_whitespace c2 -> false
648 | Some c2 when in_flow && Input.is_flow_indicator c2 -> false
649 | _ -> true)
650 | '#' ->
651 (* # is a comment indicator only if preceded by whitespace *)
652 (* Check the previous character to determine if this is a comment *)
653 (match Input.peek_back t.input with
654 | None -> true (* At start - can't be comment indicator, allow it *)
655 | Some c when Input.is_whitespace c -> false (* Preceded by whitespace - comment *)
656 | Some c when Input.is_break c -> false (* At start of line - comment *)
657 | _ -> true) (* Not preceded by whitespace - part of scalar *)
658 | c when in_flow && Input.is_flow_indicator c -> false
659 | _ when Input.is_break c -> false
660 | _ -> true
661
662(** Scan plain scalar *)
663let scan_plain_scalar t =
664 let start = Input.mark t.input in
665 let in_flow = t.flow_level > 0 in
666 let indent = current_indent t in
667 (* In flow context, scalars must be indented more than the current block indent.
668 This ensures that content at block indent or less ends the flow context. *)
669 if in_flow && (column t - 1) < indent then
670 Error.raise_at start Invalid_flow_indentation;
671 let buf = Buffer.create 64 in
672 let spaces = Buffer.create 16 in
673 let whitespace = Buffer.create 16 in (* Track whitespace within a line *)
674 let leading_blanks = ref false in
675
676 let rec scan_line () =
677 match Input.peek t.input with
678 | None -> ()
679 | Some c when Input.is_blank c && can_continue_plain t c ~in_flow ->
680 (* Blank character within a line - save to whitespace buffer *)
681 Buffer.add_char whitespace c;
682 ignore (Input.next t.input);
683 scan_line ()
684 | Some c when can_continue_plain t c ~in_flow ->
685 (* Non-blank character - process any pending breaks/whitespace first *)
686 begin
687 if Buffer.length spaces > 0 then begin
688 if !leading_blanks then begin
689 (* Fold line break *)
690 if Buffer.contents spaces = "\n" then
691 Buffer.add_char buf ' '
692 else begin
693 (* Multiple breaks - preserve all but first *)
694 let s = Buffer.contents spaces in
695 Buffer.add_substring buf s 1 (String.length s - 1)
696 end
697 end else
698 Buffer.add_buffer buf spaces;
699 Buffer.clear spaces
700 end;
701 (* Add any pending whitespace from within the line *)
702 if Buffer.length whitespace > 0 then begin
703 Buffer.add_buffer buf whitespace;
704 Buffer.clear whitespace
705 end;
706 (* Add the character *)
707 Buffer.add_char buf c;
708 ignore (Input.next t.input);
709 leading_blanks := false;
710 scan_line ()
711 end
712 | _ -> ()
713 in
714
715 let rec scan_lines () =
716 scan_line ();
717 (* Check for line continuation *)
718 if Input.next_is_break t.input then begin
719 (* Discard any trailing whitespace from the current line *)
720 Buffer.clear whitespace;
721 (* Save the line break *)
722 if !leading_blanks then begin
723 (* We already had a break - this is an additional break (empty line) *)
724 Buffer.add_char spaces '\n'
725 end else begin
726 (* First line break *)
727 Buffer.clear spaces;
728 Buffer.add_char spaces '\n';
729 leading_blanks := true
730 end;
731 Input.consume_break t.input;
732 (* Note: We do NOT set allow_simple_key here during plain scalar scanning.
733 Setting it here would incorrectly allow ':' that appears on a continuation
734 line to become a mapping indicator. The flag will be set properly after
735 the scalar ends and skip_to_next_token processes line breaks. *)
736 (* Skip leading blanks on the next line *)
737 while Input.next_is_blank t.input do
738 ignore (Input.next t.input)
739 done;
740 let col = (Input.position t.input).column in
741 (* Check indentation - stop if we're at or before the containing block's indent *)
742 (* However, allow empty lines (line breaks) to continue even if dedented *)
743 if Input.next_is_break t.input then
744 scan_lines () (* Empty line - continue *)
745 else if not in_flow && col <= indent then
746 () (* Stop - dedented or at parent level in block context *)
747 else if Input.at_document_boundary t.input then
748 () (* Stop - document boundary *)
749 else
750 scan_lines ()
751 end
752 in
753
754 scan_lines ();
755 let value = Buffer.contents buf in
756 (* Trim trailing whitespace (spaces and tabs) *)
757 let value =
758 let len = String.length value in
759 let rec find_end i =
760 if i < 0 then 0
761 else match value.[i] with
762 | ' ' | '\t' -> find_end (i - 1)
763 | _ -> i + 1
764 in
765 let end_pos = find_end (len - 1) in
766 String.sub value 0 end_pos
767 in
768 let span = Span.make ~start ~stop:(Input.mark t.input) in
769 (* Return value, span, and whether we ended with leading blanks (crossed a line break) *)
770 (value, span, !leading_blanks)
771
772(** Scan block scalar (literal | or folded >) *)
773let scan_block_scalar t literal =
774 let start = Input.mark t.input in
775 ignore (Input.next t.input); (* consume | or > *)
776
777 (* Parse header: optional indentation indicator and chomping *)
778 let explicit_indent = ref None in
779 let chomping = ref Chomping.Clip in
780
781 (* First character of header *)
782 (match Input.peek t.input with
783 | Some c when Input.is_digit c && c <> '0' ->
784 explicit_indent := Some (Char.code c - Char.code '0');
785 ignore (Input.next t.input)
786 | Some '-' -> chomping := Chomping.Strip; ignore (Input.next t.input)
787 | Some '+' -> chomping := Chomping.Keep; ignore (Input.next t.input)
788 | _ -> ());
789
790 (* Second character of header *)
791 (match Input.peek t.input with
792 | Some c when Input.is_digit c && c <> '0' && !explicit_indent = None ->
793 explicit_indent := Some (Char.code c - Char.code '0');
794 ignore (Input.next t.input)
795 | Some '-' when !chomping = Chomping.Clip ->
796 chomping := Chomping.Strip; ignore (Input.next t.input)
797 | Some '+' when !chomping = Chomping.Clip ->
798 chomping := Chomping.Keep; ignore (Input.next t.input)
799 | _ -> ());
800
801 (* Skip whitespace and optional comment *)
802 skip_whitespace_and_comment t;
803
804 (* Consume line break *)
805 if Input.next_is_break t.input then
806 Input.consume_break t.input
807 else if not (Input.is_eof t.input) then
808 Error.raise_at (Input.mark t.input)
809 (Invalid_block_scalar_header "expected newline after header");
810
811 let base_indent = current_indent t in
812 (* base_indent is the indent level from the stack, -1 if empty.
813 It's used directly for comparisons in implicit indent case. *)
814 let content_indent = ref (
815 match !explicit_indent with
816 | Some n ->
817 (* Explicit indent: base_indent is 1-indexed column, convert to 0-indexed.
818 content_indent = (base_indent - 1) + n, but at least n for document level. *)
819 let base_level = max 0 (base_indent - 1) in
820 base_level + n
821 | None -> 0 (* Will be determined by first non-empty line *)
822 ) in
823
824 let buf = Buffer.create 256 in
825 let trailing_breaks = Buffer.create 16 in
826 let leading_blank = ref false in (* Was the previous line "more indented"? *)
827 let max_empty_line_indent = ref 0 in (* Track max indent of empty lines before first content *)
828
829 (* Skip to content indentation, skipping empty lines.
830 Returns the number of spaces actually skipped (important for detecting dedentation). *)
831 let rec skip_to_content_indent () =
832 if !content_indent > 0 then begin
833 (* Explicit indent - skip up to content_indent spaces *)
834 let spaces_skipped = ref 0 in
835 while !spaces_skipped < !content_indent && Input.next_is (( = ) ' ') t.input do
836 incr spaces_skipped;
837 ignore (Input.next t.input)
838 done;
839
840 (* Check if this line is empty (only spaces/tabs until break/eof) *)
841 if Input.next_is_break t.input then begin
842 (* Empty line - record the break and continue *)
843 Buffer.add_char trailing_breaks '\n';
844 Input.consume_break t.input;
845 skip_to_content_indent ()
846 end else if !spaces_skipped < !content_indent then begin
847 (* Line starts with fewer spaces than content_indent - dedented *)
848 !spaces_skipped
849 end else if Input.next_is_blank t.input then begin
850 (* Line has spaces/tabs beyond content_indent - could be whitespace content or empty line.
851 For literal scalars, whitespace-only lines ARE content (not empty).
852 For folded scalars, whitespace-only lines that are "more indented" are preserved. *)
853 if literal then
854 (* Literal: whitespace beyond content_indent is content, let read_lines handle it *)
855 !content_indent
856 else begin
857 (* Folded: check if rest is only blanks *)
858 let idx = ref 0 in
859 while match Input.peek_nth t.input !idx with
860 | Some c when Input.is_blank c -> incr idx; true
861 | _ -> false
862 do () done;
863 match Input.peek_nth t.input (!idx) with
864 | None | Some '\n' | Some '\r' ->
865 (* Empty/whitespace-only line in folded - skip spaces *)
866 while Input.next_is_blank t.input do
867 ignore (Input.next t.input)
868 done;
869 Buffer.add_char trailing_breaks '\n';
870 Input.consume_break t.input;
871 skip_to_content_indent ()
872 | _ ->
873 (* Has non-whitespace content *)
874 !content_indent
875 end
876 end else
877 !content_indent
878 end else begin
879 (* Implicit indent - skip empty lines without consuming spaces.
880 Note: Only SPACES count as indentation. Tabs are content, not indentation.
881 So we only check for spaces when determining if a line is "empty". *)
882 if Input.next_is_break t.input then begin
883 Buffer.add_char trailing_breaks '\n';
884 Input.consume_break t.input;
885 skip_to_content_indent ()
886 end else if Input.next_is (( = ) ' ') t.input then begin
887 (* Check if line is empty (only spaces before break) *)
888 let idx = ref 0 in
889 while match Input.peek_nth t.input !idx with
890 | Some ' ' -> incr idx; true
891 | _ -> false
892 do () done;
893 match Input.peek_nth t.input (!idx) with
894 | None | Some '\n' | Some '\r' ->
895 (* Line has only spaces - empty line *)
896 (* Track max indent of empty lines for later validation *)
897 if !idx > !max_empty_line_indent then
898 max_empty_line_indent := !idx;
899 while Input.next_is (( = ) ' ') t.input do
900 ignore (Input.next t.input)
901 done;
902 Buffer.add_char trailing_breaks '\n';
903 Input.consume_break t.input;
904 skip_to_content_indent ()
905 | _ ->
906 (* Has content (including tabs which are content, not indentation) *)
907 0
908 end else if Input.next_is (( = ) '\t') t.input then begin
909 (* Tab at start of line in implicit indent mode - this is an error (Y79Y)
910 because tabs cannot be used as indentation in YAML *)
911 Error.raise_at (Input.mark t.input) Tab_in_indentation
912 end else
913 (* Not at break or space - other content character *)
914 0
915 end
916 in
917
918 (* Read content *)
919 let rec read_lines () =
920 let spaces_skipped = skip_to_content_indent () in
921
922 (* Check if we're at content *)
923 if Input.is_eof t.input then ()
924 else if Input.at_document_boundary t.input then ()
925 else begin
926 (* Count additional leading spaces beyond what was skipped *)
927 let extra_spaces = ref 0 in
928 while Input.next_is (( = ) ' ') t.input do
929 incr extra_spaces;
930 ignore (Input.next t.input)
931 done;
932
933 (* Calculate actual line indentation *)
934 let line_indent = spaces_skipped + !extra_spaces in
935
936 (* Determine content indent from first content line (implicit case) *)
937 let first_line = !content_indent = 0 in
938 (* base_indent is 1-indexed column, convert to 0-indexed for comparison with line_indent.
939 If base_indent = -1 (empty stack), then base_level = -1 means col 0 is valid. *)
940 let base_level = base_indent - 1 in
941 let should_process =
942 if !content_indent = 0 then begin
943 (* For implicit indent, content must be more indented than base_level. *)
944 if line_indent <= base_level then
945 false (* No content - first line not indented enough *)
946 else begin
947 (* Validate: first content line must be indented at least as much as
948 the maximum indent seen on empty lines before it (5LLU, S98Z, W9L4) *)
949 if line_indent < !max_empty_line_indent && line_indent > base_level then
950 Error.raise_at (Input.mark t.input)
951 (Invalid_block_scalar_header "wrongly indented line in block scalar");
952 content_indent := line_indent;
953 true
954 end
955 end else if line_indent < !content_indent then
956 false (* Dedented - done with content *)
957 else
958 true
959 in
960
961 if should_process then begin
962 (* Check if current line is "more indented" (has extra indent or starts with whitespace).
963 For folded scalars, lines that start with any whitespace (space or tab) after the
964 content indentation are "more indented" and preserve breaks.
965 Note: we check Input.next_is_blank BEFORE reading content to see if content starts with whitespace. *)
966 let trailing_blank = line_indent > !content_indent || Input.next_is_blank t.input in
967
968 (* Add trailing breaks to buffer *)
969 if Buffer.length buf > 0 then begin
970 if Buffer.length trailing_breaks > 0 then begin
971 if literal then
972 Buffer.add_buffer buf trailing_breaks
973 else begin
974 (* Folded scalar: fold only if both previous and current lines are not more-indented *)
975 if not !leading_blank && not trailing_blank then begin
976 let breaks = Buffer.contents trailing_breaks in
977 if String.length breaks = 1 then
978 Buffer.add_char buf ' '
979 else
980 Buffer.add_substring buf breaks 1 (String.length breaks - 1)
981 end else begin
982 (* Preserve breaks for more-indented lines *)
983 Buffer.add_buffer buf trailing_breaks
984 end
985 end
986 end else if not literal then
987 Buffer.add_char buf ' '
988 end else
989 Buffer.add_buffer buf trailing_breaks;
990 Buffer.clear trailing_breaks;
991
992 (* Add extra indentation for literal or more-indented folded lines *)
993 (* On the first line (when determining content_indent), we've already consumed all spaces,
994 so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *)
995 if not first_line && (literal || (!extra_spaces > 0 && not literal)) then begin
996 for _ = 1 to !extra_spaces do
997 Buffer.add_char buf ' '
998 done
999 end;
1000
1001 (* Read line content *)
1002 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
1003 Buffer.add_char buf (Input.next_exn t.input)
1004 done;
1005
1006 (* Record trailing break *)
1007 if Input.next_is_break t.input then begin
1008 Buffer.add_char trailing_breaks '\n';
1009 Input.consume_break t.input
1010 end;
1011
1012 (* Update leading_blank for next iteration *)
1013 leading_blank := trailing_blank;
1014
1015 read_lines ()
1016 end
1017 end
1018 in
1019
1020 read_lines ();
1021
1022 (* Apply chomping *)
1023 let value =
1024 let content = Buffer.contents buf in
1025 match !chomping with
1026 | Chomping.Strip -> content
1027 | Chomping.Clip ->
1028 if String.length content > 0 then content ^ "\n" else content
1029 | Chomping.Keep ->
1030 content ^ Buffer.contents trailing_breaks
1031 in
1032
1033 let span = Span.make ~start ~stop:(Input.mark t.input) in
1034 let style = if literal then `Literal else `Folded in
1035 (value, style, span)
1036
1037(** Scan directive (after %) *)
1038let scan_directive t =
1039 let start = Input.mark t.input in
1040 ignore (Input.next t.input); (* consume % *)
1041
1042 (* Read directive name *)
1043 let name_buf = Buffer.create 16 in
1044 while
1045 match Input.peek t.input with
1046 | Some c when Input.is_alnum c || c = '-' ->
1047 Buffer.add_char name_buf c;
1048 ignore (Input.next t.input);
1049 true
1050 | _ -> false
1051 do () done;
1052 let name = Buffer.contents name_buf in
1053
1054 (* Skip blanks *)
1055 while Input.next_is_blank t.input do
1056 ignore (Input.next t.input)
1057 done;
1058
1059 match name with
1060 | "YAML" ->
1061 (* Version directive: %YAML 1.2 *)
1062 let major = ref 0 in
1063 let minor = ref 0 in
1064 (* Read major version *)
1065 while Input.next_is_digit t.input do
1066 major := !major * 10 + (Char.code (Input.next_exn t.input) - Char.code '0')
1067 done;
1068 (* Expect . *)
1069 (match Input.peek t.input with
1070 | Some '.' -> ignore (Input.next t.input)
1071 | _ -> Error.raise_at (Input.mark t.input) (Invalid_yaml_version "expected '.'"));
1072 (* Read minor version *)
1073 while Input.next_is_digit t.input do
1074 minor := !minor * 10 + (Char.code (Input.next_exn t.input) - Char.code '0')
1075 done;
1076 (* Validate: only whitespace and comments allowed before line break (MUS6) *)
1077 skip_whitespace_and_comment t;
1078 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then
1079 Error.raise_at (Input.mark t.input) (Invalid_directive "expected comment or line break after version");
1080 let span = Span.make ~start ~stop:(Input.mark t.input) in
1081 Token.Version_directive { major = !major; minor = !minor }, span
1082
1083 | "TAG" ->
1084 (* Tag directive: %TAG !foo! tag:example.com,2000: *)
1085 let handle = scan_tag_handle t in
1086 (* Skip blanks *)
1087 while Input.next_is_blank t.input do
1088 ignore (Input.next t.input)
1089 done;
1090 (* Read prefix *)
1091 let prefix_buf = Buffer.create 32 in
1092 while
1093 match Input.peek t.input with
1094 | Some c when not (Input.is_whitespace c) ->
1095 Buffer.add_char prefix_buf c;
1096 ignore (Input.next t.input);
1097 true
1098 | _ -> false
1099 do () done;
1100 let prefix = Buffer.contents prefix_buf in
1101 let span = Span.make ~start ~stop:(Input.mark t.input) in
1102 Token.Tag_directive { handle; prefix }, span
1103
1104 | _ ->
1105 (* Reserved/Unknown directive - skip to end of line and ignore *)
1106 (* Per YAML spec, reserved directives should be ignored with a warning *)
1107 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
1108 ignore (Input.next t.input)
1109 done;
1110 let span = Span.make ~start ~stop:(Input.mark t.input) in
1111 (* Return an empty tag directive token to indicate directive was processed but ignored *)
1112 Token.Tag_directive { handle = ""; prefix = "" }, span
1113
1114(** Fetch the next token(s) into the queue *)
1115let rec fetch_next_token t =
1116 skip_to_next_token t;
1117 stale_simple_keys t;
1118 let col = column t in
1119 (* Unroll indents that are deeper than current column.
1120 Note: we use col, not col-1, to allow entries at the same level. *)
1121 unroll_indent t col;
1122
1123 (* We're about to process actual content, not leading whitespace *)
1124 t.leading_whitespace <- false;
1125
1126 if Input.is_eof t.input then
1127 fetch_stream_end t
1128 else if Input.at_document_boundary t.input then
1129 fetch_document_indicator t
1130 else begin
1131 match Input.peek t.input with
1132 | None -> fetch_stream_end t
1133 | Some '%' when (Input.position t.input).column = 1 ->
1134 fetch_directive t
1135 | Some '[' -> fetch_flow_collection_start t Token.Flow_sequence_start
1136 | Some '{' -> fetch_flow_collection_start t Token.Flow_mapping_start
1137 | Some ']' -> fetch_flow_collection_end t Token.Flow_sequence_end
1138 | Some '}' -> fetch_flow_collection_end t Token.Flow_mapping_end
1139 | Some ',' -> fetch_flow_entry t
1140 | Some '-' when t.flow_level = 0 && check_block_entry t ->
1141 fetch_block_entry t
1142 | Some '?' when check_key t ->
1143 fetch_key t
1144 | Some ':' when check_value t ->
1145 fetch_value t
1146 | Some '*' -> fetch_alias t
1147 | Some '&' -> fetch_anchor t
1148 | Some '!' -> fetch_tag t
1149 | Some '|' when t.flow_level = 0 -> fetch_block_scalar t true
1150 | Some '>' when t.flow_level = 0 -> fetch_block_scalar t false
1151 | Some '\'' -> fetch_single_quoted t
1152 | Some '"' -> fetch_double_quoted t
1153 | Some '-' when can_start_plain t ->
1154 fetch_plain_scalar t
1155 | Some '?' when can_start_plain t ->
1156 fetch_plain_scalar t
1157 | Some ':' when can_start_plain t ->
1158 fetch_plain_scalar t
1159 | Some c when can_start_plain_char c t ->
1160 fetch_plain_scalar t
1161 | Some c ->
1162 Error.raise_at (Input.mark t.input) (Unexpected_character c)
1163 end
1164
1165and fetch_stream_end t =
1166 if not t.stream_ended then begin
1167 unroll_indent t (-1);
1168 remove_simple_key t;
1169 t.allow_simple_key <- false;
1170 t.stream_ended <- true;
1171 let span = Span.point (Input.mark t.input) in
1172 emit t span Token.Stream_end
1173 end
1174
1175and fetch_document_indicator t =
1176 unroll_indent t (-1);
1177 remove_simple_key t;
1178 t.allow_simple_key <- false;
1179 let start = Input.mark t.input in
1180 let indicator = Input.peek_string t.input 3 in
1181 Input.skip t.input 3;
1182 let span = Span.make ~start ~stop:(Input.mark t.input) in
1183 let token = if indicator = "---" then Token.Document_start else Token.Document_end in
1184 (* Reset document content flag after document end marker *)
1185 if indicator = "..." then begin
1186 t.document_has_content <- false;
1187 (* After document end marker, skip whitespace and check for end of line or comment *)
1188 while Input.next_is_blank t.input do ignore (Input.next t.input) done;
1189 (match Input.peek t.input with
1190 | None -> () (* EOF is ok *)
1191 | Some c when Input.is_break c -> ()
1192 | Some '#' -> () (* Comment is ok *)
1193 | _ -> Error.raise_at start (Invalid_directive "content not allowed after document end marker on same line"))
1194 end;
1195 emit t span token
1196
1197and fetch_directive t =
1198 (* Directives can only appear:
1199 1. At stream start (before any document content)
1200 2. After a document end marker (...)
1201 If we've emitted content in the current document, we need a document end marker first *)
1202 if t.document_has_content then
1203 Error.raise_at (Input.mark t.input)
1204 (Unexpected_token "directives must be separated from document content by document end marker (...)");
1205 unroll_indent t (-1);
1206 remove_simple_key t;
1207 t.allow_simple_key <- false;
1208 let token, span = scan_directive t in
1209 emit t span token
1210
1211and fetch_flow_collection_start t token_type =
1212 save_simple_key t;
1213 (* Record indent of outermost flow collection *)
1214 if t.flow_level = 0 then
1215 t.flow_indent <- column t;
1216 t.flow_level <- t.flow_level + 1;
1217 (* Track whether this is a mapping or sequence *)
1218 let is_mapping = (token_type = Token.Flow_mapping_start) in
1219 t.flow_mapping_stack <- is_mapping :: t.flow_mapping_stack;
1220 t.allow_simple_key <- true;
1221 t.simple_keys <- None :: t.simple_keys;
1222 t.document_has_content <- true;
1223 let start = Input.mark t.input in
1224 ignore (Input.next t.input);
1225 let span = Span.make ~start ~stop:(Input.mark t.input) in
1226 emit t span token_type
1227
1228and fetch_flow_collection_end t token_type =
1229 remove_simple_key t;
1230 t.flow_level <- t.flow_level - 1;
1231 t.flow_mapping_stack <- (match t.flow_mapping_stack with _ :: rest -> rest | [] -> []);
1232 t.simple_keys <- (match t.simple_keys with _ :: rest -> rest | [] -> []);
1233 t.allow_simple_key <- false;
1234 let start = Input.mark t.input in
1235 ignore (Input.next t.input);
1236 (* Allow adjacent values after flow collection ends *)
1237 if t.flow_level > 0 then
1238 t.adjacent_value_allowed_at <- Some (Input.position t.input);
1239 let span = Span.make ~start ~stop:(Input.mark t.input) in
1240 emit t span token_type
1241
1242and fetch_flow_entry t =
1243 remove_simple_key t;
1244 t.allow_simple_key <- true;
1245 let start = Input.mark t.input in
1246 ignore (Input.next t.input);
1247 let span = Span.make ~start ~stop:(Input.mark t.input) in
1248 emit t span Token.Flow_entry
1249
1250and check_block_entry t =
1251 (* - followed by whitespace or EOF *)
1252 match Input.peek_nth t.input 1 with
1253 | None -> true
1254 | Some c -> Input.is_whitespace c
1255
1256and fetch_block_entry t =
1257 if t.flow_level = 0 then begin
1258 (* Block entries require allow_simple_key to be true.
1259 This prevents block sequences on the same line as a mapping value,
1260 e.g., "key: - a" is invalid. *)
1261 if not t.allow_simple_key then
1262 Error.raise_at (Input.mark t.input) Block_sequence_disallowed;
1263 let col = column t in
1264 if roll_indent t col ~sequence:true then begin
1265 let span = Span.point (Input.mark t.input) in
1266 emit t span Token.Block_sequence_start
1267 end
1268 end;
1269 remove_simple_key t;
1270 t.allow_simple_key <- true;
1271 t.document_has_content <- true;
1272 let start = Input.mark t.input in
1273 ignore (Input.next t.input);
1274
1275 (* Check for tabs after - : pattern like -\t- is invalid *)
1276 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in
1277 if found_tabs then begin
1278 (* If we found tabs and next char is - followed by whitespace, error *)
1279 match Input.peek t.input with
1280 | Some '-' ->
1281 (match Input.peek_nth t.input 1 with
1282 | None -> Error.raise_at start Tab_in_indentation
1283 | Some c when Input.is_whitespace c ->
1284 Error.raise_at start Tab_in_indentation
1285 | Some _ -> ())
1286 | _ -> ()
1287 end;
1288
1289 let span = Span.make ~start ~stop:(Input.mark t.input) in
1290 emit t span Token.Block_entry
1291
1292and check_key t =
1293 (* ? followed by whitespace or flow indicator in both block and flow *)
1294 match Input.peek_nth t.input 1 with
1295 | None -> true
1296 | Some c ->
1297 Input.is_whitespace c ||
1298 (t.flow_level > 0 && Input.is_flow_indicator c)
1299
1300and fetch_key t =
1301 if t.flow_level = 0 then begin
1302 if not t.allow_simple_key then
1303 Error.raise_at (Input.mark t.input) Expected_key;
1304 let col = column t in
1305 if roll_indent t col ~sequence:false then begin
1306 let span = Span.point (Input.mark t.input) in
1307 emit t span Token.Block_mapping_start
1308 end
1309 end;
1310 remove_simple_key t;
1311 t.allow_simple_key <- t.flow_level = 0;
1312 t.document_has_content <- true;
1313 let start = Input.mark t.input in
1314 ignore (Input.next t.input);
1315
1316 (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *)
1317 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in
1318 if found_tabs && t.flow_level = 0 then begin
1319 (* In block context, tabs after ? are not allowed *)
1320 Error.raise_at start Tab_in_indentation
1321 end;
1322
1323 let span = Span.make ~start ~stop:(Input.mark t.input) in
1324 emit t span Token.Key;
1325 t.pending_value <- true (* We've emitted a KEY, now waiting for VALUE *)
1326
1327and check_value t =
1328 (* : followed by whitespace in block, or whitespace/flow indicator in flow, or adjacent value *)
1329 match Input.peek_nth t.input 1 with
1330 | None -> true
1331 | Some c ->
1332 Input.is_whitespace c ||
1333 (t.flow_level > 0 && Input.is_flow_indicator c) ||
1334 (* Allow adjacent values in flow context at designated positions *)
1335 (t.flow_level > 0 &&
1336 match t.adjacent_value_allowed_at with
1337 | Some pos -> pos.Position.line = (Input.position t.input).Position.line &&
1338 pos.Position.column = (Input.position t.input).Position.column
1339 | None -> false)
1340
1341and fetch_value t =
1342 let start = Input.mark t.input in
1343 (* Check for simple key *)
1344 let used_simple_key =
1345 match t.simple_keys with
1346 | Some sk :: _ when sk.sk_possible ->
1347 (* In implicit flow mapping (inside a flow sequence), key and : must be on the same line.
1348 In explicit flow mapping { }, key and : can span lines. *)
1349 let is_implicit_flow_mapping = match t.flow_mapping_stack with
1350 | false :: _ -> true (* false = we're in a sequence, so any mapping is implicit *)
1351 | _ -> false
1352 in
1353 if is_implicit_flow_mapping && sk.sk_position.line < (Input.position t.input).line then
1354 Error.raise_at start Illegal_flow_key_line;
1355 (* Insert KEY token before the simple key value *)
1356 let key_span = Span.point sk.sk_position in
1357 let key_token = { Token.token = Token.Key; span = key_span } in
1358 (* We need to insert at the right position *)
1359 let tokens = Queue.to_seq t.tokens |> Array.of_seq in
1360 Queue.clear t.tokens;
1361 let insert_pos = sk.sk_token_number - t.tokens_taken in
1362 Array.iteri (fun i tok ->
1363 if i = insert_pos then Queue.add key_token t.tokens;
1364 Queue.add tok t.tokens
1365 ) tokens;
1366 if insert_pos >= Array.length tokens then
1367 Queue.add key_token t.tokens;
1368 t.token_number <- t.token_number + 1;
1369 t.pending_value <- true; (* We've inserted a KEY token, now waiting for VALUE *)
1370 (* Roll indent for implicit block mapping *)
1371 if t.flow_level = 0 then begin
1372 let col = sk.sk_position.column in
1373 if roll_indent t col ~sequence:false then begin
1374 let span = Span.point sk.sk_position in
1375 (* Insert block mapping start before key *)
1376 let bm_token = { Token.token = Token.Block_mapping_start; span } in
1377 let tokens = Queue.to_seq t.tokens |> Array.of_seq in
1378 Queue.clear t.tokens;
1379 Array.iteri (fun i tok ->
1380 if i = insert_pos then Queue.add bm_token t.tokens;
1381 Queue.add tok t.tokens
1382 ) tokens;
1383 if insert_pos >= Array.length tokens then
1384 Queue.add bm_token t.tokens;
1385 t.token_number <- t.token_number + 1
1386 end
1387 end;
1388 t.simple_keys <- None :: (List.tl t.simple_keys);
1389 true
1390 | _ ->
1391 (* No simple key - this is a complex value (or empty key) *)
1392 if t.flow_level = 0 then begin
1393 if not t.allow_simple_key then
1394 Error.raise_at (Input.mark t.input) Expected_key;
1395 let col = column t in
1396 if roll_indent t col ~sequence:false then begin
1397 let span = Span.point (Input.mark t.input) in
1398 emit t span Token.Block_mapping_start
1399 end
1400 (* Note: We don't emit KEY here. Empty key handling is done by the parser,
1401 which emits empty scalar when it sees VALUE without preceding KEY. *)
1402 end;
1403 false
1404 in
1405 remove_simple_key t;
1406 (* In block context without simple key, allow simple keys for compact mappings like ": moon: white"
1407 In flow context or after using a simple key, disallow simple keys *)
1408 t.allow_simple_key <- (not used_simple_key) && (t.flow_level = 0);
1409 t.document_has_content <- true;
1410 let start = Input.mark t.input in
1411 ignore (Input.next t.input);
1412
1413 (* Check for tabs after : : patterns like :\t- or :\tkey: are invalid in block context (Y79Y/09)
1414 However, :\t bar (tab followed by space then content) is valid (6BCT) *)
1415 let (found_tabs, found_spaces) = skip_blanks_check_tabs t in
1416 if found_tabs && not found_spaces && t.flow_level = 0 then begin
1417 (* In block context, tabs-only after : followed by indicator or alphanumeric are not allowed *)
1418 match Input.peek t.input with
1419 | Some ('-' | '?') ->
1420 Error.raise_at start Tab_in_indentation
1421 | Some c when (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ->
1422 (* Tab-only followed by alphanumeric - likely a key, which is invalid *)
1423 Error.raise_at start Tab_in_indentation
1424 | _ -> ()
1425 end;
1426
1427 (* Skip any comment that may follow the colon and whitespace *)
1428 skip_whitespace_and_comment t;
1429
1430 let span = Span.make ~start ~stop:(Input.mark t.input) in
1431 emit t span Token.Value;
1432 t.pending_value <- false (* We've emitted a VALUE, no longer pending *)
1433
1434and fetch_alias t =
1435 save_simple_key t;
1436 t.allow_simple_key <- false;
1437 t.document_has_content <- true;
1438 let start = Input.mark t.input in
1439 ignore (Input.next t.input); (* consume * *)
1440 let name, span = scan_anchor_alias t in
1441 let span = Span.make ~start ~stop:span.stop in
1442 emit t span (Token.Alias name)
1443
1444and fetch_anchor t =
1445 save_simple_key t;
1446 t.allow_simple_key <- false;
1447 t.document_has_content <- true;
1448 let start = Input.mark t.input in
1449 ignore (Input.next t.input); (* consume & *)
1450 let name, span = scan_anchor_alias t in
1451 let span = Span.make ~start ~stop:span.stop in
1452 emit t span (Token.Anchor name)
1453
1454and fetch_tag t =
1455 save_simple_key t;
1456 t.allow_simple_key <- false;
1457 t.document_has_content <- true;
1458 let handle, suffix, span = scan_tag t in
1459 emit t span (Token.Tag { handle; suffix })
1460
1461and fetch_block_scalar t literal =
1462 remove_simple_key t;
1463 t.allow_simple_key <- true;
1464 t.document_has_content <- true;
1465 let value, style, span = scan_block_scalar t literal in
1466 emit t span (Token.Scalar { style; value })
1467
1468and fetch_single_quoted t =
1469 save_simple_key t;
1470 t.allow_simple_key <- false;
1471 t.document_has_content <- true;
1472 let value, span = scan_single_quoted t in
1473 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
1474 skip_to_next_token t;
1475 if t.flow_level > 0 then
1476 t.adjacent_value_allowed_at <- Some (Input.position t.input);
1477 emit t span (Token.Scalar { style = `Single_quoted; value })
1478
1479and fetch_double_quoted t =
1480 save_simple_key t;
1481 t.allow_simple_key <- false;
1482 t.document_has_content <- true;
1483 let value, span = scan_double_quoted t in
1484 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
1485 skip_to_next_token t;
1486 if t.flow_level > 0 then
1487 t.adjacent_value_allowed_at <- Some (Input.position t.input);
1488 emit t span (Token.Scalar { style = `Double_quoted; value })
1489
1490and can_start_plain t =
1491 (* Check if - ? : can start a plain scalar *)
1492 match Input.peek_nth t.input 1 with
1493 | None -> false
1494 | Some c ->
1495 not (Input.is_whitespace c) &&
1496 (t.flow_level = 0 || not (Input.is_flow_indicator c))
1497
1498and can_start_plain_char c _t =
1499 (* Characters that can start a plain scalar *)
1500 if Input.is_whitespace c then false
1501 else if Input.is_indicator c then false
1502 else true
1503
1504and fetch_plain_scalar t =
1505 save_simple_key t;
1506 t.allow_simple_key <- false;
1507 t.document_has_content <- true;
1508 let value, span, ended_with_linebreak = scan_plain_scalar t in
1509 (* If the plain scalar ended after crossing a line break (leading_blanks = true),
1510 allow simple keys. This is important because the scanner already consumed the
1511 line break and leading whitespace when checking for continuation. *)
1512 if ended_with_linebreak then
1513 t.allow_simple_key <- true;
1514 emit t span (Token.Scalar { style = `Plain; value })
1515
1516(** Check if we need more tokens to resolve simple keys *)
1517let need_more_tokens t =
1518 if t.stream_ended then false
1519 else if Queue.is_empty t.tokens then true
1520 else
1521 (* Check if any simple key could affect the first queued token *)
1522 List.exists (function
1523 | Some sk when sk.sk_possible ->
1524 sk.sk_token_number >= t.tokens_taken
1525 | _ -> false
1526 ) t.simple_keys
1527
1528(** Ensure we have enough tokens to return one safely *)
1529let ensure_tokens t =
1530 if not t.stream_started then begin
1531 t.stream_started <- true;
1532 let span = Span.point (Input.position t.input) in
1533 let encoding, _ = Encoding.detect (Input.source t.input) in
1534 emit t span (Token.Stream_start encoding)
1535 end;
1536 while need_more_tokens t do
1537 fetch_next_token t
1538 done
1539
1540(** Get next token *)
1541let next t =
1542 ensure_tokens t;
1543 if Queue.is_empty t.tokens then
1544 None
1545 else begin
1546 t.tokens_taken <- t.tokens_taken + 1;
1547 Some (Queue.pop t.tokens)
1548 end
1549
1550(** Peek at next token *)
1551let peek t =
1552 ensure_tokens t;
1553 Queue.peek_opt t.tokens
1554
1555(** Iterate over all tokens *)
1556let iter f t =
1557 let rec loop () =
1558 match next t with
1559 | None -> ()
1560 | Some tok -> f tok; loop ()
1561 in
1562 loop ()
1563
1564(** Fold over all tokens *)
1565let fold f init t =
1566 let rec loop acc =
1567 match next t with
1568 | None -> acc
1569 | Some tok -> loop (f acc tok)
1570 in
1571 loop init
1572
1573(** Convert to list *)
1574let to_list t =
1575 fold (fun acc tok -> tok :: acc) [] t |> List.rev