Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** YAML tokenizer/scanner with lookahead for ambiguity resolution *)
7
8type simple_key = {
9 sk_possible : bool;
10 sk_required : bool;
11 sk_token_number : int;
12 sk_position : Position.t;
13}
14(** Simple key tracking for mapping key disambiguation *)
15
16type indent = {
17 indent : int;
18 needs_block_end : bool;
19}
20(** Indent level tracking *)
21
22type t = {
23 input : Input.t;
24 tokens : Token.spanned Queue.t;
25 mutable token_number : int;
26 mutable tokens_taken : int;
27 mutable stream_started : bool;
28 mutable stream_ended : bool;
29 mutable indent_stack : indent list; (** Stack of indentation levels *)
30 mutable flow_level : int; (** Nesting depth in \[\] or \{\} *)
31 mutable flow_indent : int;
32 (** Column where outermost flow collection started *)
33 mutable simple_keys : simple_key option list;
34 (** Per flow-level simple key tracking *)
35 mutable allow_simple_key : bool;
36 mutable leading_whitespace : bool;
37 (** True when at start of line (only whitespace seen) *)
38 mutable document_has_content : bool;
39 (** True if we've emitted content tokens in current document *)
40 mutable adjacent_value_allowed_at : Position.t option;
41 (** Position where adjacent : is allowed *)
42 mutable flow_mapping_stack : bool list;
43 (** Stack of whether each flow level is a mapping *)
44}
45
46let create input =
47 {
48 input;
49 tokens = Queue.create ();
50 token_number = 0;
51 tokens_taken = 0;
52 stream_started = false;
53 stream_ended = false;
54 indent_stack = [];
55 flow_level = 0;
56 flow_indent = 0;
57 simple_keys = [ None ];
58 (* One entry for the base level *)
59 allow_simple_key = true;
60 leading_whitespace = true;
61 (* Start at beginning of stream *)
62 document_has_content = false;
63 adjacent_value_allowed_at = None;
64 flow_mapping_stack = [];
65 }
66
67let of_string s = create (Input.of_string s)
68let of_input = create
69let of_reader r = create (Input.of_reader r)
70let position t = Input.position t.input
71
72(** Add a token to the queue *)
73let emit t span token =
74 Queue.add { Token.token; span } t.tokens;
75 t.token_number <- t.token_number + 1
76
77(** Get current column (1-indexed) *)
78let column t = (Input.position t.input).column
79
80(** Get current indent level *)
81let current_indent t =
82 match t.indent_stack with [] -> -1 | { indent; _ } :: _ -> indent
83
84(** Skip whitespace to end of line, checking for valid comments. Returns true if
85 any whitespace (including tabs) was found before a comment. *)
86let skip_whitespace_and_comment t =
87 let has_whitespace = ref false in
88 (* Skip blanks (spaces and tabs) *)
89 while Input.next_is_blank t.input do
90 has_whitespace := true;
91 ignore (Input.next t.input)
92 done;
93 (* Check for comment *)
94 if Input.next_is (( = ) '#') t.input then begin
95 (* Validate: comment must be preceded by whitespace or be at start of line *)
96 if not !has_whitespace then begin
97 (* Check if we're at the start of input or after whitespace (blank or line break) *)
98 match Input.peek_back t.input with
99 | None -> () (* Start of input - OK *)
100 | Some c when Input.is_whitespace c -> () (* After whitespace - OK *)
101 | _ ->
102 (* Comment not preceded by whitespace - ERROR *)
103 Error.raise_at (Input.mark t.input) Invalid_comment
104 end;
105 (* Skip to end of line *)
106 while (not (Input.is_eof t.input)) && not (Input.next_is_break t.input) do
107 ignore (Input.next t.input)
108 done
109 end
110
111(** Skip blanks (spaces/tabs) and return (found_tabs, found_spaces) *)
112let skip_blanks_check_tabs t =
113 let found_tab = ref false in
114 let found_space = ref false in
115 while Input.next_is_blank t.input do
116 (match Input.peek t.input with
117 | Some '\t' -> found_tab := true
118 | Some ' ' -> found_space := true
119 | _ -> ());
120 ignore (Input.next t.input)
121 done;
122 (!found_tab, !found_space)
123
124(** Skip whitespace and comments, return true if at newline *)
125let rec skip_to_next_token t =
126 (* Check for tabs used as indentation in block context *)
127 (match Input.peek t.input with
128 | Some '\t'
129 when t.flow_level = 0 && t.leading_whitespace
130 && column t - 1 < current_indent t ->
131 (* Tab found in indentation zone - this is invalid *)
132 (* Skip to end of line to check if line has content *)
133 let start_pos = Input.mark t.input in
134 while Input.next_is_blank t.input do
135 ignore (Input.next t.input)
136 done;
137 (* If we have content on this line with a tab, raise error *)
138 if (not (Input.next_is_break t.input)) && not (Input.is_eof t.input) then
139 Error.raise_at start_pos Tab_in_indentation
140 | _ -> ());
141
142 (* Skip blanks and validate comments *)
143 skip_whitespace_and_comment t;
144 (* Skip line break in block context *)
145 if t.flow_level = 0 && Input.next_is_break t.input then begin
146 Input.consume_break t.input;
147 t.allow_simple_key <- true;
148 t.leading_whitespace <- true;
149 skip_to_next_token t
150 end
151 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin
152 (* In flow context, skip all whitespace including line breaks *)
153 if Input.next_is_break t.input then begin
154 Input.consume_break t.input;
155 (* Allow simple keys after line breaks in flow context *)
156 t.allow_simple_key <- true;
157 (* After line break in flow, check for tabs at start of line (Y79Y/03)
158 Tabs are not allowed as indentation - if tab is first char and results
159 in a column less than flow_indent, it's an error *)
160 if Input.next_is (( = ) '\t') t.input then begin
161 (* Tab at start of line in flow context - skip tabs and check position *)
162 let start_mark = Input.mark t.input in
163 while Input.next_is (( = ) '\t') t.input do
164 ignore (Input.next t.input)
165 done;
166 (* If only tabs were used (no spaces) and column < flow_indent, error *)
167 if
168 (not (Input.next_is_break t.input))
169 && (not (Input.is_eof t.input))
170 && column t < t.flow_indent
171 then Error.raise_at start_mark Invalid_flow_indentation
172 end;
173 skip_to_next_token t
174 end
175 else begin
176 ignore (Input.next t.input);
177 skip_to_next_token t
178 end
179 end
180
181(** Roll the indentation level *)
182let roll_indent t col =
183 if t.flow_level = 0 && col > current_indent t then begin
184 t.indent_stack <- { indent = col; needs_block_end = true } :: t.indent_stack;
185 true
186 end
187 else false
188
189(** Unroll indentation to given column *)
190let unroll_indent t col =
191 while
192 t.flow_level = 0
193 &&
194 match t.indent_stack with
195 | { indent; needs_block_end = true; _ } :: _ when indent > col -> true
196 | _ -> false
197 do
198 match t.indent_stack with
199 | { indent = _; needs_block_end = true; _ } :: rest ->
200 let pos = Input.position t.input in
201 let span = Span.point pos in
202 emit t span Token.Block_end;
203 t.indent_stack <- rest
204 | _ -> ()
205 done
206
207(** Save a potential simple key *)
208let save_simple_key t =
209 if t.allow_simple_key then begin
210 (* A simple key is required only if we're in a block context,
211 at the current indentation level, AND the current indent needs a block end.
212 This matches saphyr's logic and prevents false positives for values. *)
213 let required =
214 t.flow_level = 0
215 &&
216 match t.indent_stack with
217 | { indent; needs_block_end = true; _ } :: _ -> indent = column t
218 | _ -> false
219 in
220 let sk =
221 {
222 sk_possible = true;
223 sk_required = required;
224 sk_token_number = t.token_number;
225 sk_position = Input.position t.input;
226 }
227 in
228 (* Remove any existing simple key at current level *)
229 t.simple_keys <-
230 (match t.simple_keys with
231 | _ :: rest -> Some sk :: rest
232 | [] -> [ Some sk ])
233 end
234
235(** Remove simple key at current level *)
236let remove_simple_key t =
237 match t.simple_keys with
238 | Some sk :: _rest when sk.sk_required ->
239 Error.raise_at sk.sk_position Expected_key
240 | _ :: rest -> t.simple_keys <- None :: rest
241 | [] -> ()
242
243(** Stale simple keys that span too many tokens *)
244let stale_simple_keys t =
245 t.simple_keys <-
246 List.map
247 (fun sk_opt ->
248 match sk_opt with
249 | Some sk
250 when sk.sk_possible
251 && (Input.position t.input).line > sk.sk_position.line
252 && t.flow_level = 0 ->
253 if sk.sk_required then Error.raise_at sk.sk_position Expected_key;
254 None
255 | _ -> sk_opt)
256 t.simple_keys
257
258(** Read anchor or alias name *)
259let scan_anchor_alias t =
260 let start = Input.mark t.input in
261 let buf = Buffer.create 16 in
262 (* Per YAML 1.2 spec: anchor names can contain any character that is NOT:
263 - Whitespace (space, tab, line breaks)
264 - Flow indicators: []{}
265 - Comma (,)
266 This matches the saphyr implementation: is_yaml_non_space && !is_flow *)
267 while
268 match Input.peek t.input with
269 | Some c
270 when (not (Input.is_whitespace c))
271 && (not (Input.is_flow_indicator c))
272 && c <> '\x00' ->
273 Buffer.add_char buf c;
274 ignore (Input.next t.input);
275 true
276 | _ -> false
277 do
278 ()
279 done;
280 let name = Buffer.contents buf in
281 if String.length name = 0 then
282 Error.raise_at start (Invalid_anchor "empty anchor name");
283 (name, Span.make ~start ~stop:(Input.mark t.input))
284
285(** Scan tag handle *)
286let scan_tag_handle t =
287 let start = Input.mark t.input in
288 let buf = Buffer.create 16 in
289 (* Expect ! *)
290 (match Input.peek t.input with
291 | Some '!' ->
292 Buffer.add_char buf '!';
293 ignore (Input.next t.input)
294 | _ -> Error.raise_at start (Invalid_tag "expected '!'"));
295 (* Read word chars *)
296 while
297 match Input.peek t.input with
298 | Some c when Input.is_alnum c || c = '-' ->
299 Buffer.add_char buf c;
300 ignore (Input.next t.input);
301 true
302 | _ -> false
303 do
304 ()
305 done;
306 (* Check for secondary ! *)
307 (match Input.peek t.input with
308 | Some '!' ->
309 Buffer.add_char buf '!';
310 ignore (Input.next t.input)
311 | _ -> ());
312 Buffer.contents buf
313
314(** Scan tag suffix (after handle) *)
315let scan_tag_suffix t =
316 let is_hex_digit c =
317 (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')
318 in
319 let hex_val c =
320 match c with
321 | '0' .. '9' -> Char.code c - Char.code '0'
322 | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10
323 | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10
324 | _ -> 0
325 in
326 let buf = Buffer.create 32 in
327 while
328 match Input.peek t.input with
329 | Some '%' -> (
330 (* Percent-encoded character *)
331 ignore (Input.next t.input);
332 match (Input.peek t.input, Input.peek_nth t.input 1) with
333 | Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 ->
334 ignore (Input.next t.input);
335 ignore (Input.next t.input);
336 let code = (hex_val c1 * 16) + hex_val c2 in
337 Buffer.add_char buf (Char.chr code);
338 true
339 | _ ->
340 (* Invalid percent encoding - keep the % *)
341 Buffer.add_char buf '%';
342 true)
343 | Some c
344 when (not (Input.is_whitespace c)) && not (Input.is_flow_indicator c) ->
345 Buffer.add_char buf c;
346 ignore (Input.next t.input);
347 true
348 | _ -> false
349 do
350 ()
351 done;
352 Buffer.contents buf
353
354(** Scan a tag *)
355let scan_tag t =
356 let start = Input.mark t.input in
357 ignore (Input.next t.input);
358 (* consume ! *)
359 let handle, suffix =
360 match Input.peek t.input with
361 | Some '<' ->
362 (* Verbatim tag: !<...> - handle is empty, suffix is full URI *)
363 ignore (Input.next t.input);
364 let buf = Buffer.create 32 in
365 while
366 match Input.peek t.input with
367 | Some '>' -> false
368 | Some c ->
369 Buffer.add_char buf c;
370 ignore (Input.next t.input);
371 true
372 | None ->
373 Error.raise_at (Input.mark t.input)
374 (Invalid_tag "unclosed verbatim tag")
375 do
376 ()
377 done;
378 ignore (Input.next t.input);
379 (* consume > *)
380 ("", Buffer.contents buf)
381 | Some c when Input.is_whitespace c || Input.is_flow_indicator c ->
382 (* Non-specific tag: ! *)
383 ("!", "")
384 | Some '!' ->
385 (* Secondary handle: !! *)
386 ignore (Input.next t.input);
387 (* consume second ! *)
388 let suffix = scan_tag_suffix t in
389 ("!!", suffix)
390 | _ -> (
391 (* Primary handle or just suffix: !foo or !e!foo *)
392 (* Read alphanumeric characters *)
393 let buf = Buffer.create 16 in
394 while
395 match Input.peek t.input with
396 | Some c when Input.is_alnum c || c = '-' ->
397 Buffer.add_char buf c;
398 ignore (Input.next t.input);
399 true
400 | _ -> false
401 do
402 ()
403 done;
404 (* Check if next character is ! - if so, this is a named handle *)
405 match Input.peek t.input with
406 | Some '!' ->
407 (* Named handle like !e! *)
408 ignore (Input.next t.input);
409 let handle_name = Buffer.contents buf in
410 let suffix = scan_tag_suffix t in
411 ("!" ^ handle_name ^ "!", suffix)
412 | _ ->
413 (* Just ! followed by suffix *)
414 ("!", Buffer.contents buf ^ scan_tag_suffix t))
415 in
416 (* Validate that tag is followed by whitespace, break, or (in flow) flow indicator *)
417 (match Input.peek t.input with
418 | None -> () (* EOF is ok *)
419 | Some c when Input.is_whitespace c || Input.is_break c -> ()
420 | Some c when t.flow_level > 0 && Input.is_flow_indicator c -> ()
421 | _ ->
422 Error.raise_at start
423 (Invalid_tag "expected whitespace or line break after tag"));
424 let span = Span.make ~start ~stop:(Input.mark t.input) in
425 (handle, suffix, span)
426
427(** Scan single-quoted scalar *)
428let scan_single_quoted t =
429 let start = Input.mark t.input in
430 ignore (Input.next t.input);
431 (* consume opening single-quote *)
432 let buf = Buffer.create 64 in
433 let whitespace = Buffer.create 16 in
434 (* Track trailing whitespace *)
435
436 let flush_whitespace () =
437 if Buffer.length whitespace > 0 then begin
438 Buffer.add_buffer buf whitespace;
439 Buffer.clear whitespace
440 end
441 in
442
443 let rec loop () =
444 match Input.peek t.input with
445 | None -> Error.raise_at start Unclosed_single_quote
446 | Some '\'' -> (
447 ignore (Input.next t.input);
448 (* Check for escaped quote ('') *)
449 match Input.peek t.input with
450 | Some '\'' ->
451 flush_whitespace ();
452 Buffer.add_char buf '\'';
453 ignore (Input.next t.input);
454 loop ()
455 | _ ->
456 (* End of string - flush any trailing whitespace *)
457 flush_whitespace ())
458 | Some ' ' | Some '\t' ->
459 (* Track whitespace - don't add to buf yet *)
460 Buffer.add_char whitespace (Option.get (Input.peek t.input));
461 ignore (Input.next t.input);
462 loop ()
463 | Some '\n' | Some '\r' ->
464 (* Discard trailing whitespace before line break *)
465 Buffer.clear whitespace;
466 Input.consume_break t.input;
467 (* Skip leading whitespace on next line *)
468 while Input.next_is_blank t.input do
469 ignore (Input.next t.input)
470 done;
471 (* Check for document boundary *)
472 if Input.at_document_boundary t.input then
473 Error.raise_at start Unclosed_single_quote;
474 (* Check indentation: continuation must be > block indent (QB6E, DK95) *)
475 let col = column t in
476 let indent = current_indent t in
477 if
478 (not (Input.is_eof t.input))
479 && (not (Input.next_is_break t.input))
480 && col <= indent && indent >= 0
481 then
482 Error.raise_at (Input.mark t.input)
483 (Invalid_quoted_scalar_indentation
484 "invalid indentation in quoted scalar");
485 (* Count empty lines (consecutive line breaks) *)
486 let empty_lines = ref 0 in
487 while Input.next_is_break t.input do
488 incr empty_lines;
489 Input.consume_break t.input;
490 while Input.next_is_blank t.input do
491 ignore (Input.next t.input)
492 done;
493 if Input.at_document_boundary t.input then
494 Error.raise_at start Unclosed_single_quote;
495 (* Check indentation after each empty line too *)
496 let col = column t in
497 let indent = current_indent t in
498 if
499 (not (Input.is_eof t.input))
500 && (not (Input.next_is_break t.input))
501 && col <= indent && indent >= 0
502 then
503 Error.raise_at (Input.mark t.input)
504 (Invalid_quoted_scalar_indentation
505 "invalid indentation in quoted scalar")
506 done;
507 (* Apply folding rules *)
508 if !empty_lines > 0 then begin
509 (* Empty lines: preserve as newlines *)
510 for _ = 1 to !empty_lines do
511 Buffer.add_char buf '\n'
512 done
513 end
514 else
515 (* Single break: fold to space (even at start of string) *)
516 Buffer.add_char buf ' ';
517 loop ()
518 | Some c ->
519 flush_whitespace ();
520 Buffer.add_char buf c;
521 ignore (Input.next t.input);
522 loop ()
523 in
524 loop ();
525 let span = Span.make ~start ~stop:(Input.mark t.input) in
526 (Buffer.contents buf, span)
527
528(** Decode hex escape of given length *)
529let decode_hex t len =
530 let start = Input.mark t.input in
531 let buf = Buffer.create len in
532 for _ = 1 to len do
533 match Input.peek t.input with
534 | Some c when Input.is_hex c ->
535 Buffer.add_char buf c;
536 ignore (Input.next t.input)
537 | _ -> Error.raise_at start (Invalid_hex_escape (Buffer.contents buf))
538 done;
539 let code = int_of_string ("0x" ^ Buffer.contents buf) in
540 if code <= 0x7F then String.make 1 (Char.chr code)
541 else if code <= 0x7FF then
542 let b1 = 0xC0 lor (code lsr 6) in
543 let b2 = 0x80 lor (code land 0x3F) in
544 String.init 2 (fun i -> Char.chr (if i = 0 then b1 else b2))
545 else if code <= 0xFFFF then
546 let b1 = 0xE0 lor (code lsr 12) in
547 let b2 = 0x80 lor ((code lsr 6) land 0x3F) in
548 let b3 = 0x80 lor (code land 0x3F) in
549 String.init 3 (fun i ->
550 Char.chr (match i with 0 -> b1 | 1 -> b2 | _ -> b3))
551 else
552 let b1 = 0xF0 lor (code lsr 18) in
553 let b2 = 0x80 lor ((code lsr 12) land 0x3F) in
554 let b3 = 0x80 lor ((code lsr 6) land 0x3F) in
555 let b4 = 0x80 lor (code land 0x3F) in
556 String.init 4 (fun i ->
557 Char.chr (match i with 0 -> b1 | 1 -> b2 | 2 -> b3 | _ -> b4))
558
559(** Scan double-quoted scalar *)
560let scan_double_quoted t =
561 let start = Input.mark t.input in
562 ignore (Input.next t.input);
563 (* consume opening double-quote *)
564 let buf = Buffer.create 64 in
565 let whitespace = Buffer.create 16 in
566 (* Track pending whitespace *)
567
568 let flush_whitespace () =
569 if Buffer.length whitespace > 0 then begin
570 Buffer.add_buffer buf whitespace;
571 Buffer.clear whitespace
572 end
573 in
574
575 let rec loop () =
576 match Input.peek t.input with
577 | None -> Error.raise_at start Unclosed_double_quote
578 | Some '"' ->
579 (* Flush trailing whitespace before closing quote to preserve it *)
580 flush_whitespace ();
581 ignore (Input.next t.input)
582 | (Some ' ' | Some '\t') as c_opt ->
583 (* Track whitespace - don't add to buf yet *)
584 let c = match c_opt with Some c -> c | None -> assert false in
585 Buffer.add_char whitespace c;
586 ignore (Input.next t.input);
587 loop ()
588 | Some '\\' ->
589 (* Escape sequence - this is non-whitespace content *)
590 flush_whitespace ();
591 (* Commit any pending whitespace *)
592 ignore (Input.next t.input);
593 (match Input.peek t.input with
594 | None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>")
595 | Some '0' ->
596 Buffer.add_char buf '\x00';
597 ignore (Input.next t.input)
598 | Some 'a' ->
599 Buffer.add_char buf '\x07';
600 ignore (Input.next t.input)
601 | Some 'b' ->
602 Buffer.add_char buf '\x08';
603 ignore (Input.next t.input)
604 | Some 't' | Some '\t' ->
605 Buffer.add_char buf '\t';
606 ignore (Input.next t.input)
607 | Some 'n' ->
608 Buffer.add_char buf '\n';
609 ignore (Input.next t.input)
610 | Some 'v' ->
611 Buffer.add_char buf '\x0B';
612 ignore (Input.next t.input)
613 | Some 'f' ->
614 Buffer.add_char buf '\x0C';
615 ignore (Input.next t.input)
616 | Some 'r' ->
617 Buffer.add_char buf '\r';
618 ignore (Input.next t.input)
619 | Some 'e' ->
620 Buffer.add_char buf '\x1B';
621 ignore (Input.next t.input)
622 | Some ' ' ->
623 Buffer.add_char buf ' ';
624 ignore (Input.next t.input)
625 | Some '"' ->
626 Buffer.add_char buf '"';
627 ignore (Input.next t.input)
628 | Some '/' ->
629 Buffer.add_char buf '/';
630 ignore (Input.next t.input)
631 | Some '\\' ->
632 Buffer.add_char buf '\\';
633 ignore (Input.next t.input)
634 | Some 'N' ->
635 Buffer.add_string buf "\xC2\x85";
636 ignore (Input.next t.input) (* NEL *)
637 | Some '_' ->
638 Buffer.add_string buf "\xC2\xA0";
639 ignore (Input.next t.input) (* NBSP *)
640 | Some 'L' ->
641 Buffer.add_string buf "\xE2\x80\xA8";
642 ignore (Input.next t.input) (* LS *)
643 | Some 'P' ->
644 Buffer.add_string buf "\xE2\x80\xA9";
645 ignore (Input.next t.input) (* PS *)
646 | Some 'x' ->
647 ignore (Input.next t.input);
648 Buffer.add_string buf (decode_hex t 2)
649 | Some 'u' ->
650 ignore (Input.next t.input);
651 Buffer.add_string buf (decode_hex t 4)
652 | Some 'U' ->
653 ignore (Input.next t.input);
654 Buffer.add_string buf (decode_hex t 8)
655 | Some '\n' | Some '\r' ->
656 (* Line continuation escape *)
657 Input.consume_break t.input;
658 while Input.next_is_blank t.input do
659 ignore (Input.next t.input)
660 done
661 | Some c ->
662 Error.raise_at (Input.mark t.input)
663 (Invalid_escape_sequence (Printf.sprintf "\\%c" c)));
664 loop ()
665 | Some '\n' | Some '\r' ->
666 (* Line break: discard any pending trailing whitespace *)
667 Buffer.clear whitespace;
668 Input.consume_break t.input;
669 (* Count consecutive line breaks (empty lines) *)
670 let empty_lines = ref 0 in
671 let continue = ref true in
672 let started_with_tab = ref false in
673 while !continue do
674 (* Track if we start with a tab (for DK95/01 check) *)
675 if Input.next_is (( = ) '\t') t.input then started_with_tab := true;
676 (* Skip blanks (spaces/tabs) on the line *)
677 while Input.next_is_blank t.input do
678 ignore (Input.next t.input)
679 done;
680 (* Check if we hit another line break (empty line) *)
681 if Input.next_is_break t.input then begin
682 Input.consume_break t.input;
683 incr empty_lines;
684 started_with_tab := false (* Reset for next line *)
685 end
686 else continue := false
687 done;
688 (* Check for document boundary - this terminates the quoted string *)
689 if Input.at_document_boundary t.input then
690 Error.raise_at start Unclosed_double_quote;
691 (* Check indentation: continuation must be > block indent (QB6E, DK95)
692 Note: must be strictly greater than block indent, not just equal *)
693 let col = column t in
694 let indent = current_indent t in
695 let start_col = start.column in
696 (* DK95/01: if continuation started with tabs and column < start column, error *)
697 if (not (Input.is_eof t.input)) && !started_with_tab && col < start_col
698 then
699 Error.raise_at (Input.mark t.input)
700 (Invalid_quoted_scalar_indentation
701 "invalid indentation in quoted scalar");
702 if (not (Input.is_eof t.input)) && col <= indent && indent >= 0 then
703 Error.raise_at (Input.mark t.input)
704 (Invalid_quoted_scalar_indentation
705 "invalid indentation in quoted scalar");
706 (* Per YAML spec: single break = space, break + empty lines = newlines *)
707 if !empty_lines > 0 then begin
708 (* Empty lines: output N newlines where N = number of empty lines *)
709 for _ = 1 to !empty_lines do
710 Buffer.add_char buf '\n'
711 done
712 end
713 else
714 (* Single break folds to space *)
715 Buffer.add_char buf ' ';
716 loop ()
717 | Some c ->
718 (* Non-whitespace character *)
719 flush_whitespace ();
720 (* Commit any pending whitespace *)
721 Buffer.add_char buf c;
722 ignore (Input.next t.input);
723 loop ()
724 in
725 loop ();
726 let span = Span.make ~start ~stop:(Input.mark t.input) in
727 (Buffer.contents buf, span)
728
729(** Check if character can appear in plain scalar at this position *)
730let can_continue_plain t c ~in_flow =
731 match c with
732 | ':' -> (
733 (* : is OK if not followed by whitespace or flow indicator *)
734 match Input.peek_nth t.input 1 with
735 | None -> true
736 | Some c2 when Input.is_whitespace c2 -> false
737 | Some c2 when in_flow && Input.is_flow_indicator c2 -> false
738 | _ -> true)
739 | '#' -> (
740 (* # is a comment indicator only if preceded by whitespace *)
741 (* Check the previous character to determine if this is a comment *)
742 match Input.peek_back t.input with
743 | None -> true (* At start - can't be comment indicator, allow it *)
744 | Some c when Input.is_whitespace c ->
745 false (* Preceded by whitespace - comment *)
746 | Some c when Input.is_break c -> false (* At start of line - comment *)
747 | _ -> true (* Not preceded by whitespace - part of scalar *))
748 | c when in_flow && Input.is_flow_indicator c -> false
749 | _ when Input.is_break c -> false
750 | _ -> true
751
752(** Scan plain scalar *)
753let scan_plain_scalar t =
754 let start = Input.mark t.input in
755 let in_flow = t.flow_level > 0 in
756 let indent = current_indent t in
757 (* In flow context, scalars must be indented more than the current block indent.
758 This ensures that content at block indent or less ends the flow context. *)
759 if in_flow && column t - 1 < indent then
760 Error.raise_at start Invalid_flow_indentation;
761 let buf = Buffer.create 64 in
762 let spaces = Buffer.create 16 in
763 let whitespace = Buffer.create 16 in
764 (* Track whitespace within a line *)
765 let leading_blanks = ref false in
766
767 let rec scan_line () =
768 match Input.peek t.input with
769 | None -> ()
770 | Some c when Input.is_blank c && can_continue_plain t c ~in_flow ->
771 (* Blank character within a line - save to whitespace buffer *)
772 Buffer.add_char whitespace c;
773 ignore (Input.next t.input);
774 scan_line ()
775 | Some c when can_continue_plain t c ~in_flow ->
776 (* Non-blank character - process any pending breaks/whitespace first *)
777 begin
778 if Buffer.length spaces > 0 then begin
779 if !leading_blanks then begin
780 (* Fold line break *)
781 if Buffer.contents spaces = "\n" then Buffer.add_char buf ' '
782 else begin
783 (* Multiple breaks - preserve all but first *)
784 let s = Buffer.contents spaces in
785 Buffer.add_substring buf s 1 (String.length s - 1)
786 end
787 end
788 else Buffer.add_buffer buf spaces;
789 Buffer.clear spaces
790 end;
791 (* Add any pending whitespace from within the line *)
792 if Buffer.length whitespace > 0 then begin
793 Buffer.add_buffer buf whitespace;
794 Buffer.clear whitespace
795 end;
796 (* Add the character *)
797 Buffer.add_char buf c;
798 ignore (Input.next t.input);
799 leading_blanks := false;
800 scan_line ()
801 end
802 | _ -> ()
803 in
804
805 let rec scan_lines () =
806 scan_line ();
807 (* Check for line continuation *)
808 if Input.next_is_break t.input then begin
809 (* Discard any trailing whitespace from the current line *)
810 Buffer.clear whitespace;
811 (* Save the line break *)
812 if !leading_blanks then begin
813 (* We already had a break - this is an additional break (empty line) *)
814 Buffer.add_char spaces '\n'
815 end
816 else begin
817 (* First line break *)
818 Buffer.clear spaces;
819 Buffer.add_char spaces '\n';
820 leading_blanks := true
821 end;
822 Input.consume_break t.input;
823 (* Note: We do NOT set allow_simple_key here during plain scalar scanning.
824 Setting it here would incorrectly allow ':' that appears on a continuation
825 line to become a mapping indicator. The flag will be set properly after
826 the scalar ends and skip_to_next_token processes line breaks. *)
827 (* Skip leading blanks on the next line *)
828 while Input.next_is_blank t.input do
829 ignore (Input.next t.input)
830 done;
831 let col = (Input.position t.input).column in
832 (* Check indentation - stop if we're at or before the containing block's indent *)
833 (* However, allow empty lines (line breaks) to continue even if dedented *)
834 if Input.next_is_break t.input then
835 scan_lines () (* Empty line - continue *)
836 else if (not in_flow) && col <= indent then ()
837 (* Stop - dedented or at parent level in block context *)
838 else if Input.at_document_boundary t.input then ()
839 (* Stop - document boundary *)
840 else scan_lines ()
841 end
842 in
843
844 scan_lines ();
845 let value = Buffer.contents buf in
846 (* Trim trailing whitespace (spaces and tabs) *)
847 let value =
848 let len = String.length value in
849 let rec find_end i =
850 if i < 0 then 0
851 else match value.[i] with ' ' | '\t' -> find_end (i - 1) | _ -> i + 1
852 in
853 let end_pos = find_end (len - 1) in
854 String.sub value 0 end_pos
855 in
856 let span = Span.make ~start ~stop:(Input.mark t.input) in
857 (* Return value, span, and whether we ended with leading blanks (crossed a line break) *)
858 (value, span, !leading_blanks)
859
860(** Scan block scalar (literal | or folded >) *)
861let scan_block_scalar t literal =
862 let start = Input.mark t.input in
863 ignore (Input.next t.input);
864
865 (* consume | or > *)
866
867 (* Parse header: optional indentation indicator and chomping *)
868 let explicit_indent = ref None in
869 let chomping = ref Chomping.Clip in
870
871 (* First character of header *)
872 (match Input.peek t.input with
873 | Some c when Input.is_digit c && c <> '0' ->
874 explicit_indent := Some (Char.code c - Char.code '0');
875 ignore (Input.next t.input)
876 | Some '-' ->
877 chomping := Chomping.Strip;
878 ignore (Input.next t.input)
879 | Some '+' ->
880 chomping := Chomping.Keep;
881 ignore (Input.next t.input)
882 | _ -> ());
883
884 (* Second character of header *)
885 (match Input.peek t.input with
886 | Some c when Input.is_digit c && c <> '0' && !explicit_indent = None ->
887 explicit_indent := Some (Char.code c - Char.code '0');
888 ignore (Input.next t.input)
889 | Some '-' when !chomping = Chomping.Clip ->
890 chomping := Chomping.Strip;
891 ignore (Input.next t.input)
892 | Some '+' when !chomping = Chomping.Clip ->
893 chomping := Chomping.Keep;
894 ignore (Input.next t.input)
895 | _ -> ());
896
897 (* Skip whitespace and optional comment *)
898 skip_whitespace_and_comment t;
899
900 (* Consume line break *)
901 if Input.next_is_break t.input then Input.consume_break t.input
902 else if not (Input.is_eof t.input) then
903 Error.raise_at (Input.mark t.input)
904 (Invalid_block_scalar_header "expected newline after header");
905
906 let base_indent = current_indent t in
907 (* base_indent is the indent level from the stack, -1 if empty.
908 It's used directly for comparisons in implicit indent case. *)
909 let content_indent =
910 ref
911 (match !explicit_indent with
912 | Some n ->
913 (* Explicit indent: base_indent is 1-indexed column, convert to 0-indexed.
914 content_indent = (base_indent - 1) + n, but at least n for document level. *)
915 let base_level = max 0 (base_indent - 1) in
916 base_level + n
917 | None -> 0 (* Will be determined by first non-empty line *))
918 in
919
920 let buf = Buffer.create 256 in
921 let trailing_breaks = Buffer.create 16 in
922 let leading_blank = ref false in
923 (* Was the previous line "more indented"? *)
924 let max_empty_line_indent = ref 0 in
925 (* Track max indent of empty lines before first content *)
926
927 (* Skip to content indentation, skipping empty lines.
928 Returns the number of spaces actually skipped (important for detecting dedentation). *)
929 let rec skip_to_content_indent () =
930 if !content_indent > 0 then begin
931 (* Explicit indent - skip up to content_indent spaces *)
932 let spaces_skipped = ref 0 in
933 while
934 !spaces_skipped < !content_indent && Input.next_is (( = ) ' ') t.input
935 do
936 incr spaces_skipped;
937 ignore (Input.next t.input)
938 done;
939
940 (* Check if this line is empty (only spaces/tabs until break/eof) *)
941 if Input.next_is_break t.input then begin
942 (* Empty line - record the break and continue *)
943 Buffer.add_char trailing_breaks '\n';
944 Input.consume_break t.input;
945 skip_to_content_indent ()
946 end
947 else if !spaces_skipped < !content_indent then begin
948 (* Line starts with fewer spaces than content_indent - dedented *)
949 !spaces_skipped
950 end
951 else if Input.next_is_blank t.input then begin
952 (* Line has spaces/tabs beyond content_indent - could be whitespace content or empty line.
953 For literal scalars, whitespace-only lines ARE content (not empty).
954 For folded scalars, whitespace-only lines that are "more indented" are preserved. *)
955 if literal then
956 (* Literal: whitespace beyond content_indent is content, let read_lines handle it *)
957 !content_indent
958 else begin
959 (* Folded: check if rest is only blanks *)
960 let idx = ref 0 in
961 while
962 match Input.peek_nth t.input !idx with
963 | Some c when Input.is_blank c ->
964 incr idx;
965 true
966 | _ -> false
967 do
968 ()
969 done;
970 match Input.peek_nth t.input !idx with
971 | None | Some '\n' | Some '\r' ->
972 (* Empty/whitespace-only line in folded - skip spaces *)
973 while Input.next_is_blank t.input do
974 ignore (Input.next t.input)
975 done;
976 Buffer.add_char trailing_breaks '\n';
977 Input.consume_break t.input;
978 skip_to_content_indent ()
979 | _ ->
980 (* Has non-whitespace content *)
981 !content_indent
982 end
983 end
984 else !content_indent
985 end
986 else begin
987 (* Implicit indent - skip empty lines without consuming spaces.
988 Note: Only SPACES count as indentation. Tabs are content, not indentation.
989 So we only check for spaces when determining if a line is "empty". *)
990 if Input.next_is_break t.input then begin
991 Buffer.add_char trailing_breaks '\n';
992 Input.consume_break t.input;
993 skip_to_content_indent ()
994 end
995 else if Input.next_is (( = ) ' ') t.input then begin
996 (* Check if line is empty (only spaces before break) *)
997 let idx = ref 0 in
998 while
999 match Input.peek_nth t.input !idx with
1000 | Some ' ' ->
1001 incr idx;
1002 true
1003 | _ -> false
1004 do
1005 ()
1006 done;
1007 match Input.peek_nth t.input !idx with
1008 | None | Some '\n' | Some '\r' ->
1009 (* Line has only spaces - empty line *)
1010 (* Track max indent of empty lines for later validation *)
1011 if !idx > !max_empty_line_indent then max_empty_line_indent := !idx;
1012 while Input.next_is (( = ) ' ') t.input do
1013 ignore (Input.next t.input)
1014 done;
1015 Buffer.add_char trailing_breaks '\n';
1016 Input.consume_break t.input;
1017 skip_to_content_indent ()
1018 | _ ->
1019 (* Has content (including tabs which are content, not indentation) *)
1020 0
1021 end
1022 else if Input.next_is (( = ) '\t') t.input then begin
1023 (* Tab at start of line in implicit indent mode - this is an error (Y79Y)
1024 because tabs cannot be used as indentation in YAML *)
1025 Error.raise_at (Input.mark t.input) Tab_in_indentation
1026 end
1027 else
1028 (* Not at break or space - other content character *)
1029 0
1030 end
1031 in
1032
1033 (* Read content *)
1034 let rec read_lines () =
1035 let spaces_skipped = skip_to_content_indent () in
1036
1037 (* Check if we're at content *)
1038 if Input.is_eof t.input then ()
1039 else if Input.at_document_boundary t.input then ()
1040 else begin
1041 (* Count additional leading spaces beyond what was skipped *)
1042 let extra_spaces = ref 0 in
1043 while Input.next_is (( = ) ' ') t.input do
1044 incr extra_spaces;
1045 ignore (Input.next t.input)
1046 done;
1047
1048 (* Calculate actual line indentation *)
1049 let line_indent = spaces_skipped + !extra_spaces in
1050
1051 (* Determine content indent from first content line (implicit case) *)
1052 let first_line = !content_indent = 0 in
1053 (* base_indent is 1-indexed column, convert to 0-indexed for comparison with line_indent.
1054 If base_indent = -1 (empty stack), then base_level = -1 means col 0 is valid. *)
1055 let base_level = base_indent - 1 in
1056 let should_process =
1057 if !content_indent = 0 then begin
1058 (* For implicit indent, content must be more indented than base_level. *)
1059 if line_indent <= base_level then false
1060 (* No content - first line not indented enough *)
1061 else begin
1062 (* Validate: first content line must be indented at least as much as
1063 the maximum indent seen on empty lines before it (5LLU, S98Z, W9L4) *)
1064 if line_indent < !max_empty_line_indent && line_indent > base_level
1065 then
1066 Error.raise_at (Input.mark t.input)
1067 (Invalid_block_scalar_header
1068 "wrongly indented line in block scalar");
1069 content_indent := line_indent;
1070 true
1071 end
1072 end
1073 else if line_indent < !content_indent then false
1074 (* Dedented - done with content *)
1075 else true
1076 in
1077
1078 if should_process then begin
1079 (* Check if current line is "more indented" (has extra indent or starts with whitespace).
1080 For folded scalars, lines that start with any whitespace (space or tab) after the
1081 content indentation are "more indented" and preserve breaks.
1082 Note: we check Input.next_is_blank BEFORE reading content to see if content starts with whitespace. *)
1083 let trailing_blank =
1084 line_indent > !content_indent || Input.next_is_blank t.input
1085 in
1086
1087 (* Add trailing breaks to buffer *)
1088 if Buffer.length buf > 0 then begin
1089 if Buffer.length trailing_breaks > 0 then begin
1090 if literal then Buffer.add_buffer buf trailing_breaks
1091 else begin
1092 (* Folded scalar: fold only if both previous and current lines are not more-indented *)
1093 if (not !leading_blank) && not trailing_blank then begin
1094 let breaks = Buffer.contents trailing_breaks in
1095 if String.length breaks = 1 then Buffer.add_char buf ' '
1096 else Buffer.add_substring buf breaks 1 (String.length breaks - 1)
1097 end
1098 else begin
1099 (* Preserve breaks for more-indented lines *)
1100 Buffer.add_buffer buf trailing_breaks
1101 end
1102 end
1103 end
1104 else if not literal then Buffer.add_char buf ' '
1105 end
1106 else Buffer.add_buffer buf trailing_breaks;
1107 Buffer.clear trailing_breaks;
1108
1109 (* Add extra indentation for literal or more-indented folded lines *)
1110 (* On the first line (when determining content_indent), we've already consumed all spaces,
1111 so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *)
1112 if (not first_line) && (literal || (!extra_spaces > 0 && not literal))
1113 then begin
1114 for _ = 1 to !extra_spaces do
1115 Buffer.add_char buf ' '
1116 done
1117 end;
1118
1119 (* Read line content *)
1120 while
1121 (not (Input.is_eof t.input)) && not (Input.next_is_break t.input)
1122 do
1123 Buffer.add_char buf (Input.next_exn t.input)
1124 done;
1125
1126 (* Record trailing break *)
1127 if Input.next_is_break t.input then begin
1128 Buffer.add_char trailing_breaks '\n';
1129 Input.consume_break t.input
1130 end;
1131
1132 (* Update leading_blank for next iteration *)
1133 leading_blank := trailing_blank;
1134
1135 read_lines ()
1136 end
1137 end
1138 in
1139
1140 read_lines ();
1141
1142 (* Apply chomping *)
1143 let value =
1144 let content = Buffer.contents buf in
1145 match !chomping with
1146 | Chomping.Strip -> content
1147 | Chomping.Clip ->
1148 if String.length content > 0 then content ^ "\n" else content
1149 | Chomping.Keep -> content ^ Buffer.contents trailing_breaks
1150 in
1151
1152 let span = Span.make ~start ~stop:(Input.mark t.input) in
1153 let style = if literal then `Literal else `Folded in
1154 (value, style, span)
1155
1156(** Scan directive (after %) *)
1157let scan_directive t =
1158 let start = Input.mark t.input in
1159 ignore (Input.next t.input);
1160
1161 (* consume % *)
1162
1163 (* Read directive name *)
1164 let name_buf = Buffer.create 16 in
1165 while
1166 match Input.peek t.input with
1167 | Some c when Input.is_alnum c || c = '-' ->
1168 Buffer.add_char name_buf c;
1169 ignore (Input.next t.input);
1170 true
1171 | _ -> false
1172 do
1173 ()
1174 done;
1175 let name = Buffer.contents name_buf in
1176
1177 (* Skip blanks *)
1178 while Input.next_is_blank t.input do
1179 ignore (Input.next t.input)
1180 done;
1181
1182 match name with
1183 | "YAML" ->
1184 (* Version directive: %YAML 1.2 *)
1185 let major = ref 0 in
1186 let minor = ref 0 in
1187 (* Read major version *)
1188 while Input.next_is_digit t.input do
1189 major :=
1190 (!major * 10) + (Char.code (Input.next_exn t.input) - Char.code '0')
1191 done;
1192 (* Expect . *)
1193 (match Input.peek t.input with
1194 | Some '.' -> ignore (Input.next t.input)
1195 | _ ->
1196 Error.raise_at (Input.mark t.input)
1197 (Invalid_yaml_version "expected '.'"));
1198 (* Read minor version *)
1199 while Input.next_is_digit t.input do
1200 minor :=
1201 (!minor * 10) + (Char.code (Input.next_exn t.input) - Char.code '0')
1202 done;
1203 (* Validate: only whitespace and comments allowed before line break (MUS6) *)
1204 skip_whitespace_and_comment t;
1205 if (not (Input.next_is_break t.input)) && not (Input.is_eof t.input) then
1206 Error.raise_at (Input.mark t.input)
1207 (Invalid_directive "expected comment or line break after version");
1208 let span = Span.make ~start ~stop:(Input.mark t.input) in
1209 (Token.Version_directive { major = !major; minor = !minor }, span)
1210 | "TAG" ->
1211 (* Tag directive: %TAG !foo! tag:example.com,2000: *)
1212 let handle = scan_tag_handle t in
1213 (* Skip blanks *)
1214 while Input.next_is_blank t.input do
1215 ignore (Input.next t.input)
1216 done;
1217 (* Read prefix *)
1218 let prefix_buf = Buffer.create 32 in
1219 while
1220 match Input.peek t.input with
1221 | Some c when not (Input.is_whitespace c) ->
1222 Buffer.add_char prefix_buf c;
1223 ignore (Input.next t.input);
1224 true
1225 | _ -> false
1226 do
1227 ()
1228 done;
1229 let prefix = Buffer.contents prefix_buf in
1230 let span = Span.make ~start ~stop:(Input.mark t.input) in
1231 (Token.Tag_directive { handle; prefix }, span)
1232 | _ ->
1233 (* Reserved/Unknown directive - skip to end of line and ignore *)
1234 (* Per YAML spec, reserved directives should be ignored with a warning *)
1235 while (not (Input.is_eof t.input)) && not (Input.next_is_break t.input) do
1236 ignore (Input.next t.input)
1237 done;
1238 let span = Span.make ~start ~stop:(Input.mark t.input) in
1239 (* Return an empty tag directive token to indicate directive was processed but ignored *)
1240 (Token.Tag_directive { handle = ""; prefix = "" }, span)
1241
1242(** Fetch the next token(s) into the queue *)
1243let rec fetch_next_token t =
1244 skip_to_next_token t;
1245 stale_simple_keys t;
1246 let col = column t in
1247 (* Unroll indents that are deeper than current column.
1248 Note: we use col, not col-1, to allow entries at the same level. *)
1249 unroll_indent t col;
1250
1251 (* We're about to process actual content, not leading whitespace *)
1252 t.leading_whitespace <- false;
1253
1254 if Input.is_eof t.input then fetch_stream_end t
1255 else if Input.at_document_boundary t.input then fetch_document_indicator t
1256 else begin
1257 match Input.peek t.input with
1258 | None -> fetch_stream_end t
1259 | Some '%' when (Input.position t.input).column = 1 -> fetch_directive t
1260 | Some '[' -> fetch_flow_collection_start t Token.Flow_sequence_start
1261 | Some '{' -> fetch_flow_collection_start t Token.Flow_mapping_start
1262 | Some ']' -> fetch_flow_collection_end t Token.Flow_sequence_end
1263 | Some '}' -> fetch_flow_collection_end t Token.Flow_mapping_end
1264 | Some ',' -> fetch_flow_entry t
1265 | Some '-' when t.flow_level = 0 && check_block_entry t ->
1266 fetch_block_entry t
1267 | Some '?' when check_key t -> fetch_key t
1268 | Some ':' when check_value t -> fetch_value t
1269 | Some '*' -> fetch_alias t
1270 | Some '&' -> fetch_anchor t
1271 | Some '!' -> fetch_tag t
1272 | Some '|' when t.flow_level = 0 -> fetch_block_scalar t true
1273 | Some '>' when t.flow_level = 0 -> fetch_block_scalar t false
1274 | Some '\'' -> fetch_single_quoted t
1275 | Some '"' -> fetch_double_quoted t
1276 | Some '-' when can_start_plain t -> fetch_plain_scalar t
1277 | Some '?' when can_start_plain t -> fetch_plain_scalar t
1278 | Some ':' when can_start_plain t -> fetch_plain_scalar t
1279 | Some c when can_start_plain_char c t -> fetch_plain_scalar t
1280 | Some c -> Error.raise_at (Input.mark t.input) (Unexpected_character c)
1281 end
1282
1283and fetch_stream_end t =
1284 if not t.stream_ended then begin
1285 unroll_indent t (-1);
1286 remove_simple_key t;
1287 t.allow_simple_key <- false;
1288 t.stream_ended <- true;
1289 let span = Span.point (Input.mark t.input) in
1290 emit t span Token.Stream_end
1291 end
1292
1293and fetch_document_indicator t =
1294 unroll_indent t (-1);
1295 remove_simple_key t;
1296 t.allow_simple_key <- false;
1297 let start = Input.mark t.input in
1298 let indicator = Input.peek_string t.input 3 in
1299 Input.skip t.input 3;
1300 let span = Span.make ~start ~stop:(Input.mark t.input) in
1301 let token =
1302 if indicator = "---" then Token.Document_start else Token.Document_end
1303 in
1304 (* Reset document content flag after document end marker *)
1305 if indicator = "..." then begin
1306 t.document_has_content <- false;
1307 (* After document end marker, skip whitespace and check for end of line or comment *)
1308 while Input.next_is_blank t.input do
1309 ignore (Input.next t.input)
1310 done;
1311 match Input.peek t.input with
1312 | None -> () (* EOF is ok *)
1313 | Some c when Input.is_break c -> ()
1314 | Some '#' -> () (* Comment is ok *)
1315 | _ ->
1316 Error.raise_at start
1317 (Invalid_directive
1318 "content not allowed after document end marker on same line")
1319 end;
1320 emit t span token
1321
1322and fetch_directive t =
1323 (* Directives can only appear:
1324 1. At stream start (before any document content)
1325 2. After a document end marker (...)
1326 If we've emitted content in the current document, we need a document end marker first *)
1327 if t.document_has_content then
1328 Error.raise_at (Input.mark t.input)
1329 (Unexpected_token
1330 "directives must be separated from document content by document end \
1331 marker (...)");
1332 unroll_indent t (-1);
1333 remove_simple_key t;
1334 t.allow_simple_key <- false;
1335 let token, span = scan_directive t in
1336 emit t span token
1337
1338and fetch_flow_collection_start t token_type =
1339 save_simple_key t;
1340 (* Record indent of outermost flow collection *)
1341 if t.flow_level = 0 then t.flow_indent <- column t;
1342 t.flow_level <- t.flow_level + 1;
1343 (* Track whether this is a mapping or sequence *)
1344 let is_mapping = token_type = Token.Flow_mapping_start in
1345 t.flow_mapping_stack <- is_mapping :: t.flow_mapping_stack;
1346 t.allow_simple_key <- true;
1347 t.simple_keys <- None :: t.simple_keys;
1348 t.document_has_content <- true;
1349 let start = Input.mark t.input in
1350 ignore (Input.next t.input);
1351 let span = Span.make ~start ~stop:(Input.mark t.input) in
1352 emit t span token_type
1353
1354and fetch_flow_collection_end t token_type =
1355 remove_simple_key t;
1356 t.flow_level <- t.flow_level - 1;
1357 t.flow_mapping_stack <-
1358 (match t.flow_mapping_stack with _ :: rest -> rest | [] -> []);
1359 t.simple_keys <- (match t.simple_keys with _ :: rest -> rest | [] -> []);
1360 t.allow_simple_key <- false;
1361 let start = Input.mark t.input in
1362 ignore (Input.next t.input);
1363 (* Allow adjacent values after flow collection ends *)
1364 if t.flow_level > 0 then
1365 t.adjacent_value_allowed_at <- Some (Input.position t.input);
1366 let span = Span.make ~start ~stop:(Input.mark t.input) in
1367 emit t span token_type
1368
1369and fetch_flow_entry t =
1370 remove_simple_key t;
1371 t.allow_simple_key <- true;
1372 let start = Input.mark t.input in
1373 ignore (Input.next t.input);
1374 let span = Span.make ~start ~stop:(Input.mark t.input) in
1375 emit t span Token.Flow_entry
1376
1377and check_block_entry t =
1378 (* - followed by whitespace or EOF *)
1379 match Input.peek_nth t.input 1 with
1380 | None -> true
1381 | Some c -> Input.is_whitespace c
1382
1383and fetch_block_entry t =
1384 if t.flow_level = 0 then begin
1385 (* Block entries require allow_simple_key to be true.
1386 This prevents block sequences on the same line as a mapping value,
1387 e.g., "key: - a" is invalid. *)
1388 if not t.allow_simple_key then
1389 Error.raise_at (Input.mark t.input) Block_sequence_disallowed;
1390 let col = column t in
1391 if roll_indent t col then begin
1392 let span = Span.point (Input.mark t.input) in
1393 emit t span Token.Block_sequence_start
1394 end
1395 end;
1396 remove_simple_key t;
1397 t.allow_simple_key <- true;
1398 t.document_has_content <- true;
1399 let start = Input.mark t.input in
1400 ignore (Input.next t.input);
1401
1402 (* Check for tabs after - : pattern like -\t- is invalid *)
1403 let found_tabs, _found_spaces = skip_blanks_check_tabs t in
1404 if found_tabs then begin
1405 (* If we found tabs and next char is - followed by whitespace, error *)
1406 match Input.peek t.input with
1407 | Some '-' -> (
1408 match Input.peek_nth t.input 1 with
1409 | None -> Error.raise_at start Tab_in_indentation
1410 | Some c when Input.is_whitespace c ->
1411 Error.raise_at start Tab_in_indentation
1412 | Some _ -> ())
1413 | _ -> ()
1414 end;
1415
1416 let span = Span.make ~start ~stop:(Input.mark t.input) in
1417 emit t span Token.Block_entry
1418
1419and check_key t =
1420 (* ? followed by whitespace or flow indicator in both block and flow *)
1421 match Input.peek_nth t.input 1 with
1422 | None -> true
1423 | Some c ->
1424 Input.is_whitespace c || (t.flow_level > 0 && Input.is_flow_indicator c)
1425
1426and fetch_key t =
1427 if t.flow_level = 0 then begin
1428 if not t.allow_simple_key then
1429 Error.raise_at (Input.mark t.input) Expected_key;
1430 let col = column t in
1431 if roll_indent t col then begin
1432 let span = Span.point (Input.mark t.input) in
1433 emit t span Token.Block_mapping_start
1434 end
1435 end;
1436 remove_simple_key t;
1437 t.allow_simple_key <- t.flow_level = 0;
1438 t.document_has_content <- true;
1439 let start = Input.mark t.input in
1440 ignore (Input.next t.input);
1441
1442 (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *)
1443 let found_tabs, _found_spaces = skip_blanks_check_tabs t in
1444 if found_tabs && t.flow_level = 0 then begin
1445 (* In block context, tabs after ? are not allowed *)
1446 Error.raise_at start Tab_in_indentation
1447 end;
1448
1449 let span = Span.make ~start ~stop:(Input.mark t.input) in
1450 emit t span Token.Key
1451
1452and check_value t =
1453 (* : followed by whitespace in block, or whitespace/flow indicator in flow, or adjacent value *)
1454 match Input.peek_nth t.input 1 with
1455 | None -> true
1456 | Some c -> (
1457 Input.is_whitespace c
1458 || (t.flow_level > 0 && Input.is_flow_indicator c)
1459 ||
1460 (* Allow adjacent values in flow context at designated positions *)
1461 t.flow_level > 0
1462 &&
1463 match t.adjacent_value_allowed_at with
1464 | Some pos ->
1465 pos.Position.line = (Input.position t.input).Position.line
1466 && pos.Position.column = (Input.position t.input).Position.column
1467 | None -> false)
1468
1469and fetch_value t =
1470 let start = Input.mark t.input in
1471 (* Check for simple key *)
1472 let used_simple_key =
1473 match t.simple_keys with
1474 | Some sk :: _ when sk.sk_possible ->
1475 (* In implicit flow mapping (inside a flow sequence), key and : must be on the same line.
1476 In explicit flow mapping { }, key and : can span lines. *)
1477 let is_implicit_flow_mapping =
1478 match t.flow_mapping_stack with
1479 | false :: _ ->
1480 true (* false = we're in a sequence, so any mapping is implicit *)
1481 | _ -> false
1482 in
1483 if
1484 is_implicit_flow_mapping
1485 && sk.sk_position.line < (Input.position t.input).line
1486 then Error.raise_at start Illegal_flow_key_line;
1487 (* Insert KEY token before the simple key value *)
1488 let key_span = Span.point sk.sk_position in
1489 let key_token = { Token.token = Token.Key; span = key_span } in
1490 (* We need to insert at the right position *)
1491 let tokens = Queue.to_seq t.tokens |> Array.of_seq in
1492 Queue.clear t.tokens;
1493 let insert_pos = sk.sk_token_number - t.tokens_taken in
1494 Array.iteri
1495 (fun i tok ->
1496 if i = insert_pos then Queue.add key_token t.tokens;
1497 Queue.add tok t.tokens)
1498 tokens;
1499 if insert_pos >= Array.length tokens then Queue.add key_token t.tokens;
1500 t.token_number <- t.token_number + 1;
1501 (* Roll indent for implicit block mapping *)
1502 if t.flow_level = 0 then begin
1503 let col = sk.sk_position.column in
1504 if roll_indent t col then begin
1505 let span = Span.point sk.sk_position in
1506 (* Insert block mapping start before key *)
1507 let bm_token = { Token.token = Token.Block_mapping_start; span } in
1508 let tokens = Queue.to_seq t.tokens |> Array.of_seq in
1509 Queue.clear t.tokens;
1510 Array.iteri
1511 (fun i tok ->
1512 if i = insert_pos then Queue.add bm_token t.tokens;
1513 Queue.add tok t.tokens)
1514 tokens;
1515 if insert_pos >= Array.length tokens then
1516 Queue.add bm_token t.tokens;
1517 t.token_number <- t.token_number + 1
1518 end
1519 end;
1520 t.simple_keys <- None :: List.tl t.simple_keys;
1521 true
1522 | _ ->
1523 (* No simple key - this is a complex value (or empty key) *)
1524 if t.flow_level = 0 then begin
1525 if not t.allow_simple_key then
1526 Error.raise_at (Input.mark t.input) Expected_key;
1527 let col = column t in
1528 if roll_indent t col then begin
1529 let span = Span.point (Input.mark t.input) in
1530 emit t span Token.Block_mapping_start
1531 end
1532 (* Note: We don't emit KEY here. Empty key handling is done by the parser,
1533 which emits empty scalar when it sees VALUE without preceding KEY. *)
1534 end;
1535 false
1536 in
1537 remove_simple_key t;
1538 (* In block context without simple key, allow simple keys for compact mappings like ": moon: white"
1539 In flow context or after using a simple key, disallow simple keys *)
1540 t.allow_simple_key <- (not used_simple_key) && t.flow_level = 0;
1541 t.document_has_content <- true;
1542 let start = Input.mark t.input in
1543 ignore (Input.next t.input);
1544
1545 (* Check for tabs after : : patterns like :\t- or :\tkey: are invalid in block context (Y79Y/09)
1546 However, :\t bar (tab followed by space then content) is valid (6BCT) *)
1547 let found_tabs, found_spaces = skip_blanks_check_tabs t in
1548 if found_tabs && (not found_spaces) && t.flow_level = 0 then begin
1549 (* In block context, tabs-only after : followed by indicator or alphanumeric are not allowed *)
1550 match Input.peek t.input with
1551 | Some ('-' | '?') -> Error.raise_at start Tab_in_indentation
1552 | Some c
1553 when (c >= 'a' && c <= 'z')
1554 || (c >= 'A' && c <= 'Z')
1555 || (c >= '0' && c <= '9') ->
1556 (* Tab-only followed by alphanumeric - likely a key, which is invalid *)
1557 Error.raise_at start Tab_in_indentation
1558 | _ -> ()
1559 end;
1560
1561 (* Skip any comment that may follow the colon and whitespace *)
1562 skip_whitespace_and_comment t;
1563
1564 let span = Span.make ~start ~stop:(Input.mark t.input) in
1565 emit t span Token.Value
1566
1567and fetch_anchor_or_alias t ~is_alias =
1568 save_simple_key t;
1569 t.allow_simple_key <- false;
1570 t.document_has_content <- true;
1571 let start = Input.mark t.input in
1572 ignore (Input.next t.input);
1573 (* consume * or & *)
1574 let name, span = scan_anchor_alias t in
1575 let span = Span.make ~start ~stop:span.stop in
1576 let token = if is_alias then Token.Alias name else Token.Anchor name in
1577 emit t span token
1578
1579and fetch_alias t = fetch_anchor_or_alias t ~is_alias:true
1580and fetch_anchor t = fetch_anchor_or_alias t ~is_alias:false
1581
1582and fetch_tag t =
1583 save_simple_key t;
1584 t.allow_simple_key <- false;
1585 t.document_has_content <- true;
1586 let handle, suffix, span = scan_tag t in
1587 emit t span (Token.Tag { handle; suffix })
1588
1589and fetch_block_scalar t literal =
1590 remove_simple_key t;
1591 t.allow_simple_key <- true;
1592 t.document_has_content <- true;
1593 let value, style, span = scan_block_scalar t literal in
1594 emit t span (Token.Scalar { style; value })
1595
1596and fetch_quoted t ~double =
1597 save_simple_key t;
1598 t.allow_simple_key <- false;
1599 t.document_has_content <- true;
1600 let value, span =
1601 if double then scan_double_quoted t else scan_single_quoted t
1602 in
1603 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
1604 skip_to_next_token t;
1605 if t.flow_level > 0 then
1606 t.adjacent_value_allowed_at <- Some (Input.position t.input);
1607 let style = if double then `Double_quoted else `Single_quoted in
1608 emit t span (Token.Scalar { style; value })
1609
1610and fetch_single_quoted t = fetch_quoted t ~double:false
1611and fetch_double_quoted t = fetch_quoted t ~double:true
1612
1613and can_start_plain t =
1614 (* Check if - ? : can start a plain scalar *)
1615 match Input.peek_nth t.input 1 with
1616 | None -> false
1617 | Some c ->
1618 (not (Input.is_whitespace c))
1619 && (t.flow_level = 0 || not (Input.is_flow_indicator c))
1620
1621and can_start_plain_char c _t =
1622 (* Characters that can start a plain scalar *)
1623 if Input.is_whitespace c then false
1624 else if Input.is_indicator c then false
1625 else true
1626
1627and fetch_plain_scalar t =
1628 save_simple_key t;
1629 t.allow_simple_key <- false;
1630 t.document_has_content <- true;
1631 let value, span, ended_with_linebreak = scan_plain_scalar t in
1632 (* If the plain scalar ended after crossing a line break (leading_blanks = true),
1633 allow simple keys. This is important because the scanner already consumed the
1634 line break and leading whitespace when checking for continuation. *)
1635 if ended_with_linebreak then t.allow_simple_key <- true;
1636 emit t span (Token.Scalar { style = `Plain; value })
1637
1638(** Check if we need more tokens to resolve simple keys *)
1639let need_more_tokens t =
1640 if t.stream_ended then false
1641 else if Queue.is_empty t.tokens then true
1642 else
1643 (* Check if any simple key could affect the first queued token *)
1644 List.exists
1645 (function
1646 | Some sk when sk.sk_possible -> sk.sk_token_number >= t.tokens_taken
1647 | _ -> false)
1648 t.simple_keys
1649
1650(** Ensure we have enough tokens to return one safely *)
1651let ensure_tokens t =
1652 if not t.stream_started then begin
1653 t.stream_started <- true;
1654 let span = Span.point (Input.position t.input) in
1655 let encoding, _ = Encoding.detect (Input.source t.input) in
1656 emit t span (Token.Stream_start encoding)
1657 end;
1658 while need_more_tokens t do
1659 fetch_next_token t
1660 done
1661
1662(** Get next token *)
1663let next t =
1664 ensure_tokens t;
1665 if Queue.is_empty t.tokens then None
1666 else begin
1667 t.tokens_taken <- t.tokens_taken + 1;
1668 Some (Queue.pop t.tokens)
1669 end
1670
1671(** Peek at next token *)
1672let peek t =
1673 ensure_tokens t;
1674 Queue.peek_opt t.tokens
1675
1676(** Iterate over all tokens *)
1677let iter f t =
1678 let rec loop () =
1679 match next t with
1680 | None -> ()
1681 | Some tok ->
1682 f tok;
1683 loop ()
1684 in
1685 loop ()
1686
1687(** Fold over all tokens *)
1688let fold f init t =
1689 let rec loop acc =
1690 match next t with None -> acc | Some tok -> loop (f acc tok)
1691 in
1692 loop init
1693
1694(** Convert to list *)
1695let to_list t = fold (fun acc tok -> tok :: acc) [] t |> List.rev