Pure OCaml Yaml 1.2 reader and writer using Bytesrw
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** YAML tokenizer/scanner with lookahead for ambiguity resolution *)
7
8(** Simple key tracking for mapping key disambiguation *)
9type simple_key = {
10 sk_possible : bool;
11 sk_required : bool;
12 sk_token_number : int;
13 sk_position : Position.t;
14}
15
16(** Indent level tracking *)
17type indent = {
18 indent : int;
19 needs_block_end : bool;
20}
21
22type t = {
23 input : Input.t;
24 tokens : Token.spanned Queue.t;
25 mutable token_number : int;
26 mutable tokens_taken : int;
27 mutable stream_started : bool;
28 mutable stream_ended : bool;
29 mutable indent_stack : indent list; (** Stack of indentation levels *)
30 mutable flow_level : int; (** Nesting depth in \[\] or \{\} *)
31 mutable flow_indent : int; (** Column where outermost flow collection started *)
32 mutable simple_keys : simple_key option list; (** Per flow-level simple key tracking *)
33 mutable allow_simple_key : bool;
34 mutable leading_whitespace : bool; (** True when at start of line (only whitespace seen) *)
35 mutable document_has_content : bool; (** True if we've emitted content tokens in current document *)
36 mutable adjacent_value_allowed_at : Position.t option; (** Position where adjacent : is allowed *)
37 mutable flow_mapping_stack : bool list; (** Stack of whether each flow level is a mapping *)
38}
39
40let create input =
41 {
42 input;
43 tokens = Queue.create ();
44 token_number = 0;
45 tokens_taken = 0;
46 stream_started = false;
47 stream_ended = false;
48 indent_stack = [];
49 flow_level = 0;
50 flow_indent = 0;
51 simple_keys = [None]; (* One entry for the base level *)
52 allow_simple_key = true;
53 leading_whitespace = true; (* Start at beginning of stream *)
54 document_has_content = false;
55 adjacent_value_allowed_at = None;
56 flow_mapping_stack = [];
57 }
58
59let of_string s = create (Input.of_string s)
60let of_input = create
61let of_reader r = create (Input.of_reader r)
62
63let position t = Input.position t.input
64
65(** Add a token to the queue *)
66let emit t span token =
67 Queue.add { Token.token; span } t.tokens;
68 t.token_number <- t.token_number + 1
69
70(** Get current column (1-indexed) *)
71let column t = (Input.position t.input).column
72
73(** Get current indent level *)
74let current_indent t =
75 match t.indent_stack with
76 | [] -> -1
77 | { indent; _ } :: _ -> indent
78
79(** Skip whitespace to end of line, checking for valid comments.
80 Returns true if any whitespace (including tabs) was found before a comment. *)
81let skip_whitespace_and_comment t =
82 let has_whitespace = ref false in
83 (* Skip blanks (spaces and tabs) *)
84 while Input.next_is_blank t.input do
85 has_whitespace := true;
86 ignore (Input.next t.input)
87 done;
88 (* Check for comment *)
89 if Input.next_is (( = ) '#') t.input then begin
90 (* Validate: comment must be preceded by whitespace or be at start of line *)
91 if not !has_whitespace then begin
92 (* Check if we're at the start of input or after whitespace (blank or line break) *)
93 match Input.peek_back t.input with
94 | None -> () (* Start of input - OK *)
95 | Some c when Input.is_whitespace c -> () (* After whitespace - OK *)
96 | _ ->
97 (* Comment not preceded by whitespace - ERROR *)
98 Error.raise_at (Input.mark t.input) Invalid_comment
99 end;
100 (* Skip to end of line *)
101 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
102 ignore (Input.next t.input)
103 done
104 end
105
106(** Skip blanks (spaces/tabs) and return (found_tabs, found_spaces) *)
107let skip_blanks_check_tabs t =
108 let found_tab = ref false in
109 let found_space = ref false in
110 while Input.next_is_blank t.input do
111 (match Input.peek t.input with
112 | Some '\t' -> found_tab := true
113 | Some ' ' -> found_space := true
114 | _ -> ());
115 ignore (Input.next t.input)
116 done;
117 (!found_tab, !found_space)
118
119(** Skip whitespace and comments, return true if at newline *)
120let rec skip_to_next_token t =
121 (* Check for tabs used as indentation in block context *)
122 (match Input.peek t.input with
123 | Some '\t' when t.flow_level = 0 && t.leading_whitespace &&
124 (column t - 1) < current_indent t ->
125 (* Tab found in indentation zone - this is invalid *)
126 (* Skip to end of line to check if line has content *)
127 let start_pos = Input.mark t.input in
128 while Input.next_is_blank t.input do
129 ignore (Input.next t.input)
130 done;
131 (* If we have content on this line with a tab, raise error *)
132 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then
133 Error.raise_at start_pos Tab_in_indentation
134 | _ -> ());
135
136 (* Skip blanks and validate comments *)
137 skip_whitespace_and_comment t;
138 (* Skip line break in block context *)
139 if t.flow_level = 0 && Input.next_is_break t.input then begin
140 Input.consume_break t.input;
141 t.allow_simple_key <- true;
142 t.leading_whitespace <- true;
143 skip_to_next_token t
144 end
145 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin
146 (* In flow context, skip all whitespace including line breaks *)
147 if Input.next_is_break t.input then begin
148 Input.consume_break t.input;
149 (* Allow simple keys after line breaks in flow context *)
150 t.allow_simple_key <- true;
151 (* After line break in flow, check for tabs at start of line (Y79Y/03)
152 Tabs are not allowed as indentation - if tab is first char and results
153 in a column less than flow_indent, it's an error *)
154 if Input.next_is (( = ) '\t') t.input then begin
155 (* Tab at start of line in flow context - skip tabs and check position *)
156 let start_mark = Input.mark t.input in
157 while Input.next_is (( = ) '\t') t.input do
158 ignore (Input.next t.input)
159 done;
160 (* If only tabs were used (no spaces) and column < flow_indent, error *)
161 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) &&
162 column t < t.flow_indent then
163 Error.raise_at start_mark Invalid_flow_indentation
164 end;
165 skip_to_next_token t
166 end else begin
167 ignore (Input.next t.input);
168 skip_to_next_token t
169 end
170 end
171
172(** Roll the indentation level *)
173let roll_indent t col =
174 if t.flow_level = 0 && col > current_indent t then begin
175 t.indent_stack <- { indent = col; needs_block_end = true } :: t.indent_stack;
176 true
177 end else
178 false
179
180(** Unroll indentation to given column *)
181let unroll_indent t col =
182 while t.flow_level = 0 &&
183 match t.indent_stack with
184 | { indent; needs_block_end = true; _ } :: _ when indent > col -> true
185 | _ -> false
186 do
187 match t.indent_stack with
188 | { indent = _; needs_block_end = true; _ } :: rest ->
189 let pos = Input.position t.input in
190 let span = Span.point pos in
191 emit t span Token.Block_end;
192 t.indent_stack <- rest
193 | _ -> ()
194 done
195
196(** Save a potential simple key *)
197let save_simple_key t =
198 if t.allow_simple_key then begin
199 (* A simple key is required only if we're in a block context,
200 at the current indentation level, AND the current indent needs a block end.
201 This matches saphyr's logic and prevents false positives for values. *)
202 let required = t.flow_level = 0 &&
203 match t.indent_stack with
204 | { indent; needs_block_end = true; _ } :: _ ->
205 indent = column t
206 | _ -> false
207 in
208 let sk = {
209 sk_possible = true;
210 sk_required = required;
211 sk_token_number = t.token_number;
212 sk_position = Input.position t.input;
213 } in
214 (* Remove any existing simple key at current level *)
215 t.simple_keys <- (
216 match t.simple_keys with
217 | _ :: rest -> Some sk :: rest
218 | [] -> [Some sk]
219 )
220 end
221
222(** Remove simple key at current level *)
223let remove_simple_key t =
224 match t.simple_keys with
225 | Some sk :: _rest when sk.sk_required ->
226 Error.raise_at sk.sk_position Expected_key
227 | _ :: rest -> t.simple_keys <- None :: rest
228 | [] -> ()
229
230(** Stale simple keys that span too many tokens *)
231let stale_simple_keys t =
232 t.simple_keys <- List.map (fun sk_opt ->
233 match sk_opt with
234 | Some sk when sk.sk_possible &&
235 (Input.position t.input).line > sk.sk_position.line &&
236 t.flow_level = 0 ->
237 if sk.sk_required then
238 Error.raise_at sk.sk_position Expected_key;
239 None
240 | _ -> sk_opt
241 ) t.simple_keys
242
243(** Read anchor or alias name *)
244let scan_anchor_alias t =
245 let start = Input.mark t.input in
246 let buf = Buffer.create 16 in
247 (* Per YAML 1.2 spec: anchor names can contain any character that is NOT:
248 - Whitespace (space, tab, line breaks)
249 - Flow indicators: []{}
250 - Comma (,)
251 This matches the saphyr implementation: is_yaml_non_space && !is_flow *)
252 while
253 match Input.peek t.input with
254 | Some c when not (Input.is_whitespace c) &&
255 not (Input.is_flow_indicator c) &&
256 c <> '\x00' ->
257 Buffer.add_char buf c;
258 ignore (Input.next t.input);
259 true
260 | _ -> false
261 do () done;
262 let name = Buffer.contents buf in
263 if String.length name = 0 then
264 Error.raise_at start (Invalid_anchor "empty anchor name");
265 (name, Span.make ~start ~stop:(Input.mark t.input))
266
267(** Scan tag handle *)
268let scan_tag_handle t =
269 let start = Input.mark t.input in
270 let buf = Buffer.create 16 in
271 (* Expect ! *)
272 (match Input.peek t.input with
273 | Some '!' ->
274 Buffer.add_char buf '!';
275 ignore (Input.next t.input)
276 | _ -> Error.raise_at start (Invalid_tag "expected '!'"));
277 (* Read word chars *)
278 while
279 match Input.peek t.input with
280 | Some c when Input.is_alnum c || c = '-' ->
281 Buffer.add_char buf c;
282 ignore (Input.next t.input);
283 true
284 | _ -> false
285 do () done;
286 (* Check for secondary ! *)
287 (match Input.peek t.input with
288 | Some '!' ->
289 Buffer.add_char buf '!';
290 ignore (Input.next t.input)
291 | _ -> ());
292 Buffer.contents buf
293
294(** Scan tag suffix (after handle) *)
295let scan_tag_suffix t =
296 let is_hex_digit c =
297 (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')
298 in
299 let hex_val c =
300 match c with
301 | '0'..'9' -> Char.code c - Char.code '0'
302 | 'A'..'F' -> Char.code c - Char.code 'A' + 10
303 | 'a'..'f' -> Char.code c - Char.code 'a' + 10
304 | _ -> 0
305 in
306 let buf = Buffer.create 32 in
307 while
308 match Input.peek t.input with
309 | Some '%' ->
310 (* Percent-encoded character *)
311 ignore (Input.next t.input);
312 (match Input.peek t.input, Input.peek_nth t.input 1 with
313 | Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 ->
314 ignore (Input.next t.input);
315 ignore (Input.next t.input);
316 let code = (hex_val c1) * 16 + (hex_val c2) in
317 Buffer.add_char buf (Char.chr code);
318 true
319 | _ ->
320 (* Invalid percent encoding - keep the % *)
321 Buffer.add_char buf '%';
322 true)
323 | Some c when not (Input.is_whitespace c) &&
324 not (Input.is_flow_indicator c) ->
325 Buffer.add_char buf c;
326 ignore (Input.next t.input);
327 true
328 | _ -> false
329 do () done;
330 Buffer.contents buf
331
332(** Scan a tag *)
333let scan_tag t =
334 let start = Input.mark t.input in
335 ignore (Input.next t.input); (* consume ! *)
336 let handle, suffix =
337 match Input.peek t.input with
338 | Some '<' ->
339 (* Verbatim tag: !<...> - handle is empty, suffix is full URI *)
340 ignore (Input.next t.input);
341 let buf = Buffer.create 32 in
342 while
343 match Input.peek t.input with
344 | Some '>' -> false
345 | Some c ->
346 Buffer.add_char buf c;
347 ignore (Input.next t.input);
348 true
349 | None -> Error.raise_at (Input.mark t.input) (Invalid_tag "unclosed verbatim tag")
350 do () done;
351 ignore (Input.next t.input); (* consume > *)
352 ("", Buffer.contents buf)
353 | Some c when Input.is_whitespace c || Input.is_flow_indicator c ->
354 (* Non-specific tag: ! *)
355 ("!", "")
356 | Some '!' ->
357 (* Secondary handle: !! *)
358 ignore (Input.next t.input); (* consume second ! *)
359 let suffix = scan_tag_suffix t in
360 ("!!", suffix)
361 | _ ->
362 (* Primary handle or just suffix: !foo or !e!foo *)
363 (* Read alphanumeric characters *)
364 let buf = Buffer.create 16 in
365 while
366 match Input.peek t.input with
367 | Some c when Input.is_alnum c || c = '-' ->
368 Buffer.add_char buf c;
369 ignore (Input.next t.input);
370 true
371 | _ -> false
372 do () done;
373 (* Check if next character is ! - if so, this is a named handle *)
374 (match Input.peek t.input with
375 | Some '!' ->
376 (* Named handle like !e! *)
377 ignore (Input.next t.input);
378 let handle_name = Buffer.contents buf in
379 let suffix = scan_tag_suffix t in
380 ("!" ^ handle_name ^ "!", suffix)
381 | _ ->
382 (* Just ! followed by suffix *)
383 ("!", Buffer.contents buf ^ scan_tag_suffix t))
384 in
385 (* Validate that tag is followed by whitespace, break, or (in flow) flow indicator *)
386 (match Input.peek t.input with
387 | None -> () (* EOF is ok *)
388 | Some c when Input.is_whitespace c || Input.is_break c -> ()
389 | Some c when t.flow_level > 0 && Input.is_flow_indicator c -> ()
390 | _ -> Error.raise_at start (Invalid_tag "expected whitespace or line break after tag"));
391 let span = Span.make ~start ~stop:(Input.mark t.input) in
392 (handle, suffix, span)
393
394(** Scan single-quoted scalar *)
395let scan_single_quoted t =
396 let start = Input.mark t.input in
397 ignore (Input.next t.input); (* consume opening single-quote *)
398 let buf = Buffer.create 64 in
399 let whitespace = Buffer.create 16 in (* Track trailing whitespace *)
400
401 let flush_whitespace () =
402 if Buffer.length whitespace > 0 then begin
403 Buffer.add_buffer buf whitespace;
404 Buffer.clear whitespace
405 end
406 in
407
408 let rec loop () =
409 match Input.peek t.input with
410 | None -> Error.raise_at start Unclosed_single_quote
411 | Some '\'' ->
412 ignore (Input.next t.input);
413 (* Check for escaped quote ('') *)
414 (match Input.peek t.input with
415 | Some '\'' ->
416 flush_whitespace ();
417 Buffer.add_char buf '\'';
418 ignore (Input.next t.input);
419 loop ()
420 | _ ->
421 (* End of string - flush any trailing whitespace *)
422 flush_whitespace ())
423 | Some ' ' | Some '\t' ->
424 (* Track whitespace - don't add to buf yet *)
425 Buffer.add_char whitespace (Option.get (Input.peek t.input));
426 ignore (Input.next t.input);
427 loop ()
428 | Some '\n' | Some '\r' ->
429 (* Discard trailing whitespace before line break *)
430 Buffer.clear whitespace;
431 Input.consume_break t.input;
432 (* Skip leading whitespace on next line *)
433 while Input.next_is_blank t.input do
434 ignore (Input.next t.input)
435 done;
436 (* Check for document boundary *)
437 if Input.at_document_boundary t.input then
438 Error.raise_at start Unclosed_single_quote;
439 (* Check indentation: continuation must be > block indent (QB6E, DK95) *)
440 let col = column t in
441 let indent = current_indent t in
442 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then
443 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
444 (* Count empty lines (consecutive line breaks) *)
445 let empty_lines = ref 0 in
446 while Input.next_is_break t.input do
447 incr empty_lines;
448 Input.consume_break t.input;
449 while Input.next_is_blank t.input do
450 ignore (Input.next t.input)
451 done;
452 if Input.at_document_boundary t.input then
453 Error.raise_at start Unclosed_single_quote;
454 (* Check indentation after each empty line too *)
455 let col = column t in
456 let indent = current_indent t in
457 if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then
458 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar")
459 done;
460 (* Apply folding rules *)
461 if !empty_lines > 0 then begin
462 (* Empty lines: preserve as newlines *)
463 for _ = 1 to !empty_lines do
464 Buffer.add_char buf '\n'
465 done
466 end else
467 (* Single break: fold to space (even at start of string) *)
468 Buffer.add_char buf ' ';
469 loop ()
470 | Some c ->
471 flush_whitespace ();
472 Buffer.add_char buf c;
473 ignore (Input.next t.input);
474 loop ()
475 in
476 loop ();
477 let span = Span.make ~start ~stop:(Input.mark t.input) in
478 (Buffer.contents buf, span)
479
480(** Decode hex escape of given length *)
481let decode_hex t len =
482 let start = Input.mark t.input in
483 let buf = Buffer.create len in
484 for _ = 1 to len do
485 match Input.peek t.input with
486 | Some c when Input.is_hex c ->
487 Buffer.add_char buf c;
488 ignore (Input.next t.input)
489 | _ ->
490 Error.raise_at start (Invalid_hex_escape (Buffer.contents buf))
491 done;
492 let code = int_of_string ("0x" ^ Buffer.contents buf) in
493 if code <= 0x7F then
494 String.make 1 (Char.chr code)
495 else if code <= 0x7FF then
496 let b1 = 0xC0 lor (code lsr 6) in
497 let b2 = 0x80 lor (code land 0x3F) in
498 String.init 2 (fun i -> Char.chr (if i = 0 then b1 else b2))
499 else if code <= 0xFFFF then
500 let b1 = 0xE0 lor (code lsr 12) in
501 let b2 = 0x80 lor ((code lsr 6) land 0x3F) in
502 let b3 = 0x80 lor (code land 0x3F) in
503 String.init 3 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | _ -> b3))
504 else
505 let b1 = 0xF0 lor (code lsr 18) in
506 let b2 = 0x80 lor ((code lsr 12) land 0x3F) in
507 let b3 = 0x80 lor ((code lsr 6) land 0x3F) in
508 let b4 = 0x80 lor (code land 0x3F) in
509 String.init 4 (fun i -> Char.chr (match i with 0 -> b1 | 1 -> b2 | 2 -> b3 | _ -> b4))
510
511(** Scan double-quoted scalar *)
512let scan_double_quoted t =
513 let start = Input.mark t.input in
514 ignore (Input.next t.input); (* consume opening double-quote *)
515 let buf = Buffer.create 64 in
516 let whitespace = Buffer.create 16 in (* Track pending whitespace *)
517
518 let flush_whitespace () =
519 if Buffer.length whitespace > 0 then begin
520 Buffer.add_buffer buf whitespace;
521 Buffer.clear whitespace
522 end
523 in
524
525 let rec loop () =
526 match Input.peek t.input with
527 | None -> Error.raise_at start Unclosed_double_quote
528 | Some '"' ->
529 (* Flush trailing whitespace before closing quote to preserve it *)
530 flush_whitespace ();
531 ignore (Input.next t.input)
532 | Some ' ' | Some '\t' as c_opt ->
533 (* Track whitespace - don't add to buf yet *)
534 let c = match c_opt with Some c -> c | None -> assert false in
535 Buffer.add_char whitespace c;
536 ignore (Input.next t.input);
537 loop ()
538 | Some '\\' ->
539 (* Escape sequence - this is non-whitespace content *)
540 flush_whitespace (); (* Commit any pending whitespace *)
541 ignore (Input.next t.input);
542 (match Input.peek t.input with
543 | None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>")
544 | Some '0' -> Buffer.add_char buf '\x00'; ignore (Input.next t.input)
545 | Some 'a' -> Buffer.add_char buf '\x07'; ignore (Input.next t.input)
546 | Some 'b' -> Buffer.add_char buf '\x08'; ignore (Input.next t.input)
547 | Some 't' | Some '\t' -> Buffer.add_char buf '\t'; ignore (Input.next t.input)
548 | Some 'n' -> Buffer.add_char buf '\n'; ignore (Input.next t.input)
549 | Some 'v' -> Buffer.add_char buf '\x0B'; ignore (Input.next t.input)
550 | Some 'f' -> Buffer.add_char buf '\x0C'; ignore (Input.next t.input)
551 | Some 'r' -> Buffer.add_char buf '\r'; ignore (Input.next t.input)
552 | Some 'e' -> Buffer.add_char buf '\x1B'; ignore (Input.next t.input)
553 | Some ' ' -> Buffer.add_char buf ' '; ignore (Input.next t.input)
554 | Some '"' -> Buffer.add_char buf '"'; ignore (Input.next t.input)
555 | Some '/' -> Buffer.add_char buf '/'; ignore (Input.next t.input)
556 | Some '\\' -> Buffer.add_char buf '\\'; ignore (Input.next t.input)
557 | Some 'N' -> Buffer.add_string buf "\xC2\x85"; ignore (Input.next t.input) (* NEL *)
558 | Some '_' -> Buffer.add_string buf "\xC2\xA0"; ignore (Input.next t.input) (* NBSP *)
559 | Some 'L' -> Buffer.add_string buf "\xE2\x80\xA8"; ignore (Input.next t.input) (* LS *)
560 | Some 'P' -> Buffer.add_string buf "\xE2\x80\xA9"; ignore (Input.next t.input) (* PS *)
561 | Some 'x' ->
562 ignore (Input.next t.input);
563 Buffer.add_string buf (decode_hex t 2)
564 | Some 'u' ->
565 ignore (Input.next t.input);
566 Buffer.add_string buf (decode_hex t 4)
567 | Some 'U' ->
568 ignore (Input.next t.input);
569 Buffer.add_string buf (decode_hex t 8)
570 | Some '\n' | Some '\r' ->
571 (* Line continuation escape *)
572 Input.consume_break t.input;
573 while Input.next_is_blank t.input do
574 ignore (Input.next t.input)
575 done
576 | Some c ->
577 Error.raise_at (Input.mark t.input)
578 (Invalid_escape_sequence (Printf.sprintf "\\%c" c)));
579 loop ()
580 | Some '\n' | Some '\r' ->
581 (* Line break: discard any pending trailing whitespace *)
582 Buffer.clear whitespace;
583 Input.consume_break t.input;
584 (* Count consecutive line breaks (empty lines) *)
585 let empty_lines = ref 0 in
586 let continue = ref true in
587 let started_with_tab = ref false in
588 while !continue do
589 (* Track if we start with a tab (for DK95/01 check) *)
590 if Input.next_is (( = ) '\t') t.input then started_with_tab := true;
591 (* Skip blanks (spaces/tabs) on the line *)
592 while Input.next_is_blank t.input do
593 ignore (Input.next t.input)
594 done;
595 (* Check if we hit another line break (empty line) *)
596 if Input.next_is_break t.input then begin
597 Input.consume_break t.input;
598 incr empty_lines;
599 started_with_tab := false (* Reset for next line *)
600 end else
601 continue := false
602 done;
603 (* Check for document boundary - this terminates the quoted string *)
604 if Input.at_document_boundary t.input then
605 Error.raise_at start Unclosed_double_quote;
606 (* Check indentation: continuation must be > block indent (QB6E, DK95)
607 Note: must be strictly greater than block indent, not just equal *)
608 let col = column t in
609 let indent = current_indent t in
610 let start_col = start.column in
611 (* DK95/01: if continuation started with tabs and column < start column, error *)
612 if not (Input.is_eof t.input) && !started_with_tab && col < start_col then
613 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
614 if not (Input.is_eof t.input) && col <= indent && indent >= 0 then
615 Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
616 (* Per YAML spec: single break = space, break + empty lines = newlines *)
617 if !empty_lines > 0 then begin
618 (* Empty lines: output N newlines where N = number of empty lines *)
619 for _ = 1 to !empty_lines do
620 Buffer.add_char buf '\n'
621 done
622 end else
623 (* Single break folds to space *)
624 Buffer.add_char buf ' ';
625 loop ()
626 | Some c ->
627 (* Non-whitespace character *)
628 flush_whitespace (); (* Commit any pending whitespace *)
629 Buffer.add_char buf c;
630 ignore (Input.next t.input);
631 loop ()
632 in
633 loop ();
634 let span = Span.make ~start ~stop:(Input.mark t.input) in
635 (Buffer.contents buf, span)
636
637(** Check if character can appear in plain scalar at this position *)
638let can_continue_plain t c ~in_flow =
639 match c with
640 | ':' ->
641 (* : is OK if not followed by whitespace or flow indicator *)
642 (match Input.peek_nth t.input 1 with
643 | None -> true
644 | Some c2 when Input.is_whitespace c2 -> false
645 | Some c2 when in_flow && Input.is_flow_indicator c2 -> false
646 | _ -> true)
647 | '#' ->
648 (* # is a comment indicator only if preceded by whitespace *)
649 (* Check the previous character to determine if this is a comment *)
650 (match Input.peek_back t.input with
651 | None -> true (* At start - can't be comment indicator, allow it *)
652 | Some c when Input.is_whitespace c -> false (* Preceded by whitespace - comment *)
653 | Some c when Input.is_break c -> false (* At start of line - comment *)
654 | _ -> true) (* Not preceded by whitespace - part of scalar *)
655 | c when in_flow && Input.is_flow_indicator c -> false
656 | _ when Input.is_break c -> false
657 | _ -> true
658
659(** Scan plain scalar *)
660let scan_plain_scalar t =
661 let start = Input.mark t.input in
662 let in_flow = t.flow_level > 0 in
663 let indent = current_indent t in
664 (* In flow context, scalars must be indented more than the current block indent.
665 This ensures that content at block indent or less ends the flow context. *)
666 if in_flow && (column t - 1) < indent then
667 Error.raise_at start Invalid_flow_indentation;
668 let buf = Buffer.create 64 in
669 let spaces = Buffer.create 16 in
670 let whitespace = Buffer.create 16 in (* Track whitespace within a line *)
671 let leading_blanks = ref false in
672
673 let rec scan_line () =
674 match Input.peek t.input with
675 | None -> ()
676 | Some c when Input.is_blank c && can_continue_plain t c ~in_flow ->
677 (* Blank character within a line - save to whitespace buffer *)
678 Buffer.add_char whitespace c;
679 ignore (Input.next t.input);
680 scan_line ()
681 | Some c when can_continue_plain t c ~in_flow ->
682 (* Non-blank character - process any pending breaks/whitespace first *)
683 begin
684 if Buffer.length spaces > 0 then begin
685 if !leading_blanks then begin
686 (* Fold line break *)
687 if Buffer.contents spaces = "\n" then
688 Buffer.add_char buf ' '
689 else begin
690 (* Multiple breaks - preserve all but first *)
691 let s = Buffer.contents spaces in
692 Buffer.add_substring buf s 1 (String.length s - 1)
693 end
694 end else
695 Buffer.add_buffer buf spaces;
696 Buffer.clear spaces
697 end;
698 (* Add any pending whitespace from within the line *)
699 if Buffer.length whitespace > 0 then begin
700 Buffer.add_buffer buf whitespace;
701 Buffer.clear whitespace
702 end;
703 (* Add the character *)
704 Buffer.add_char buf c;
705 ignore (Input.next t.input);
706 leading_blanks := false;
707 scan_line ()
708 end
709 | _ -> ()
710 in
711
712 let rec scan_lines () =
713 scan_line ();
714 (* Check for line continuation *)
715 if Input.next_is_break t.input then begin
716 (* Discard any trailing whitespace from the current line *)
717 Buffer.clear whitespace;
718 (* Save the line break *)
719 if !leading_blanks then begin
720 (* We already had a break - this is an additional break (empty line) *)
721 Buffer.add_char spaces '\n'
722 end else begin
723 (* First line break *)
724 Buffer.clear spaces;
725 Buffer.add_char spaces '\n';
726 leading_blanks := true
727 end;
728 Input.consume_break t.input;
729 (* Note: We do NOT set allow_simple_key here during plain scalar scanning.
730 Setting it here would incorrectly allow ':' that appears on a continuation
731 line to become a mapping indicator. The flag will be set properly after
732 the scalar ends and skip_to_next_token processes line breaks. *)
733 (* Skip leading blanks on the next line *)
734 while Input.next_is_blank t.input do
735 ignore (Input.next t.input)
736 done;
737 let col = (Input.position t.input).column in
738 (* Check indentation - stop if we're at or before the containing block's indent *)
739 (* However, allow empty lines (line breaks) to continue even if dedented *)
740 if Input.next_is_break t.input then
741 scan_lines () (* Empty line - continue *)
742 else if not in_flow && col <= indent then
743 () (* Stop - dedented or at parent level in block context *)
744 else if Input.at_document_boundary t.input then
745 () (* Stop - document boundary *)
746 else
747 scan_lines ()
748 end
749 in
750
751 scan_lines ();
752 let value = Buffer.contents buf in
753 (* Trim trailing whitespace (spaces and tabs) *)
754 let value =
755 let len = String.length value in
756 let rec find_end i =
757 if i < 0 then 0
758 else match value.[i] with
759 | ' ' | '\t' -> find_end (i - 1)
760 | _ -> i + 1
761 in
762 let end_pos = find_end (len - 1) in
763 String.sub value 0 end_pos
764 in
765 let span = Span.make ~start ~stop:(Input.mark t.input) in
766 (* Return value, span, and whether we ended with leading blanks (crossed a line break) *)
767 (value, span, !leading_blanks)
768
769(** Scan block scalar (literal | or folded >) *)
770let scan_block_scalar t literal =
771 let start = Input.mark t.input in
772 ignore (Input.next t.input); (* consume | or > *)
773
774 (* Parse header: optional indentation indicator and chomping *)
775 let explicit_indent = ref None in
776 let chomping = ref Chomping.Clip in
777
778 (* First character of header *)
779 (match Input.peek t.input with
780 | Some c when Input.is_digit c && c <> '0' ->
781 explicit_indent := Some (Char.code c - Char.code '0');
782 ignore (Input.next t.input)
783 | Some '-' -> chomping := Chomping.Strip; ignore (Input.next t.input)
784 | Some '+' -> chomping := Chomping.Keep; ignore (Input.next t.input)
785 | _ -> ());
786
787 (* Second character of header *)
788 (match Input.peek t.input with
789 | Some c when Input.is_digit c && c <> '0' && !explicit_indent = None ->
790 explicit_indent := Some (Char.code c - Char.code '0');
791 ignore (Input.next t.input)
792 | Some '-' when !chomping = Chomping.Clip ->
793 chomping := Chomping.Strip; ignore (Input.next t.input)
794 | Some '+' when !chomping = Chomping.Clip ->
795 chomping := Chomping.Keep; ignore (Input.next t.input)
796 | _ -> ());
797
798 (* Skip whitespace and optional comment *)
799 skip_whitespace_and_comment t;
800
801 (* Consume line break *)
802 if Input.next_is_break t.input then
803 Input.consume_break t.input
804 else if not (Input.is_eof t.input) then
805 Error.raise_at (Input.mark t.input)
806 (Invalid_block_scalar_header "expected newline after header");
807
808 let base_indent = current_indent t in
809 (* base_indent is the indent level from the stack, -1 if empty.
810 It's used directly for comparisons in implicit indent case. *)
811 let content_indent = ref (
812 match !explicit_indent with
813 | Some n ->
814 (* Explicit indent: base_indent is 1-indexed column, convert to 0-indexed.
815 content_indent = (base_indent - 1) + n, but at least n for document level. *)
816 let base_level = max 0 (base_indent - 1) in
817 base_level + n
818 | None -> 0 (* Will be determined by first non-empty line *)
819 ) in
820
821 let buf = Buffer.create 256 in
822 let trailing_breaks = Buffer.create 16 in
823 let leading_blank = ref false in (* Was the previous line "more indented"? *)
824 let max_empty_line_indent = ref 0 in (* Track max indent of empty lines before first content *)
825
826 (* Skip to content indentation, skipping empty lines.
827 Returns the number of spaces actually skipped (important for detecting dedentation). *)
828 let rec skip_to_content_indent () =
829 if !content_indent > 0 then begin
830 (* Explicit indent - skip up to content_indent spaces *)
831 let spaces_skipped = ref 0 in
832 while !spaces_skipped < !content_indent && Input.next_is (( = ) ' ') t.input do
833 incr spaces_skipped;
834 ignore (Input.next t.input)
835 done;
836
837 (* Check if this line is empty (only spaces/tabs until break/eof) *)
838 if Input.next_is_break t.input then begin
839 (* Empty line - record the break and continue *)
840 Buffer.add_char trailing_breaks '\n';
841 Input.consume_break t.input;
842 skip_to_content_indent ()
843 end else if !spaces_skipped < !content_indent then begin
844 (* Line starts with fewer spaces than content_indent - dedented *)
845 !spaces_skipped
846 end else if Input.next_is_blank t.input then begin
847 (* Line has spaces/tabs beyond content_indent - could be whitespace content or empty line.
848 For literal scalars, whitespace-only lines ARE content (not empty).
849 For folded scalars, whitespace-only lines that are "more indented" are preserved. *)
850 if literal then
851 (* Literal: whitespace beyond content_indent is content, let read_lines handle it *)
852 !content_indent
853 else begin
854 (* Folded: check if rest is only blanks *)
855 let idx = ref 0 in
856 while match Input.peek_nth t.input !idx with
857 | Some c when Input.is_blank c -> incr idx; true
858 | _ -> false
859 do () done;
860 match Input.peek_nth t.input (!idx) with
861 | None | Some '\n' | Some '\r' ->
862 (* Empty/whitespace-only line in folded - skip spaces *)
863 while Input.next_is_blank t.input do
864 ignore (Input.next t.input)
865 done;
866 Buffer.add_char trailing_breaks '\n';
867 Input.consume_break t.input;
868 skip_to_content_indent ()
869 | _ ->
870 (* Has non-whitespace content *)
871 !content_indent
872 end
873 end else
874 !content_indent
875 end else begin
876 (* Implicit indent - skip empty lines without consuming spaces.
877 Note: Only SPACES count as indentation. Tabs are content, not indentation.
878 So we only check for spaces when determining if a line is "empty". *)
879 if Input.next_is_break t.input then begin
880 Buffer.add_char trailing_breaks '\n';
881 Input.consume_break t.input;
882 skip_to_content_indent ()
883 end else if Input.next_is (( = ) ' ') t.input then begin
884 (* Check if line is empty (only spaces before break) *)
885 let idx = ref 0 in
886 while match Input.peek_nth t.input !idx with
887 | Some ' ' -> incr idx; true
888 | _ -> false
889 do () done;
890 match Input.peek_nth t.input (!idx) with
891 | None | Some '\n' | Some '\r' ->
892 (* Line has only spaces - empty line *)
893 (* Track max indent of empty lines for later validation *)
894 if !idx > !max_empty_line_indent then
895 max_empty_line_indent := !idx;
896 while Input.next_is (( = ) ' ') t.input do
897 ignore (Input.next t.input)
898 done;
899 Buffer.add_char trailing_breaks '\n';
900 Input.consume_break t.input;
901 skip_to_content_indent ()
902 | _ ->
903 (* Has content (including tabs which are content, not indentation) *)
904 0
905 end else if Input.next_is (( = ) '\t') t.input then begin
906 (* Tab at start of line in implicit indent mode - this is an error (Y79Y)
907 because tabs cannot be used as indentation in YAML *)
908 Error.raise_at (Input.mark t.input) Tab_in_indentation
909 end else
910 (* Not at break or space - other content character *)
911 0
912 end
913 in
914
915 (* Read content *)
916 let rec read_lines () =
917 let spaces_skipped = skip_to_content_indent () in
918
919 (* Check if we're at content *)
920 if Input.is_eof t.input then ()
921 else if Input.at_document_boundary t.input then ()
922 else begin
923 (* Count additional leading spaces beyond what was skipped *)
924 let extra_spaces = ref 0 in
925 while Input.next_is (( = ) ' ') t.input do
926 incr extra_spaces;
927 ignore (Input.next t.input)
928 done;
929
930 (* Calculate actual line indentation *)
931 let line_indent = spaces_skipped + !extra_spaces in
932
933 (* Determine content indent from first content line (implicit case) *)
934 let first_line = !content_indent = 0 in
935 (* base_indent is 1-indexed column, convert to 0-indexed for comparison with line_indent.
936 If base_indent = -1 (empty stack), then base_level = -1 means col 0 is valid. *)
937 let base_level = base_indent - 1 in
938 let should_process =
939 if !content_indent = 0 then begin
940 (* For implicit indent, content must be more indented than base_level. *)
941 if line_indent <= base_level then
942 false (* No content - first line not indented enough *)
943 else begin
944 (* Validate: first content line must be indented at least as much as
945 the maximum indent seen on empty lines before it (5LLU, S98Z, W9L4) *)
946 if line_indent < !max_empty_line_indent && line_indent > base_level then
947 Error.raise_at (Input.mark t.input)
948 (Invalid_block_scalar_header "wrongly indented line in block scalar");
949 content_indent := line_indent;
950 true
951 end
952 end else if line_indent < !content_indent then
953 false (* Dedented - done with content *)
954 else
955 true
956 in
957
958 if should_process then begin
959 (* Check if current line is "more indented" (has extra indent or starts with whitespace).
960 For folded scalars, lines that start with any whitespace (space or tab) after the
961 content indentation are "more indented" and preserve breaks.
962 Note: we check Input.next_is_blank BEFORE reading content to see if content starts with whitespace. *)
963 let trailing_blank = line_indent > !content_indent || Input.next_is_blank t.input in
964
965 (* Add trailing breaks to buffer *)
966 if Buffer.length buf > 0 then begin
967 if Buffer.length trailing_breaks > 0 then begin
968 if literal then
969 Buffer.add_buffer buf trailing_breaks
970 else begin
971 (* Folded scalar: fold only if both previous and current lines are not more-indented *)
972 if not !leading_blank && not trailing_blank then begin
973 let breaks = Buffer.contents trailing_breaks in
974 if String.length breaks = 1 then
975 Buffer.add_char buf ' '
976 else
977 Buffer.add_substring buf breaks 1 (String.length breaks - 1)
978 end else begin
979 (* Preserve breaks for more-indented lines *)
980 Buffer.add_buffer buf trailing_breaks
981 end
982 end
983 end else if not literal then
984 Buffer.add_char buf ' '
985 end else
986 Buffer.add_buffer buf trailing_breaks;
987 Buffer.clear trailing_breaks;
988
989 (* Add extra indentation for literal or more-indented folded lines *)
990 (* On the first line (when determining content_indent), we've already consumed all spaces,
991 so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *)
992 if not first_line && (literal || (!extra_spaces > 0 && not literal)) then begin
993 for _ = 1 to !extra_spaces do
994 Buffer.add_char buf ' '
995 done
996 end;
997
998 (* Read line content *)
999 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
1000 Buffer.add_char buf (Input.next_exn t.input)
1001 done;
1002
1003 (* Record trailing break *)
1004 if Input.next_is_break t.input then begin
1005 Buffer.add_char trailing_breaks '\n';
1006 Input.consume_break t.input
1007 end;
1008
1009 (* Update leading_blank for next iteration *)
1010 leading_blank := trailing_blank;
1011
1012 read_lines ()
1013 end
1014 end
1015 in
1016
1017 read_lines ();
1018
1019 (* Apply chomping *)
1020 let value =
1021 let content = Buffer.contents buf in
1022 match !chomping with
1023 | Chomping.Strip -> content
1024 | Chomping.Clip ->
1025 if String.length content > 0 then content ^ "\n" else content
1026 | Chomping.Keep ->
1027 content ^ Buffer.contents trailing_breaks
1028 in
1029
1030 let span = Span.make ~start ~stop:(Input.mark t.input) in
1031 let style = if literal then `Literal else `Folded in
1032 (value, style, span)
1033
1034(** Scan directive (after %) *)
1035let scan_directive t =
1036 let start = Input.mark t.input in
1037 ignore (Input.next t.input); (* consume % *)
1038
1039 (* Read directive name *)
1040 let name_buf = Buffer.create 16 in
1041 while
1042 match Input.peek t.input with
1043 | Some c when Input.is_alnum c || c = '-' ->
1044 Buffer.add_char name_buf c;
1045 ignore (Input.next t.input);
1046 true
1047 | _ -> false
1048 do () done;
1049 let name = Buffer.contents name_buf in
1050
1051 (* Skip blanks *)
1052 while Input.next_is_blank t.input do
1053 ignore (Input.next t.input)
1054 done;
1055
1056 match name with
1057 | "YAML" ->
1058 (* Version directive: %YAML 1.2 *)
1059 let major = ref 0 in
1060 let minor = ref 0 in
1061 (* Read major version *)
1062 while Input.next_is_digit t.input do
1063 major := !major * 10 + (Char.code (Input.next_exn t.input) - Char.code '0')
1064 done;
1065 (* Expect . *)
1066 (match Input.peek t.input with
1067 | Some '.' -> ignore (Input.next t.input)
1068 | _ -> Error.raise_at (Input.mark t.input) (Invalid_yaml_version "expected '.'"));
1069 (* Read minor version *)
1070 while Input.next_is_digit t.input do
1071 minor := !minor * 10 + (Char.code (Input.next_exn t.input) - Char.code '0')
1072 done;
1073 (* Validate: only whitespace and comments allowed before line break (MUS6) *)
1074 skip_whitespace_and_comment t;
1075 if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then
1076 Error.raise_at (Input.mark t.input) (Invalid_directive "expected comment or line break after version");
1077 let span = Span.make ~start ~stop:(Input.mark t.input) in
1078 Token.Version_directive { major = !major; minor = !minor }, span
1079
1080 | "TAG" ->
1081 (* Tag directive: %TAG !foo! tag:example.com,2000: *)
1082 let handle = scan_tag_handle t in
1083 (* Skip blanks *)
1084 while Input.next_is_blank t.input do
1085 ignore (Input.next t.input)
1086 done;
1087 (* Read prefix *)
1088 let prefix_buf = Buffer.create 32 in
1089 while
1090 match Input.peek t.input with
1091 | Some c when not (Input.is_whitespace c) ->
1092 Buffer.add_char prefix_buf c;
1093 ignore (Input.next t.input);
1094 true
1095 | _ -> false
1096 do () done;
1097 let prefix = Buffer.contents prefix_buf in
1098 let span = Span.make ~start ~stop:(Input.mark t.input) in
1099 Token.Tag_directive { handle; prefix }, span
1100
1101 | _ ->
1102 (* Reserved/Unknown directive - skip to end of line and ignore *)
1103 (* Per YAML spec, reserved directives should be ignored with a warning *)
1104 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
1105 ignore (Input.next t.input)
1106 done;
1107 let span = Span.make ~start ~stop:(Input.mark t.input) in
1108 (* Return an empty tag directive token to indicate directive was processed but ignored *)
1109 Token.Tag_directive { handle = ""; prefix = "" }, span
1110
1111(** Fetch the next token(s) into the queue *)
1112let rec fetch_next_token t =
1113 skip_to_next_token t;
1114 stale_simple_keys t;
1115 let col = column t in
1116 (* Unroll indents that are deeper than current column.
1117 Note: we use col, not col-1, to allow entries at the same level. *)
1118 unroll_indent t col;
1119
1120 (* We're about to process actual content, not leading whitespace *)
1121 t.leading_whitespace <- false;
1122
1123 if Input.is_eof t.input then
1124 fetch_stream_end t
1125 else if Input.at_document_boundary t.input then
1126 fetch_document_indicator t
1127 else begin
1128 match Input.peek t.input with
1129 | None -> fetch_stream_end t
1130 | Some '%' when (Input.position t.input).column = 1 ->
1131 fetch_directive t
1132 | Some '[' -> fetch_flow_collection_start t Token.Flow_sequence_start
1133 | Some '{' -> fetch_flow_collection_start t Token.Flow_mapping_start
1134 | Some ']' -> fetch_flow_collection_end t Token.Flow_sequence_end
1135 | Some '}' -> fetch_flow_collection_end t Token.Flow_mapping_end
1136 | Some ',' -> fetch_flow_entry t
1137 | Some '-' when t.flow_level = 0 && check_block_entry t ->
1138 fetch_block_entry t
1139 | Some '?' when check_key t ->
1140 fetch_key t
1141 | Some ':' when check_value t ->
1142 fetch_value t
1143 | Some '*' -> fetch_alias t
1144 | Some '&' -> fetch_anchor t
1145 | Some '!' -> fetch_tag t
1146 | Some '|' when t.flow_level = 0 -> fetch_block_scalar t true
1147 | Some '>' when t.flow_level = 0 -> fetch_block_scalar t false
1148 | Some '\'' -> fetch_single_quoted t
1149 | Some '"' -> fetch_double_quoted t
1150 | Some '-' when can_start_plain t ->
1151 fetch_plain_scalar t
1152 | Some '?' when can_start_plain t ->
1153 fetch_plain_scalar t
1154 | Some ':' when can_start_plain t ->
1155 fetch_plain_scalar t
1156 | Some c when can_start_plain_char c t ->
1157 fetch_plain_scalar t
1158 | Some c ->
1159 Error.raise_at (Input.mark t.input) (Unexpected_character c)
1160 end
1161
1162and fetch_stream_end t =
1163 if not t.stream_ended then begin
1164 unroll_indent t (-1);
1165 remove_simple_key t;
1166 t.allow_simple_key <- false;
1167 t.stream_ended <- true;
1168 let span = Span.point (Input.mark t.input) in
1169 emit t span Token.Stream_end
1170 end
1171
1172and fetch_document_indicator t =
1173 unroll_indent t (-1);
1174 remove_simple_key t;
1175 t.allow_simple_key <- false;
1176 let start = Input.mark t.input in
1177 let indicator = Input.peek_string t.input 3 in
1178 Input.skip t.input 3;
1179 let span = Span.make ~start ~stop:(Input.mark t.input) in
1180 let token = if indicator = "---" then Token.Document_start else Token.Document_end in
1181 (* Reset document content flag after document end marker *)
1182 if indicator = "..." then begin
1183 t.document_has_content <- false;
1184 (* After document end marker, skip whitespace and check for end of line or comment *)
1185 while Input.next_is_blank t.input do ignore (Input.next t.input) done;
1186 (match Input.peek t.input with
1187 | None -> () (* EOF is ok *)
1188 | Some c when Input.is_break c -> ()
1189 | Some '#' -> () (* Comment is ok *)
1190 | _ -> Error.raise_at start (Invalid_directive "content not allowed after document end marker on same line"))
1191 end;
1192 emit t span token
1193
1194and fetch_directive t =
1195 (* Directives can only appear:
1196 1. At stream start (before any document content)
1197 2. After a document end marker (...)
1198 If we've emitted content in the current document, we need a document end marker first *)
1199 if t.document_has_content then
1200 Error.raise_at (Input.mark t.input)
1201 (Unexpected_token "directives must be separated from document content by document end marker (...)");
1202 unroll_indent t (-1);
1203 remove_simple_key t;
1204 t.allow_simple_key <- false;
1205 let token, span = scan_directive t in
1206 emit t span token
1207
1208and fetch_flow_collection_start t token_type =
1209 save_simple_key t;
1210 (* Record indent of outermost flow collection *)
1211 if t.flow_level = 0 then
1212 t.flow_indent <- column t;
1213 t.flow_level <- t.flow_level + 1;
1214 (* Track whether this is a mapping or sequence *)
1215 let is_mapping = (token_type = Token.Flow_mapping_start) in
1216 t.flow_mapping_stack <- is_mapping :: t.flow_mapping_stack;
1217 t.allow_simple_key <- true;
1218 t.simple_keys <- None :: t.simple_keys;
1219 t.document_has_content <- true;
1220 let start = Input.mark t.input in
1221 ignore (Input.next t.input);
1222 let span = Span.make ~start ~stop:(Input.mark t.input) in
1223 emit t span token_type
1224
1225and fetch_flow_collection_end t token_type =
1226 remove_simple_key t;
1227 t.flow_level <- t.flow_level - 1;
1228 t.flow_mapping_stack <- (match t.flow_mapping_stack with _ :: rest -> rest | [] -> []);
1229 t.simple_keys <- (match t.simple_keys with _ :: rest -> rest | [] -> []);
1230 t.allow_simple_key <- false;
1231 let start = Input.mark t.input in
1232 ignore (Input.next t.input);
1233 (* Allow adjacent values after flow collection ends *)
1234 if t.flow_level > 0 then
1235 t.adjacent_value_allowed_at <- Some (Input.position t.input);
1236 let span = Span.make ~start ~stop:(Input.mark t.input) in
1237 emit t span token_type
1238
1239and fetch_flow_entry t =
1240 remove_simple_key t;
1241 t.allow_simple_key <- true;
1242 let start = Input.mark t.input in
1243 ignore (Input.next t.input);
1244 let span = Span.make ~start ~stop:(Input.mark t.input) in
1245 emit t span Token.Flow_entry
1246
1247and check_block_entry t =
1248 (* - followed by whitespace or EOF *)
1249 match Input.peek_nth t.input 1 with
1250 | None -> true
1251 | Some c -> Input.is_whitespace c
1252
1253and fetch_block_entry t =
1254 if t.flow_level = 0 then begin
1255 (* Block entries require allow_simple_key to be true.
1256 This prevents block sequences on the same line as a mapping value,
1257 e.g., "key: - a" is invalid. *)
1258 if not t.allow_simple_key then
1259 Error.raise_at (Input.mark t.input) Block_sequence_disallowed;
1260 let col = column t in
1261 if roll_indent t col then begin
1262 let span = Span.point (Input.mark t.input) in
1263 emit t span Token.Block_sequence_start
1264 end
1265 end;
1266 remove_simple_key t;
1267 t.allow_simple_key <- true;
1268 t.document_has_content <- true;
1269 let start = Input.mark t.input in
1270 ignore (Input.next t.input);
1271
1272 (* Check for tabs after - : pattern like -\t- is invalid *)
1273 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in
1274 if found_tabs then begin
1275 (* If we found tabs and next char is - followed by whitespace, error *)
1276 match Input.peek t.input with
1277 | Some '-' ->
1278 (match Input.peek_nth t.input 1 with
1279 | None -> Error.raise_at start Tab_in_indentation
1280 | Some c when Input.is_whitespace c ->
1281 Error.raise_at start Tab_in_indentation
1282 | Some _ -> ())
1283 | _ -> ()
1284 end;
1285
1286 let span = Span.make ~start ~stop:(Input.mark t.input) in
1287 emit t span Token.Block_entry
1288
1289and check_key t =
1290 (* ? followed by whitespace or flow indicator in both block and flow *)
1291 match Input.peek_nth t.input 1 with
1292 | None -> true
1293 | Some c ->
1294 Input.is_whitespace c ||
1295 (t.flow_level > 0 && Input.is_flow_indicator c)
1296
1297and fetch_key t =
1298 if t.flow_level = 0 then begin
1299 if not t.allow_simple_key then
1300 Error.raise_at (Input.mark t.input) Expected_key;
1301 let col = column t in
1302 if roll_indent t col then begin
1303 let span = Span.point (Input.mark t.input) in
1304 emit t span Token.Block_mapping_start
1305 end
1306 end;
1307 remove_simple_key t;
1308 t.allow_simple_key <- t.flow_level = 0;
1309 t.document_has_content <- true;
1310 let start = Input.mark t.input in
1311 ignore (Input.next t.input);
1312
1313 (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *)
1314 let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in
1315 if found_tabs && t.flow_level = 0 then begin
1316 (* In block context, tabs after ? are not allowed *)
1317 Error.raise_at start Tab_in_indentation
1318 end;
1319
1320 let span = Span.make ~start ~stop:(Input.mark t.input) in
1321 emit t span Token.Key
1322
1323and check_value t =
1324 (* : followed by whitespace in block, or whitespace/flow indicator in flow, or adjacent value *)
1325 match Input.peek_nth t.input 1 with
1326 | None -> true
1327 | Some c ->
1328 Input.is_whitespace c ||
1329 (t.flow_level > 0 && Input.is_flow_indicator c) ||
1330 (* Allow adjacent values in flow context at designated positions *)
1331 (t.flow_level > 0 &&
1332 match t.adjacent_value_allowed_at with
1333 | Some pos -> pos.Position.line = (Input.position t.input).Position.line &&
1334 pos.Position.column = (Input.position t.input).Position.column
1335 | None -> false)
1336
1337and fetch_value t =
1338 let start = Input.mark t.input in
1339 (* Check for simple key *)
1340 let used_simple_key =
1341 match t.simple_keys with
1342 | Some sk :: _ when sk.sk_possible ->
1343 (* In implicit flow mapping (inside a flow sequence), key and : must be on the same line.
1344 In explicit flow mapping { }, key and : can span lines. *)
1345 let is_implicit_flow_mapping = match t.flow_mapping_stack with
1346 | false :: _ -> true (* false = we're in a sequence, so any mapping is implicit *)
1347 | _ -> false
1348 in
1349 if is_implicit_flow_mapping && sk.sk_position.line < (Input.position t.input).line then
1350 Error.raise_at start Illegal_flow_key_line;
1351 (* Insert KEY token before the simple key value *)
1352 let key_span = Span.point sk.sk_position in
1353 let key_token = { Token.token = Token.Key; span = key_span } in
1354 (* We need to insert at the right position *)
1355 let tokens = Queue.to_seq t.tokens |> Array.of_seq in
1356 Queue.clear t.tokens;
1357 let insert_pos = sk.sk_token_number - t.tokens_taken in
1358 Array.iteri (fun i tok ->
1359 if i = insert_pos then Queue.add key_token t.tokens;
1360 Queue.add tok t.tokens
1361 ) tokens;
1362 if insert_pos >= Array.length tokens then
1363 Queue.add key_token t.tokens;
1364 t.token_number <- t.token_number + 1;
1365 (* Roll indent for implicit block mapping *)
1366 if t.flow_level = 0 then begin
1367 let col = sk.sk_position.column in
1368 if roll_indent t col then begin
1369 let span = Span.point sk.sk_position in
1370 (* Insert block mapping start before key *)
1371 let bm_token = { Token.token = Token.Block_mapping_start; span } in
1372 let tokens = Queue.to_seq t.tokens |> Array.of_seq in
1373 Queue.clear t.tokens;
1374 Array.iteri (fun i tok ->
1375 if i = insert_pos then Queue.add bm_token t.tokens;
1376 Queue.add tok t.tokens
1377 ) tokens;
1378 if insert_pos >= Array.length tokens then
1379 Queue.add bm_token t.tokens;
1380 t.token_number <- t.token_number + 1
1381 end
1382 end;
1383 t.simple_keys <- None :: (List.tl t.simple_keys);
1384 true
1385 | _ ->
1386 (* No simple key - this is a complex value (or empty key) *)
1387 if t.flow_level = 0 then begin
1388 if not t.allow_simple_key then
1389 Error.raise_at (Input.mark t.input) Expected_key;
1390 let col = column t in
1391 if roll_indent t col then begin
1392 let span = Span.point (Input.mark t.input) in
1393 emit t span Token.Block_mapping_start
1394 end
1395 (* Note: We don't emit KEY here. Empty key handling is done by the parser,
1396 which emits empty scalar when it sees VALUE without preceding KEY. *)
1397 end;
1398 false
1399 in
1400 remove_simple_key t;
1401 (* In block context without simple key, allow simple keys for compact mappings like ": moon: white"
1402 In flow context or after using a simple key, disallow simple keys *)
1403 t.allow_simple_key <- (not used_simple_key) && (t.flow_level = 0);
1404 t.document_has_content <- true;
1405 let start = Input.mark t.input in
1406 ignore (Input.next t.input);
1407
1408 (* Check for tabs after : : patterns like :\t- or :\tkey: are invalid in block context (Y79Y/09)
1409 However, :\t bar (tab followed by space then content) is valid (6BCT) *)
1410 let (found_tabs, found_spaces) = skip_blanks_check_tabs t in
1411 if found_tabs && not found_spaces && t.flow_level = 0 then begin
1412 (* In block context, tabs-only after : followed by indicator or alphanumeric are not allowed *)
1413 match Input.peek t.input with
1414 | Some ('-' | '?') ->
1415 Error.raise_at start Tab_in_indentation
1416 | Some c when (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ->
1417 (* Tab-only followed by alphanumeric - likely a key, which is invalid *)
1418 Error.raise_at start Tab_in_indentation
1419 | _ -> ()
1420 end;
1421
1422 (* Skip any comment that may follow the colon and whitespace *)
1423 skip_whitespace_and_comment t;
1424
1425 let span = Span.make ~start ~stop:(Input.mark t.input) in
1426 emit t span Token.Value
1427
1428and fetch_anchor_or_alias t ~is_alias =
1429 save_simple_key t;
1430 t.allow_simple_key <- false;
1431 t.document_has_content <- true;
1432 let start = Input.mark t.input in
1433 ignore (Input.next t.input); (* consume * or & *)
1434 let name, span = scan_anchor_alias t in
1435 let span = Span.make ~start ~stop:span.stop in
1436 let token = if is_alias then Token.Alias name else Token.Anchor name in
1437 emit t span token
1438
1439and fetch_alias t = fetch_anchor_or_alias t ~is_alias:true
1440and fetch_anchor t = fetch_anchor_or_alias t ~is_alias:false
1441
1442and fetch_tag t =
1443 save_simple_key t;
1444 t.allow_simple_key <- false;
1445 t.document_has_content <- true;
1446 let handle, suffix, span = scan_tag t in
1447 emit t span (Token.Tag { handle; suffix })
1448
1449and fetch_block_scalar t literal =
1450 remove_simple_key t;
1451 t.allow_simple_key <- true;
1452 t.document_has_content <- true;
1453 let value, style, span = scan_block_scalar t literal in
1454 emit t span (Token.Scalar { style; value })
1455
1456and fetch_quoted t ~double =
1457 save_simple_key t;
1458 t.allow_simple_key <- false;
1459 t.document_has_content <- true;
1460 let value, span =
1461 if double then scan_double_quoted t else scan_single_quoted t
1462 in
1463 (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
1464 skip_to_next_token t;
1465 if t.flow_level > 0 then
1466 t.adjacent_value_allowed_at <- Some (Input.position t.input);
1467 let style = if double then `Double_quoted else `Single_quoted in
1468 emit t span (Token.Scalar { style; value })
1469
1470and fetch_single_quoted t = fetch_quoted t ~double:false
1471and fetch_double_quoted t = fetch_quoted t ~double:true
1472
1473and can_start_plain t =
1474 (* Check if - ? : can start a plain scalar *)
1475 match Input.peek_nth t.input 1 with
1476 | None -> false
1477 | Some c ->
1478 not (Input.is_whitespace c) &&
1479 (t.flow_level = 0 || not (Input.is_flow_indicator c))
1480
1481and can_start_plain_char c _t =
1482 (* Characters that can start a plain scalar *)
1483 if Input.is_whitespace c then false
1484 else if Input.is_indicator c then false
1485 else true
1486
1487and fetch_plain_scalar t =
1488 save_simple_key t;
1489 t.allow_simple_key <- false;
1490 t.document_has_content <- true;
1491 let value, span, ended_with_linebreak = scan_plain_scalar t in
1492 (* If the plain scalar ended after crossing a line break (leading_blanks = true),
1493 allow simple keys. This is important because the scanner already consumed the
1494 line break and leading whitespace when checking for continuation. *)
1495 if ended_with_linebreak then
1496 t.allow_simple_key <- true;
1497 emit t span (Token.Scalar { style = `Plain; value })
1498
1499(** Check if we need more tokens to resolve simple keys *)
1500let need_more_tokens t =
1501 if t.stream_ended then false
1502 else if Queue.is_empty t.tokens then true
1503 else
1504 (* Check if any simple key could affect the first queued token *)
1505 List.exists (function
1506 | Some sk when sk.sk_possible ->
1507 sk.sk_token_number >= t.tokens_taken
1508 | _ -> false
1509 ) t.simple_keys
1510
1511(** Ensure we have enough tokens to return one safely *)
1512let ensure_tokens t =
1513 if not t.stream_started then begin
1514 t.stream_started <- true;
1515 let span = Span.point (Input.position t.input) in
1516 let encoding, _ = Encoding.detect (Input.source t.input) in
1517 emit t span (Token.Stream_start encoding)
1518 end;
1519 while need_more_tokens t do
1520 fetch_next_token t
1521 done
1522
1523(** Get next token *)
1524let next t =
1525 ensure_tokens t;
1526 if Queue.is_empty t.tokens then
1527 None
1528 else begin
1529 t.tokens_taken <- t.tokens_taken + 1;
1530 Some (Queue.pop t.tokens)
1531 end
1532
1533(** Peek at next token *)
1534let peek t =
1535 ensure_tokens t;
1536 Queue.peek_opt t.tokens
1537
1538(** Iterate over all tokens *)
1539let iter f t =
1540 let rec loop () =
1541 match next t with
1542 | None -> ()
1543 | Some tok -> f tok; loop ()
1544 in
1545 loop ()
1546
1547(** Fold over all tokens *)
1548let fold f init t =
1549 let rec loop acc =
1550 match next t with
1551 | None -> acc
1552 | Some tok -> loop (f acc tok)
1553 in
1554 loop init
1555
1556(** Convert to list *)
1557let to_list t =
1558 fold (fun acc tok -> tok :: acc) [] t |> List.rev