My agentic slop goes here. Not intended for anyone else!
1(*
2 * Copyright (c) 2014, OCaml.org project
3 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 *)
17
18(** Internal utility for HTML to Markdown conversion *)
19
20[@@@warning "-32"] (* Suppress unused value warnings for internal utilities *)
21
22(** HTML to Markdown converter using Lambda Soup *)
23
24(** Extract all links from HTML content *)
25let extract_links html_str =
26 try
27 let soup = Soup.parse html_str in
28 let links = Soup.select "a[href]" soup in
29 Soup.fold (fun acc link ->
30 match Soup.attribute "href" link with
31 | Some href ->
32 (* Filter out local anchors and only include absolute external URLs *)
33 let uri = Uri.of_string href in
34 let is_absolute = Uri.scheme uri <> None in
35 let is_local_anchor = String.starts_with ~prefix:"#" href in
36 if is_absolute && not is_local_anchor then
37 let text = Soup.texts link |> String.concat "" |> String.trim in
38 (href, text) :: acc
39 else
40 acc
41 | None -> acc
42 ) [] links
43 |> List.rev
44 with _ -> []
45
46(** Check if string contains any whitespace *)
47let has_whitespace s =
48 try
49 let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
50 true
51 with Not_found -> false
52
53(** Clean up excessive newlines and normalize spacing *)
54let cleanup_markdown s =
55 (* Normalize line endings *)
56 let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
57
58 (* Remove trailing whitespace from each line *)
59 let lines = String.split_on_char '\n' s in
60 let lines = List.map (fun line ->
61 (* Trim trailing spaces but preserve leading spaces for indentation *)
62 let len = String.length line in
63 let rec find_last_non_space i =
64 if i < 0 then -1
65 else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
66 else i
67 in
68 let last = find_last_non_space (len - 1) in
69 if last < 0 then ""
70 else String.sub line 0 (last + 1)
71 ) lines in
72
73 (* Join back and collapse excessive blank lines *)
74 let s = String.concat "\n" lines in
75
76 (* Replace 3+ consecutive newlines with just 2 *)
77 let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
78
79 (* Trim leading and trailing whitespace *)
80 String.trim s
81
82(** Convert HTML to Markdown using state-based whitespace handling *)
83let html_to_markdown html_str =
84 try
85 let soup = Soup.parse html_str in
86 let buffer = Buffer.create 256 in
87
88 (* State: track if we need to insert a space before next text *)
89 let need_space = ref false in
90
91 (* Get last character in buffer, if any *)
92 let last_char () =
93 let len = Buffer.length buffer in
94 if len = 0 then None
95 else Some (Buffer.nth buffer (len - 1))
96 in
97
98 (* Add text with proper spacing *)
99 let add_text text =
100 let trimmed = String.trim text in
101 if trimmed <> "" then begin
102 (* Check if text starts with punctuation that shouldn't have space before it *)
103 let starts_with_punctuation =
104 String.length trimmed > 0 &&
105 (match trimmed.[0] with
106 | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
107 | _ -> false)
108 in
109
110 (* Add space if needed, unless we're before punctuation *)
111 if !need_space && not starts_with_punctuation then begin
112 match last_char () with
113 | Some (' ' | '\n') -> ()
114 | _ -> Buffer.add_char buffer ' '
115 end;
116 Buffer.add_string buffer trimmed;
117 need_space := false
118 end
119 in
120
121 (* Mark that we need space before next text (for inline elements) *)
122 let mark_space_needed () =
123 need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
124 in
125
126 (* Process header with ID/anchor handling *)
127 let process_header level elem =
128 need_space := false;
129
130 (* Check if header contains a link with an ID fragment *)
131 let link_opt = Soup.select_one "a[href]" elem in
132 let anchor_id = match link_opt with
133 | Some link ->
134 (match Soup.attribute "href" link with
135 | Some href ->
136 (* Extract fragment from URL *)
137 let uri = Uri.of_string href in
138 Uri.fragment uri
139 | None -> None)
140 | None -> None
141 in
142
143 (* Add anchor if we found an ID *)
144 (match anchor_id with
145 | Some id when id <> "" ->
146 Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id)
147 | _ -> ());
148
149 (* Add the header marker *)
150 let marker = String.make level '#' in
151 Buffer.add_string buffer ("\n" ^ marker ^ " ");
152
153 (* Get text content, excluding link tags *)
154 let text = Soup.texts elem |> String.concat " " |> String.trim in
155 Buffer.add_string buffer text;
156
157 Buffer.add_string buffer "\n\n";
158 need_space := false
159 in
160
161 let rec process_node node =
162 match Soup.element node with
163 | Some elem ->
164 let tag = Soup.name elem in
165 (match tag with
166 (* Block elements - reset space tracking *)
167 | "h1" -> process_header 1 elem
168 | "h2" -> process_header 2 elem
169 | "h3" -> process_header 3 elem
170 | "h4" -> process_header 4 elem
171 | "h5" -> process_header 5 elem
172 | "h6" -> process_header 6 elem
173 | "p" ->
174 need_space := false;
175 Soup.children elem |> Soup.iter process_node;
176 Buffer.add_string buffer "\n\n";
177 need_space := false
178 | "br" ->
179 Buffer.add_string buffer "\n";
180 need_space := false
181 (* Inline elements - preserve space tracking *)
182 | "strong" | "b" ->
183 (* Add space before if needed *)
184 if !need_space then begin
185 match last_char () with
186 | Some (' ' | '\n') -> ()
187 | _ -> Buffer.add_char buffer ' '
188 end;
189 Buffer.add_string buffer "**";
190 need_space := false;
191 Soup.children elem |> Soup.iter process_node;
192 Buffer.add_string buffer "**";
193 mark_space_needed ()
194 | "em" | "i" ->
195 (* Add space before if needed *)
196 if !need_space then begin
197 match last_char () with
198 | Some (' ' | '\n') -> ()
199 | _ -> Buffer.add_char buffer ' '
200 end;
201 Buffer.add_string buffer "*";
202 need_space := false;
203 Soup.children elem |> Soup.iter process_node;
204 Buffer.add_string buffer "*";
205 mark_space_needed ()
206 | "code" ->
207 (* Add space before if needed *)
208 if !need_space then begin
209 match last_char () with
210 | Some (' ' | '\n') -> ()
211 | _ -> Buffer.add_char buffer ' '
212 end;
213 Buffer.add_string buffer "`";
214 need_space := false;
215 Soup.children elem |> Soup.iter process_node;
216 Buffer.add_string buffer "`";
217 mark_space_needed ()
218 | "pre" ->
219 need_space := false;
220 Buffer.add_string buffer "\n```\n";
221 Soup.children elem |> Soup.iter process_node;
222 Buffer.add_string buffer "\n```\n\n";
223 need_space := false
224 | "a" ->
225 let text = Soup.texts elem |> String.concat " " |> String.trim in
226 let href = Soup.attribute "href" elem in
227 (match href with
228 | Some href ->
229 (* Add space before link if needed *)
230 if !need_space then begin
231 match last_char () with
232 | Some (' ' | '\n') -> ()
233 | _ -> Buffer.add_char buffer ' '
234 end;
235 need_space := false;
236
237 (* Add the link markdown *)
238 if text = "" then
239 Buffer.add_string buffer (Printf.sprintf "<%s>" href)
240 else
241 Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
242
243 (* Mark that space may be needed after link *)
244 mark_space_needed ()
245 | None ->
246 add_text text)
247 | "ul" | "ol" ->
248 need_space := false;
249 Buffer.add_string buffer "\n";
250 let is_ordered = tag = "ol" in
251 let items = Soup.children elem |> Soup.to_list in
252 List.iteri (fun i item ->
253 match Soup.element item with
254 | Some li when Soup.name li = "li" ->
255 need_space := false;
256 if is_ordered then
257 Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
258 else
259 Buffer.add_string buffer "- ";
260 Soup.children li |> Soup.iter process_node;
261 Buffer.add_string buffer "\n"
262 | _ -> ()
263 ) items;
264 Buffer.add_string buffer "\n";
265 need_space := false
266 | "blockquote" ->
267 need_space := false;
268 Buffer.add_string buffer "\n> ";
269 Soup.children elem |> Soup.iter process_node;
270 Buffer.add_string buffer "\n\n";
271 need_space := false
272 | "img" ->
273 (* Add space before if needed *)
274 if !need_space then begin
275 match last_char () with
276 | Some (' ' | '\n') -> ()
277 | _ -> Buffer.add_char buffer ' '
278 end;
279 let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
280 let src = Soup.attribute "src" elem |> Option.value ~default:"" in
281 Buffer.add_string buffer (Printf.sprintf "" alt src);
282 need_space := false;
283 mark_space_needed ()
284 | "hr" ->
285 need_space := false;
286 Buffer.add_string buffer "\n---\n\n";
287 need_space := false
288 (* Strip these tags but keep content *)
289 | "div" | "span" | "article" | "section" | "header" | "footer"
290 | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
291 Soup.children elem |> Soup.iter process_node
292 (* Ignore script, style, etc *)
293 | "script" | "style" | "noscript" -> ()
294 (* Default: just process children *)
295 | _ ->
296 Soup.children elem |> Soup.iter process_node)
297 | None ->
298 (* Text node - handle whitespace properly *)
299 match Soup.leaf_text node with
300 | Some text ->
301 (* If text is only whitespace, mark that we need space *)
302 let trimmed = String.trim text in
303 if trimmed = "" then begin
304 if has_whitespace text then
305 need_space := true
306 end else begin
307 (* Text has content - check if it had leading/trailing whitespace *)
308 let had_leading_ws = has_whitespace text &&
309 (String.length text > 0 &&
310 (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
311
312 (* If had leading whitespace, mark we need space *)
313 if had_leading_ws then need_space := true;
314
315 (* Add the text content *)
316 add_text trimmed;
317
318 (* If had trailing whitespace, mark we need space for next *)
319 let had_trailing_ws = has_whitespace text &&
320 (String.length text > 0 &&
321 let last = text.[String.length text - 1] in
322 last = ' ' || last = '\t' || last = '\n' || last = '\r') in
323 if had_trailing_ws then need_space := true
324 end
325 | None -> ()
326 in
327
328 Soup.children soup |> Soup.iter process_node;
329
330 (* Clean up the result *)
331 let result = Buffer.contents buffer in
332 cleanup_markdown result
333 with _ -> html_str
334
335(** Convert HTML content to clean Markdown *)
336let to_markdown html_str =
337 html_to_markdown html_str