My agentic slop goes here. Not intended for anyone else!
1(** HTML to Markdown converter using Lambda Soup *)
2
3(** Extract all links from HTML content *)
4let extract_links html_str =
5 try
6 let soup = Soup.parse html_str in
7 let links = Soup.select "a[href]" soup in
8 Soup.fold (fun acc link ->
9 match Soup.attribute "href" link with
10 | Some href ->
11 let text = Soup.texts link |> String.concat "" |> String.trim in
12 (href, text) :: acc
13 | None -> acc
14 ) [] links
15 |> List.rev
16 with _ -> []
17
18(** Check if string contains any whitespace *)
19let has_whitespace s =
20 try
21 let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
22 true
23 with Not_found -> false
24
25(** Clean up excessive newlines and normalize spacing *)
26let cleanup_markdown s =
27 (* Normalize line endings *)
28 let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
29
30 (* Remove trailing whitespace from each line *)
31 let lines = String.split_on_char '\n' s in
32 let lines = List.map (fun line ->
33 (* Trim trailing spaces but preserve leading spaces for indentation *)
34 let len = String.length line in
35 let rec find_last_non_space i =
36 if i < 0 then -1
37 else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
38 else i
39 in
40 let last = find_last_non_space (len - 1) in
41 if last < 0 then ""
42 else String.sub line 0 (last + 1)
43 ) lines in
44
45 (* Join back and collapse excessive blank lines *)
46 let s = String.concat "\n" lines in
47
48 (* Replace 3+ consecutive newlines with just 2 *)
49 let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
50
51 (* Trim leading and trailing whitespace *)
52 String.trim s
53
54(** Convert HTML to Markdown using state-based whitespace handling *)
55let html_to_markdown html_str =
56 try
57 let soup = Soup.parse html_str in
58 let buffer = Buffer.create 256 in
59
60 (* State: track if we need to insert a space before next text *)
61 let need_space = ref false in
62
63 (* Get last character in buffer, if any *)
64 let last_char () =
65 let len = Buffer.length buffer in
66 if len = 0 then None
67 else Some (Buffer.nth buffer (len - 1))
68 in
69
70 (* Add text with proper spacing *)
71 let add_text text =
72 let trimmed = String.trim text in
73 if trimmed <> "" then begin
74 (* Check if text starts with punctuation that shouldn't have space before it *)
75 let starts_with_punctuation =
76 String.length trimmed > 0 &&
77 (match trimmed.[0] with
78 | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
79 | _ -> false)
80 in
81
82 (* Add space if needed, unless we're before punctuation *)
83 if !need_space && not starts_with_punctuation then begin
84 match last_char () with
85 | Some (' ' | '\n') -> ()
86 | _ -> Buffer.add_char buffer ' '
87 end;
88 Buffer.add_string buffer trimmed;
89 need_space := false
90 end
91 in
92
93 (* Mark that we need space before next text (for inline elements) *)
94 let mark_space_needed () =
95 need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
96 in
97
98 (* Process header with ID/anchor handling *)
99 let process_header level elem =
100 need_space := false;
101
102 (* Check if header contains a link with an ID fragment *)
103 let link_opt = Soup.select_one "a[href]" elem in
104 let anchor_id = match link_opt with
105 | Some link ->
106 (match Soup.attribute "href" link with
107 | Some href ->
108 (* Extract fragment from URL *)
109 let uri = Uri.of_string href in
110 Uri.fragment uri
111 | None -> None)
112 | None -> None
113 in
114
115 (* Add anchor if we found an ID *)
116 (match anchor_id with
117 | Some id when id <> "" ->
118 Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id)
119 | _ -> ());
120
121 (* Add the header marker *)
122 let marker = String.make level '#' in
123 Buffer.add_string buffer ("\n" ^ marker ^ " ");
124
125 (* Get text content, excluding link tags *)
126 let text = Soup.texts elem |> String.concat " " |> String.trim in
127 Buffer.add_string buffer text;
128
129 Buffer.add_string buffer "\n\n";
130 need_space := false
131 in
132
133 let rec process_node node =
134 match Soup.element node with
135 | Some elem ->
136 let tag = Soup.name elem in
137 (match tag with
138 (* Block elements - reset space tracking *)
139 | "h1" -> process_header 1 elem
140 | "h2" -> process_header 2 elem
141 | "h3" -> process_header 3 elem
142 | "h4" -> process_header 4 elem
143 | "h5" -> process_header 5 elem
144 | "h6" -> process_header 6 elem
145 | "p" ->
146 need_space := false;
147 Soup.children elem |> Soup.iter process_node;
148 Buffer.add_string buffer "\n\n";
149 need_space := false
150 | "br" ->
151 Buffer.add_string buffer "\n";
152 need_space := false
153 (* Inline elements - preserve space tracking *)
154 | "strong" | "b" ->
155 (* Add space before if needed *)
156 if !need_space then begin
157 match last_char () with
158 | Some (' ' | '\n') -> ()
159 | _ -> Buffer.add_char buffer ' '
160 end;
161 Buffer.add_string buffer "**";
162 need_space := false;
163 Soup.children elem |> Soup.iter process_node;
164 Buffer.add_string buffer "**";
165 mark_space_needed ()
166 | "em" | "i" ->
167 (* Add space before if needed *)
168 if !need_space then begin
169 match last_char () with
170 | Some (' ' | '\n') -> ()
171 | _ -> Buffer.add_char buffer ' '
172 end;
173 Buffer.add_string buffer "*";
174 need_space := false;
175 Soup.children elem |> Soup.iter process_node;
176 Buffer.add_string buffer "*";
177 mark_space_needed ()
178 | "code" ->
179 (* Add space before if needed *)
180 if !need_space then begin
181 match last_char () with
182 | Some (' ' | '\n') -> ()
183 | _ -> Buffer.add_char buffer ' '
184 end;
185 Buffer.add_string buffer "`";
186 need_space := false;
187 Soup.children elem |> Soup.iter process_node;
188 Buffer.add_string buffer "`";
189 mark_space_needed ()
190 | "pre" ->
191 need_space := false;
192 Buffer.add_string buffer "\n```\n";
193 Soup.children elem |> Soup.iter process_node;
194 Buffer.add_string buffer "\n```\n\n";
195 need_space := false
196 | "a" ->
197 let text = Soup.texts elem |> String.concat " " |> String.trim in
198 let href = Soup.attribute "href" elem in
199 (match href with
200 | Some href ->
201 (* Add space before link if needed *)
202 if !need_space then begin
203 match last_char () with
204 | Some (' ' | '\n') -> ()
205 | _ -> Buffer.add_char buffer ' '
206 end;
207 need_space := false;
208
209 (* Add the link markdown *)
210 if text = "" then
211 Buffer.add_string buffer (Printf.sprintf "<%s>" href)
212 else
213 Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
214
215 (* Mark that space may be needed after link *)
216 mark_space_needed ()
217 | None ->
218 add_text text)
219 | "ul" | "ol" ->
220 need_space := false;
221 Buffer.add_string buffer "\n";
222 let is_ordered = tag = "ol" in
223 let items = Soup.children elem |> Soup.to_list in
224 List.iteri (fun i item ->
225 match Soup.element item with
226 | Some li when Soup.name li = "li" ->
227 need_space := false;
228 if is_ordered then
229 Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
230 else
231 Buffer.add_string buffer "- ";
232 Soup.children li |> Soup.iter process_node;
233 Buffer.add_string buffer "\n"
234 | _ -> ()
235 ) items;
236 Buffer.add_string buffer "\n";
237 need_space := false
238 | "blockquote" ->
239 need_space := false;
240 Buffer.add_string buffer "\n> ";
241 Soup.children elem |> Soup.iter process_node;
242 Buffer.add_string buffer "\n\n";
243 need_space := false
244 | "img" ->
245 (* Add space before if needed *)
246 if !need_space then begin
247 match last_char () with
248 | Some (' ' | '\n') -> ()
249 | _ -> Buffer.add_char buffer ' '
250 end;
251 let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
252 let src = Soup.attribute "src" elem |> Option.value ~default:"" in
253 Buffer.add_string buffer (Printf.sprintf "" alt src);
254 need_space := false;
255 mark_space_needed ()
256 | "hr" ->
257 need_space := false;
258 Buffer.add_string buffer "\n---\n\n";
259 need_space := false
260 (* Strip these tags but keep content *)
261 | "div" | "span" | "article" | "section" | "header" | "footer"
262 | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
263 Soup.children elem |> Soup.iter process_node
264 (* Ignore script, style, etc *)
265 | "script" | "style" | "noscript" -> ()
266 (* Default: just process children *)
267 | _ ->
268 Soup.children elem |> Soup.iter process_node)
269 | None ->
270 (* Text node - handle whitespace properly *)
271 match Soup.leaf_text node with
272 | Some text ->
273 (* If text is only whitespace, mark that we need space *)
274 let trimmed = String.trim text in
275 if trimmed = "" then begin
276 if has_whitespace text then
277 need_space := true
278 end else begin
279 (* Text has content - check if it had leading/trailing whitespace *)
280 let had_leading_ws = has_whitespace text &&
281 (String.length text > 0 &&
282 (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
283
284 (* If had leading whitespace, mark we need space *)
285 if had_leading_ws then need_space := true;
286
287 (* Add the text content *)
288 add_text trimmed;
289
290 (* If had trailing whitespace, mark we need space for next *)
291 let had_trailing_ws = has_whitespace text &&
292 (String.length text > 0 &&
293 let last = text.[String.length text - 1] in
294 last = ' ' || last = '\t' || last = '\n' || last = '\r') in
295 if had_trailing_ws then need_space := true
296 end
297 | None -> ()
298 in
299
300 Soup.children soup |> Soup.iter process_node;
301
302 (* Clean up the result *)
303 let result = Buffer.contents buffer in
304 cleanup_markdown result
305 with _ -> html_str
306
307(** Convert HTML content to clean Markdown *)
308let to_markdown html_str =
309 html_to_markdown html_str