My agentic slop goes here. Not intended for anyone else!
1(** HTML to Markdown converter using Lambda Soup *) 2 3(** Extract all links from HTML content *) 4let extract_links html_str = 5 try 6 let soup = Soup.parse html_str in 7 let links = Soup.select "a[href]" soup in 8 Soup.fold (fun acc link -> 9 match Soup.attribute "href" link with 10 | Some href -> 11 let text = Soup.texts link |> String.concat "" |> String.trim in 12 (href, text) :: acc 13 | None -> acc 14 ) [] links 15 |> List.rev 16 with _ -> [] 17 18(** Check if string contains any whitespace *) 19let has_whitespace s = 20 try 21 let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in 22 true 23 with Not_found -> false 24 25(** Clean up excessive newlines and normalize spacing *) 26let cleanup_markdown s = 27 (* Normalize line endings *) 28 let s = Str.global_replace (Str.regexp "\r\n") "\n" s in 29 30 (* Remove trailing whitespace from each line *) 31 let lines = String.split_on_char '\n' s in 32 let lines = List.map (fun line -> 33 (* Trim trailing spaces but preserve leading spaces for indentation *) 34 let len = String.length line in 35 let rec find_last_non_space i = 36 if i < 0 then -1 37 else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1) 38 else i 39 in 40 let last = find_last_non_space (len - 1) in 41 if last < 0 then "" 42 else String.sub line 0 (last + 1) 43 ) lines in 44 45 (* Join back and collapse excessive blank lines *) 46 let s = String.concat "\n" lines in 47 48 (* Replace 3+ consecutive newlines with just 2 *) 49 let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in 50 51 (* Trim leading and trailing whitespace *) 52 String.trim s 53 54(** Convert HTML to Markdown using state-based whitespace handling *) 55let html_to_markdown html_str = 56 try 57 let soup = Soup.parse html_str in 58 let buffer = Buffer.create 256 in 59 60 (* State: track if we need to insert a space before next text *) 61 let need_space = ref false in 62 63 (* Get last character in buffer, if any *) 64 let last_char () = 65 let len = Buffer.length buffer in 66 if len = 0 then None 67 else Some (Buffer.nth buffer (len - 1)) 68 in 69 70 (* Add text with proper spacing *) 71 let add_text text = 72 let trimmed = String.trim text in 73 if trimmed <> "" then begin 74 (* Check if text starts with punctuation that shouldn't have space before it *) 75 let starts_with_punctuation = 76 String.length trimmed > 0 && 77 (match trimmed.[0] with 78 | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true 79 | _ -> false) 80 in 81 82 (* Add space if needed, unless we're before punctuation *) 83 if !need_space && not starts_with_punctuation then begin 84 match last_char () with 85 | Some (' ' | '\n') -> () 86 | _ -> Buffer.add_char buffer ' ' 87 end; 88 Buffer.add_string buffer trimmed; 89 need_space := false 90 end 91 in 92 93 (* Mark that we need space before next text (for inline elements) *) 94 let mark_space_needed () = 95 need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0 96 in 97 98 (* Process header with ID/anchor handling *) 99 let process_header level elem = 100 need_space := false; 101 102 (* Check if header contains a link with an ID fragment *) 103 let link_opt = Soup.select_one "a[href]" elem in 104 let anchor_id = match link_opt with 105 | Some link -> 106 (match Soup.attribute "href" link with 107 | Some href -> 108 (* Extract fragment from URL *) 109 let uri = Uri.of_string href in 110 Uri.fragment uri 111 | None -> None) 112 | None -> None 113 in 114 115 (* Add anchor if we found an ID *) 116 (match anchor_id with 117 | Some id when id <> "" -> 118 Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id) 119 | _ -> ()); 120 121 (* Add the header marker *) 122 let marker = String.make level '#' in 123 Buffer.add_string buffer ("\n" ^ marker ^ " "); 124 125 (* Get text content, excluding link tags *) 126 let text = Soup.texts elem |> String.concat " " |> String.trim in 127 Buffer.add_string buffer text; 128 129 Buffer.add_string buffer "\n\n"; 130 need_space := false 131 in 132 133 let rec process_node node = 134 match Soup.element node with 135 | Some elem -> 136 let tag = Soup.name elem in 137 (match tag with 138 (* Block elements - reset space tracking *) 139 | "h1" -> process_header 1 elem 140 | "h2" -> process_header 2 elem 141 | "h3" -> process_header 3 elem 142 | "h4" -> process_header 4 elem 143 | "h5" -> process_header 5 elem 144 | "h6" -> process_header 6 elem 145 | "p" -> 146 need_space := false; 147 Soup.children elem |> Soup.iter process_node; 148 Buffer.add_string buffer "\n\n"; 149 need_space := false 150 | "br" -> 151 Buffer.add_string buffer "\n"; 152 need_space := false 153 (* Inline elements - preserve space tracking *) 154 | "strong" | "b" -> 155 (* Add space before if needed *) 156 if !need_space then begin 157 match last_char () with 158 | Some (' ' | '\n') -> () 159 | _ -> Buffer.add_char buffer ' ' 160 end; 161 Buffer.add_string buffer "**"; 162 need_space := false; 163 Soup.children elem |> Soup.iter process_node; 164 Buffer.add_string buffer "**"; 165 mark_space_needed () 166 | "em" | "i" -> 167 (* Add space before if needed *) 168 if !need_space then begin 169 match last_char () with 170 | Some (' ' | '\n') -> () 171 | _ -> Buffer.add_char buffer ' ' 172 end; 173 Buffer.add_string buffer "*"; 174 need_space := false; 175 Soup.children elem |> Soup.iter process_node; 176 Buffer.add_string buffer "*"; 177 mark_space_needed () 178 | "code" -> 179 (* Add space before if needed *) 180 if !need_space then begin 181 match last_char () with 182 | Some (' ' | '\n') -> () 183 | _ -> Buffer.add_char buffer ' ' 184 end; 185 Buffer.add_string buffer "`"; 186 need_space := false; 187 Soup.children elem |> Soup.iter process_node; 188 Buffer.add_string buffer "`"; 189 mark_space_needed () 190 | "pre" -> 191 need_space := false; 192 Buffer.add_string buffer "\n```\n"; 193 Soup.children elem |> Soup.iter process_node; 194 Buffer.add_string buffer "\n```\n\n"; 195 need_space := false 196 | "a" -> 197 let text = Soup.texts elem |> String.concat " " |> String.trim in 198 let href = Soup.attribute "href" elem in 199 (match href with 200 | Some href -> 201 (* Add space before link if needed *) 202 if !need_space then begin 203 match last_char () with 204 | Some (' ' | '\n') -> () 205 | _ -> Buffer.add_char buffer ' ' 206 end; 207 need_space := false; 208 209 (* Add the link markdown *) 210 if text = "" then 211 Buffer.add_string buffer (Printf.sprintf "<%s>" href) 212 else 213 Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href); 214 215 (* Mark that space may be needed after link *) 216 mark_space_needed () 217 | None -> 218 add_text text) 219 | "ul" | "ol" -> 220 need_space := false; 221 Buffer.add_string buffer "\n"; 222 let is_ordered = tag = "ol" in 223 let items = Soup.children elem |> Soup.to_list in 224 List.iteri (fun i item -> 225 match Soup.element item with 226 | Some li when Soup.name li = "li" -> 227 need_space := false; 228 if is_ordered then 229 Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1)) 230 else 231 Buffer.add_string buffer "- "; 232 Soup.children li |> Soup.iter process_node; 233 Buffer.add_string buffer "\n" 234 | _ -> () 235 ) items; 236 Buffer.add_string buffer "\n"; 237 need_space := false 238 | "blockquote" -> 239 need_space := false; 240 Buffer.add_string buffer "\n> "; 241 Soup.children elem |> Soup.iter process_node; 242 Buffer.add_string buffer "\n\n"; 243 need_space := false 244 | "img" -> 245 (* Add space before if needed *) 246 if !need_space then begin 247 match last_char () with 248 | Some (' ' | '\n') -> () 249 | _ -> Buffer.add_char buffer ' ' 250 end; 251 let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in 252 let src = Soup.attribute "src" elem |> Option.value ~default:"" in 253 Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src); 254 need_space := false; 255 mark_space_needed () 256 | "hr" -> 257 need_space := false; 258 Buffer.add_string buffer "\n---\n\n"; 259 need_space := false 260 (* Strip these tags but keep content *) 261 | "div" | "span" | "article" | "section" | "header" | "footer" 262 | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" -> 263 Soup.children elem |> Soup.iter process_node 264 (* Ignore script, style, etc *) 265 | "script" | "style" | "noscript" -> () 266 (* Default: just process children *) 267 | _ -> 268 Soup.children elem |> Soup.iter process_node) 269 | None -> 270 (* Text node - handle whitespace properly *) 271 match Soup.leaf_text node with 272 | Some text -> 273 (* If text is only whitespace, mark that we need space *) 274 let trimmed = String.trim text in 275 if trimmed = "" then begin 276 if has_whitespace text then 277 need_space := true 278 end else begin 279 (* Text has content - check if it had leading/trailing whitespace *) 280 let had_leading_ws = has_whitespace text && 281 (String.length text > 0 && 282 (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in 283 284 (* If had leading whitespace, mark we need space *) 285 if had_leading_ws then need_space := true; 286 287 (* Add the text content *) 288 add_text trimmed; 289 290 (* If had trailing whitespace, mark we need space for next *) 291 let had_trailing_ws = has_whitespace text && 292 (String.length text > 0 && 293 let last = text.[String.length text - 1] in 294 last = ' ' || last = '\t' || last = '\n' || last = '\r') in 295 if had_trailing_ws then need_space := true 296 end 297 | None -> () 298 in 299 300 Soup.children soup |> Soup.iter process_node; 301 302 (* Clean up the result *) 303 let result = Buffer.contents buffer in 304 cleanup_markdown result 305 with _ -> html_str 306 307(** Convert HTML content to clean Markdown *) 308let to_markdown html_str = 309 html_to_markdown html_str