My agentic slop goes here. Not intended for anyone else!
at main 13 kB view raw
1(* 2 * Copyright (c) 2014, OCaml.org project 3 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 *) 17 18(** Internal utility for HTML to Markdown conversion *) 19 20[@@@warning "-32"] (* Suppress unused value warnings for internal utilities *) 21 22(** HTML to Markdown converter using Lambda Soup *) 23 24(** Extract all links from HTML content *) 25let extract_links html_str = 26 try 27 let soup = Soup.parse html_str in 28 let links = Soup.select "a[href]" soup in 29 Soup.fold (fun acc link -> 30 match Soup.attribute "href" link with 31 | Some href -> 32 (* Filter out local anchors and only include absolute external URLs *) 33 let uri = Uri.of_string href in 34 let is_absolute = Uri.scheme uri <> None in 35 let is_local_anchor = String.starts_with ~prefix:"#" href in 36 if is_absolute && not is_local_anchor then 37 let text = Soup.texts link |> String.concat "" |> String.trim in 38 (href, text) :: acc 39 else 40 acc 41 | None -> acc 42 ) [] links 43 |> List.rev 44 with _ -> [] 45 46(** Check if string contains any whitespace *) 47let has_whitespace s = 48 try 49 let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in 50 true 51 with Not_found -> false 52 53(** Clean up excessive newlines and normalize spacing *) 54let cleanup_markdown s = 55 (* Normalize line endings *) 56 let s = Str.global_replace (Str.regexp "\r\n") "\n" s in 57 58 (* Remove trailing whitespace from each line *) 59 let lines = String.split_on_char '\n' s in 60 let lines = List.map (fun line -> 61 (* Trim trailing spaces but preserve leading spaces for indentation *) 62 let len = String.length line in 63 let rec find_last_non_space i = 64 if i < 0 then -1 65 else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1) 66 else i 67 in 68 let last = find_last_non_space (len - 1) in 69 if last < 0 then "" 70 else String.sub line 0 (last + 1) 71 ) lines in 72 73 (* Join back and collapse excessive blank lines *) 74 let s = String.concat "\n" lines in 75 76 (* Replace 3+ consecutive newlines with just 2 *) 77 let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in 78 79 (* Trim leading and trailing whitespace *) 80 String.trim s 81 82(** Convert HTML to Markdown using state-based whitespace handling *) 83let html_to_markdown html_str = 84 try 85 let soup = Soup.parse html_str in 86 let buffer = Buffer.create 256 in 87 88 (* State: track if we need to insert a space before next text *) 89 let need_space = ref false in 90 91 (* Get last character in buffer, if any *) 92 let last_char () = 93 let len = Buffer.length buffer in 94 if len = 0 then None 95 else Some (Buffer.nth buffer (len - 1)) 96 in 97 98 (* Add text with proper spacing *) 99 let add_text text = 100 let trimmed = String.trim text in 101 if trimmed <> "" then begin 102 (* Check if text starts with punctuation that shouldn't have space before it *) 103 let starts_with_punctuation = 104 String.length trimmed > 0 && 105 (match trimmed.[0] with 106 | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true 107 | _ -> false) 108 in 109 110 (* Add space if needed, unless we're before punctuation *) 111 if !need_space && not starts_with_punctuation then begin 112 match last_char () with 113 | Some (' ' | '\n') -> () 114 | _ -> Buffer.add_char buffer ' ' 115 end; 116 Buffer.add_string buffer trimmed; 117 need_space := false 118 end 119 in 120 121 (* Mark that we need space before next text (for inline elements) *) 122 let mark_space_needed () = 123 need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0 124 in 125 126 (* Process header with ID/anchor handling *) 127 let process_header level elem = 128 need_space := false; 129 130 (* Check if header contains a link with an ID fragment *) 131 let link_opt = Soup.select_one "a[href]" elem in 132 let anchor_id = match link_opt with 133 | Some link -> 134 (match Soup.attribute "href" link with 135 | Some href -> 136 (* Extract fragment from URL *) 137 let uri = Uri.of_string href in 138 Uri.fragment uri 139 | None -> None) 140 | None -> None 141 in 142 143 (* Add anchor if we found an ID *) 144 (match anchor_id with 145 | Some id when id <> "" -> 146 Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id) 147 | _ -> ()); 148 149 (* Add the header marker *) 150 let marker = String.make level '#' in 151 Buffer.add_string buffer ("\n" ^ marker ^ " "); 152 153 (* Get text content, excluding link tags *) 154 let text = Soup.texts elem |> String.concat " " |> String.trim in 155 Buffer.add_string buffer text; 156 157 Buffer.add_string buffer "\n\n"; 158 need_space := false 159 in 160 161 let rec process_node node = 162 match Soup.element node with 163 | Some elem -> 164 let tag = Soup.name elem in 165 (match tag with 166 (* Block elements - reset space tracking *) 167 | "h1" -> process_header 1 elem 168 | "h2" -> process_header 2 elem 169 | "h3" -> process_header 3 elem 170 | "h4" -> process_header 4 elem 171 | "h5" -> process_header 5 elem 172 | "h6" -> process_header 6 elem 173 | "p" -> 174 need_space := false; 175 Soup.children elem |> Soup.iter process_node; 176 Buffer.add_string buffer "\n\n"; 177 need_space := false 178 | "br" -> 179 Buffer.add_string buffer "\n"; 180 need_space := false 181 (* Inline elements - preserve space tracking *) 182 | "strong" | "b" -> 183 (* Add space before if needed *) 184 if !need_space then begin 185 match last_char () with 186 | Some (' ' | '\n') -> () 187 | _ -> Buffer.add_char buffer ' ' 188 end; 189 Buffer.add_string buffer "**"; 190 need_space := false; 191 Soup.children elem |> Soup.iter process_node; 192 Buffer.add_string buffer "**"; 193 mark_space_needed () 194 | "em" | "i" -> 195 (* Add space before if needed *) 196 if !need_space then begin 197 match last_char () with 198 | Some (' ' | '\n') -> () 199 | _ -> Buffer.add_char buffer ' ' 200 end; 201 Buffer.add_string buffer "*"; 202 need_space := false; 203 Soup.children elem |> Soup.iter process_node; 204 Buffer.add_string buffer "*"; 205 mark_space_needed () 206 | "code" -> 207 (* Add space before if needed *) 208 if !need_space then begin 209 match last_char () with 210 | Some (' ' | '\n') -> () 211 | _ -> Buffer.add_char buffer ' ' 212 end; 213 Buffer.add_string buffer "`"; 214 need_space := false; 215 Soup.children elem |> Soup.iter process_node; 216 Buffer.add_string buffer "`"; 217 mark_space_needed () 218 | "pre" -> 219 need_space := false; 220 Buffer.add_string buffer "\n```\n"; 221 Soup.children elem |> Soup.iter process_node; 222 Buffer.add_string buffer "\n```\n\n"; 223 need_space := false 224 | "a" -> 225 let text = Soup.texts elem |> String.concat " " |> String.trim in 226 let href = Soup.attribute "href" elem in 227 (match href with 228 | Some href -> 229 (* Add space before link if needed *) 230 if !need_space then begin 231 match last_char () with 232 | Some (' ' | '\n') -> () 233 | _ -> Buffer.add_char buffer ' ' 234 end; 235 need_space := false; 236 237 (* Add the link markdown *) 238 if text = "" then 239 Buffer.add_string buffer (Printf.sprintf "<%s>" href) 240 else 241 Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href); 242 243 (* Mark that space may be needed after link *) 244 mark_space_needed () 245 | None -> 246 add_text text) 247 | "ul" | "ol" -> 248 need_space := false; 249 Buffer.add_string buffer "\n"; 250 let is_ordered = tag = "ol" in 251 let items = Soup.children elem |> Soup.to_list in 252 List.iteri (fun i item -> 253 match Soup.element item with 254 | Some li when Soup.name li = "li" -> 255 need_space := false; 256 if is_ordered then 257 Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1)) 258 else 259 Buffer.add_string buffer "- "; 260 Soup.children li |> Soup.iter process_node; 261 Buffer.add_string buffer "\n" 262 | _ -> () 263 ) items; 264 Buffer.add_string buffer "\n"; 265 need_space := false 266 | "blockquote" -> 267 need_space := false; 268 Buffer.add_string buffer "\n> "; 269 Soup.children elem |> Soup.iter process_node; 270 Buffer.add_string buffer "\n\n"; 271 need_space := false 272 | "img" -> 273 (* Add space before if needed *) 274 if !need_space then begin 275 match last_char () with 276 | Some (' ' | '\n') -> () 277 | _ -> Buffer.add_char buffer ' ' 278 end; 279 let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in 280 let src = Soup.attribute "src" elem |> Option.value ~default:"" in 281 Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src); 282 need_space := false; 283 mark_space_needed () 284 | "hr" -> 285 need_space := false; 286 Buffer.add_string buffer "\n---\n\n"; 287 need_space := false 288 (* Strip these tags but keep content *) 289 | "div" | "span" | "article" | "section" | "header" | "footer" 290 | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" -> 291 Soup.children elem |> Soup.iter process_node 292 (* Ignore script, style, etc *) 293 | "script" | "style" | "noscript" -> () 294 (* Default: just process children *) 295 | _ -> 296 Soup.children elem |> Soup.iter process_node) 297 | None -> 298 (* Text node - handle whitespace properly *) 299 match Soup.leaf_text node with 300 | Some text -> 301 (* If text is only whitespace, mark that we need space *) 302 let trimmed = String.trim text in 303 if trimmed = "" then begin 304 if has_whitespace text then 305 need_space := true 306 end else begin 307 (* Text has content - check if it had leading/trailing whitespace *) 308 let had_leading_ws = has_whitespace text && 309 (String.length text > 0 && 310 (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in 311 312 (* If had leading whitespace, mark we need space *) 313 if had_leading_ws then need_space := true; 314 315 (* Add the text content *) 316 add_text trimmed; 317 318 (* If had trailing whitespace, mark we need space for next *) 319 let had_trailing_ws = has_whitespace text && 320 (String.length text > 0 && 321 let last = text.[String.length text - 1] in 322 last = ' ' || last = '\t' || last = '\n' || last = '\r') in 323 if had_trailing_ws then need_space := true 324 end 325 | None -> () 326 in 327 328 Soup.children soup |> Soup.iter process_node; 329 330 (* Clean up the result *) 331 let result = Buffer.contents buffer in 332 cleanup_markdown result 333 with _ -> html_str 334 335(** Convert HTML content to clean Markdown *) 336let to_markdown html_str = 337 html_to_markdown html_str