stack/river/lib/html_markdown.ml at main · anil.recoil.org/slop

anil.recoil.org / slop
My agentic slop goes here. Not intended for anyone else!
slop / stack / river / lib / html_markdown.ml
at main 13 kB view raw
  1(*
  2 * Copyright (c) 2014, OCaml.org project
  3 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
  4 *
  5 * Permission to use, copy, modify, and distribute this software for any
  6 * purpose with or without fee is hereby granted, provided that the above
  7 * copyright notice and this permission notice appear in all copies.
  8 *
  9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 16 *)
 17
 18(** Internal utility for HTML to Markdown conversion *)
 19
 20[@@@warning "-32"]  (* Suppress unused value warnings for internal utilities *)
 21
 22(** HTML to Markdown converter using Lambda Soup *)
 23
 24(** Extract all links from HTML content *)
 25let extract_links html_str =
 26  try
 27    let soup = Soup.parse html_str in
 28    let links = Soup.select "a[href]" soup in
 29    Soup.fold (fun acc link ->
 30      match Soup.attribute "href" link with
 31      | Some href ->
 32          (* Filter out local anchors and only include absolute external URLs *)
 33          let uri = Uri.of_string href in
 34          let is_absolute = Uri.scheme uri <> None in
 35          let is_local_anchor = String.starts_with ~prefix:"#" href in
 36          if is_absolute && not is_local_anchor then
 37            let text = Soup.texts link |> String.concat "" |> String.trim in
 38            (href, text) :: acc
 39          else
 40            acc
 41      | None -> acc
 42    ) [] links
 43    |> List.rev
 44  with _ -> []
 45
 46(** Check if string contains any whitespace *)
 47let has_whitespace s =
 48  try
 49    let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
 50    true
 51  with Not_found -> false
 52
 53(** Clean up excessive newlines and normalize spacing *)
 54let cleanup_markdown s =
 55  (* Normalize line endings *)
 56  let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
 57
 58  (* Remove trailing whitespace from each line *)
 59  let lines = String.split_on_char '\n' s in
 60  let lines = List.map (fun line ->
 61    (* Trim trailing spaces but preserve leading spaces for indentation *)
 62    let len = String.length line in
 63    let rec find_last_non_space i =
 64      if i < 0 then -1
 65      else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
 66      else i
 67    in
 68    let last = find_last_non_space (len - 1) in
 69    if last < 0 then ""
 70    else String.sub line 0 (last + 1)
 71  ) lines in
 72
 73  (* Join back and collapse excessive blank lines *)
 74  let s = String.concat "\n" lines in
 75
 76  (* Replace 3+ consecutive newlines with just 2 *)
 77  let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
 78
 79  (* Trim leading and trailing whitespace *)
 80  String.trim s
 81
 82(** Convert HTML to Markdown using state-based whitespace handling *)
 83let html_to_markdown html_str =
 84  try
 85    let soup = Soup.parse html_str in
 86    let buffer = Buffer.create 256 in
 87
 88    (* State: track if we need to insert a space before next text *)
 89    let need_space = ref false in
 90
 91    (* Get last character in buffer, if any *)
 92    let last_char () =
 93      let len = Buffer.length buffer in
 94      if len = 0 then None
 95      else Some (Buffer.nth buffer (len - 1))
 96    in
 97
 98    (* Add text with proper spacing *)
 99    let add_text text =
100      let trimmed = String.trim text in
101      if trimmed <> "" then begin
102        (* Check if text starts with punctuation that shouldn't have space before it *)
103        let starts_with_punctuation =
104          String.length trimmed > 0 &&
105          (match trimmed.[0] with
106          | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
107          | _ -> false)
108        in
109
110        (* Add space if needed, unless we're before punctuation *)
111        if !need_space && not starts_with_punctuation then begin
112          match last_char () with
113          | Some (' ' | '\n') -> ()
114          | _ -> Buffer.add_char buffer ' '
115        end;
116        Buffer.add_string buffer trimmed;
117        need_space := false
118      end
119    in
120
121    (* Mark that we need space before next text (for inline elements) *)
122    let mark_space_needed () =
123      need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
124    in
125
126    (* Process header with ID/anchor handling *)
127    let process_header level elem =
128      need_space := false;
129
130      (* Check if header contains a link with an ID fragment *)
131      let link_opt = Soup.select_one "a[href]" elem in
132      let anchor_id = match link_opt with
133        | Some link ->
134            (match Soup.attribute "href" link with
135            | Some href ->
136                (* Extract fragment from URL *)
137                let uri = Uri.of_string href in
138                Uri.fragment uri
139            | None -> None)
140        | None -> None
141      in
142
143      (* Add anchor if we found an ID *)
144      (match anchor_id with
145      | Some id when id <> "" ->
146          Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id)
147      | _ -> ());
148
149      (* Add the header marker *)
150      let marker = String.make level '#' in
151      Buffer.add_string buffer ("\n" ^ marker ^ " ");
152
153      (* Get text content, excluding link tags *)
154      let text = Soup.texts elem |> String.concat " " |> String.trim in
155      Buffer.add_string buffer text;
156
157      Buffer.add_string buffer "\n\n";
158      need_space := false
159    in
160
161    let rec process_node node =
162      match Soup.element node with
163      | Some elem ->
164          let tag = Soup.name elem in
165          (match tag with
166          (* Block elements - reset space tracking *)
167          | "h1" -> process_header 1 elem
168          | "h2" -> process_header 2 elem
169          | "h3" -> process_header 3 elem
170          | "h4" -> process_header 4 elem
171          | "h5" -> process_header 5 elem
172          | "h6" -> process_header 6 elem
173          | "p" ->
174              need_space := false;
175              Soup.children elem |> Soup.iter process_node;
176              Buffer.add_string buffer "\n\n";
177              need_space := false
178          | "br" ->
179              Buffer.add_string buffer "\n";
180              need_space := false
181          (* Inline elements - preserve space tracking *)
182          | "strong" | "b" ->
183              (* Add space before if needed *)
184              if !need_space then begin
185                match last_char () with
186                | Some (' ' | '\n') -> ()
187                | _ -> Buffer.add_char buffer ' '
188              end;
189              Buffer.add_string buffer "**";
190              need_space := false;
191              Soup.children elem |> Soup.iter process_node;
192              Buffer.add_string buffer "**";
193              mark_space_needed ()
194          | "em" | "i" ->
195              (* Add space before if needed *)
196              if !need_space then begin
197                match last_char () with
198                | Some (' ' | '\n') -> ()
199                | _ -> Buffer.add_char buffer ' '
200              end;
201              Buffer.add_string buffer "*";
202              need_space := false;
203              Soup.children elem |> Soup.iter process_node;
204              Buffer.add_string buffer "*";
205              mark_space_needed ()
206          | "code" ->
207              (* Add space before if needed *)
208              if !need_space then begin
209                match last_char () with
210                | Some (' ' | '\n') -> ()
211                | _ -> Buffer.add_char buffer ' '
212              end;
213              Buffer.add_string buffer "`";
214              need_space := false;
215              Soup.children elem |> Soup.iter process_node;
216              Buffer.add_string buffer "`";
217              mark_space_needed ()
218          | "pre" ->
219              need_space := false;
220              Buffer.add_string buffer "\n```\n";
221              Soup.children elem |> Soup.iter process_node;
222              Buffer.add_string buffer "\n```\n\n";
223              need_space := false
224          | "a" ->
225              let text = Soup.texts elem |> String.concat " " |> String.trim in
226              let href = Soup.attribute "href" elem in
227              (match href with
228              | Some href ->
229                  (* Add space before link if needed *)
230                  if !need_space then begin
231                    match last_char () with
232                    | Some (' ' | '\n') -> ()
233                    | _ -> Buffer.add_char buffer ' '
234                  end;
235                  need_space := false;
236
237                  (* Add the link markdown *)
238                  if text = "" then
239                    Buffer.add_string buffer (Printf.sprintf "<%s>" href)
240                  else
241                    Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
242
243                  (* Mark that space may be needed after link *)
244                  mark_space_needed ()
245              | None ->
246                  add_text text)
247          | "ul" | "ol" ->
248              need_space := false;
249              Buffer.add_string buffer "\n";
250              let is_ordered = tag = "ol" in
251              let items = Soup.children elem |> Soup.to_list in
252              List.iteri (fun i item ->
253                match Soup.element item with
254                | Some li when Soup.name li = "li" ->
255                    need_space := false;
256                    if is_ordered then
257                      Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
258                    else
259                      Buffer.add_string buffer "- ";
260                    Soup.children li |> Soup.iter process_node;
261                    Buffer.add_string buffer "\n"
262                | _ -> ()
263              ) items;
264              Buffer.add_string buffer "\n";
265              need_space := false
266          | "blockquote" ->
267              need_space := false;
268              Buffer.add_string buffer "\n> ";
269              Soup.children elem |> Soup.iter process_node;
270              Buffer.add_string buffer "\n\n";
271              need_space := false
272          | "img" ->
273              (* Add space before if needed *)
274              if !need_space then begin
275                match last_char () with
276                | Some (' ' | '\n') -> ()
277                | _ -> Buffer.add_char buffer ' '
278              end;
279              let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
280              let src = Soup.attribute "src" elem |> Option.value ~default:"" in
281              Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src);
282              need_space := false;
283              mark_space_needed ()
284          | "hr" ->
285              need_space := false;
286              Buffer.add_string buffer "\n---\n\n";
287              need_space := false
288          (* Strip these tags but keep content *)
289          | "div" | "span" | "article" | "section" | "header" | "footer"
290          | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
291              Soup.children elem |> Soup.iter process_node
292          (* Ignore script, style, etc *)
293          | "script" | "style" | "noscript" -> ()
294          (* Default: just process children *)
295          | _ ->
296              Soup.children elem |> Soup.iter process_node)
297      | None ->
298          (* Text node - handle whitespace properly *)
299          match Soup.leaf_text node with
300          | Some text ->
301              (* If text is only whitespace, mark that we need space *)
302              let trimmed = String.trim text in
303              if trimmed = "" then begin
304                if has_whitespace text then
305                  need_space := true
306              end else begin
307                (* Text has content - check if it had leading/trailing whitespace *)
308                let had_leading_ws = has_whitespace text &&
309                  (String.length text > 0 &&
310                   (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
311
312                (* If had leading whitespace, mark we need space *)
313                if had_leading_ws then need_space := true;
314
315                (* Add the text content *)
316                add_text trimmed;
317
318                (* If had trailing whitespace, mark we need space for next *)
319                let had_trailing_ws = has_whitespace text &&
320                  (String.length text > 0 &&
321                   let last = text.[String.length text - 1] in
322                   last = ' ' || last = '\t' || last = '\n' || last = '\r') in
323                if had_trailing_ws then need_space := true
324              end
325          | None -> ()
326    in
327
328    Soup.children soup |> Soup.iter process_node;
329
330    (* Clean up the result *)
331    let result = Buffer.contents buffer in
332    cleanup_markdown result
333  with _ -> html_str
334
335(** Convert HTML content to clean Markdown *)
336let to_markdown html_str =
337  html_to_markdown html_str