commit a5e648ac42c4cb5f9bef33a90566988a9831af94 · anil.recoil.org/odoc-mcp

CLAUDE.md

···

       1
       1
       +
       I wish to turn JSON files output by odoc-driver (an OCaml documentation generator) into succinct Markdown that is a     │

     

       2
       2
       +
       good input to a coding model such as you. Look at                                                                       │

     

       3
       3
       +
       _html/mirage-crypto/mirage-crypto/Mirage_crypto/DES/CTR/index.html.json as one such example, with more being in _html/  │

     

       4
       4
       +
       but be aware there are thousands of files. Write me an odoc2llm.py Python script that uses Beautiful Soup and JSON       │

     

       5
       5
       +
       parsing to crunch up just the relevant signatures and crosslinks into a _single_ markdown file from the _html           │

     

       6
       6
       +
       directory

+471

odoc2llm.py

···

       1
       1
       +
       #!/usr/bin/env python3

     

       2
       2
       +
       # /// script

     

       3
       3
       +
       # requires-python = ">=3.11"

     

       4
       4
       +
       # dependencies = [

     

       5
       5
       +
       #   "bs4",

     

       6
       6
       +
       # ]

     

       7
       7
       +
       # ///

     

       8
       8
       +
       """

     

       9
       9
       +
       odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

     

       10
       10
       +
       

     

       11
       11
       +
       This script processes JSON files generated by odoc-driver (OCaml documentation generator)

     

       12
       12
       +
       and produces a single Markdown file with the essential module structure and signatures

     

       13
       13
       +
       formatted in a way that makes it useful for LLMs to reason about OCaml codebases.

     

       14
       14
       +
       """

     

       15
       15
       +
       

     

       16
       16
       +
       import os

     

       17
       17
       +
       import sys

     

       18
       18
       +
       import json

     

       19
       19
       +
       import re

     

       20
       20
       +
       from bs4 import BeautifulSoup

     

       21
       21
       +
       from collections import defaultdict

     

       22
       22
       +
       import argparse

     

       23
       23
       +
       from pathlib import Path

     

       24
       24
       +
       import html

     

       25
       25
       +
       

     

       26
       26
       +
       

     

       27
       27
       +
       # Page-kind prefixes that may start an odoc header. Longer alternatives come
       # first so that "Module type X" is not consumed as "Module" + "type X".
       _HEADER_KIND_RE = re.compile(r"(Module type|Class type|Module|Class)\b\s*")


       def extract_module_info(json_content):
           """Extract module information from odoc JSON content.

           Args:
               json_content: Text of one odoc JSON page. The fields read here are
                   ``header``, ``breadcrumbs``, ``content`` and ``preamble``.

           Returns:
               A dict with keys ``name`` (str), ``type`` (the header prefix, or
               ``"Module"`` when the header has no recognised prefix),
               ``breadcrumbs`` (list of cleaned breadcrumb strings), ``content``
               (the page body parsed into a BeautifulSoup tree) and ``preamble``
               (the raw preamble string, ``""`` when absent).

           Raises:
               ValueError: If the text is not valid JSON (``json.JSONDecodeError``
                   is a ``ValueError``) or its top level is not a JSON object.
           """
           data = json.loads(json_content)
           if not isinstance(data, dict):
               raise ValueError("odoc JSON page must be an object at the top level")

           # `or ""` also covers fields that are present but null.
           header_soup = BeautifulSoup(data.get("header") or "", "html.parser")
           header_text = header_soup.get_text().strip()

           # Only a prefix of the header decides the kind; a substring test would
           # mislabel a module whose own name merely contains "Class".
           kind_match = _HEADER_KIND_RE.match(header_text)
           module_type = kind_match.group(1) if kind_match else "Module"

           # Prefer the <code> element for the name; otherwise use the header text
           # with the kind prefix removed.
           code_tag = header_soup.find("code")
           if code_tag:
               module_name = code_tag.get_text().strip()
           elif kind_match:
               module_name = header_text[kind_match.end():]
           else:
               module_name = header_text

           breadcrumbs = []
           for crumb in data.get("breadcrumbs") or []:
               name = crumb.get("name", "")
               if not name:
                   continue
               clean_name = BeautifulSoup(name, "html.parser").get_text().strip()
               # Drop the backticks around a quoted library name.
               clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
               breadcrumbs.append(clean_name)

           content_soup = BeautifulSoup(data.get("content") or "", "html.parser")

           return {
               "name": module_name,
               "type": module_type,
               "breadcrumbs": breadcrumbs,
               "content": content_soup,
               "preamble": data.get("preamble") or "",
           }

     

       74
       74
       +
       

     

       75
       75
       +
       

     

       76
       76
       +
       def clean_signature_text(text):
           """Normalise typographic characters and whitespace runs in *text*.

           Four characters are rewritten: an invisible joiner is removed, two
           dash-like characters become ``-`` and the arrow becomes ``->``. After
           that, every run of two or more whitespace characters is replaced with a
           single space; an isolated whitespace character (including a lone
           newline) is left unchanged.
           """
           substitutions = (
               ('⁠', ''),
               ('−', '-'),
               ('‑', '-'),
               ('→', '->'),
           )
           cleaned = text
           for original, replacement in substitutions:
               cleaned = cleaned.replace(original, replacement)

           return re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', cleaned)

     

       85
       85
       +
       

     

       86
       86
       +
       

     

       87
       87
       +
       # An identifier; it may not start with a quote, so a type variable such as
       # 'a is never mistaken for the declared name.
       _SIG_NAME = r"[A-Za-z0-9_][A-Za-z0-9_']*"
       # Type parameters before a type name: 'a, +'a, -'a, !'a, _ or a
       # parenthesised list such as ('a, 'b).
       _SIG_TYPE_PARAMS = r"(?:[+\-!]*(?:'[A-Za-z0-9_]+|_)|\([^)]*\))"

       # One pattern per signature keyword; group 1 is always the declared name.
       _SIGNATURE_NAME_PATTERNS = (
           # val name : ...   or   val (op) : ...
           re.compile(rf"\s*val\s+({_SIG_NAME}|\(\s*[^\s()]+\s*\))\s*:"),
           # type [nonrec] [params] name
           re.compile(rf"\s*type\s+(?:nonrec\s+)?(?:{_SIG_TYPE_PARAMS}\s+)?({_SIG_NAME})"),
           # module [type] Name
           re.compile(rf"\s*module\s+(?:type\s+)?({_SIG_NAME})"),
           # class [type] [virtual] [['a]] name
           re.compile(
               rf"\s*class\s+(?:type\s+)?(?:virtual\s+)?(?:\[[^\]]*\]\s+)?({_SIG_NAME})"
           ),
           # exception Name
           re.compile(rf"\s*exception\s+({_SIG_NAME})"),
       )


       def extract_signature_name(sig_content):
           """Extract the name of a signature (function name, type name, etc.).

           Args:
               sig_content: Plain-text signature such as ``"val f : int -> int"``
                   or ``"type 'a t = ..."``. May be ``None`` or empty.

           Returns:
               The declared identifier, or ``None`` when the text does not start
               with ``val``, ``type``, ``module``, ``class`` or ``exception``
               followed by a recognisable name. Operator values are returned in
               their parenthesised form exactly as written, e.g. ``"(+)"``.
           """
           if not sig_content:
               return None

           for pattern in _SIGNATURE_NAME_PATTERNS:
               match = pattern.match(sig_content)
               if match:
                   return match.group(1)

           return None

     

       115
       115
       +
       

     

       116
       116
       +
       

     

       117
       117
       +
       def _code_block_language(code_tag, pre_tag, default="ocaml"):
           """Return the language named by a ``language-*`` CSS class, else *default*.

           The <code> element is checked first, then the enclosing <pre>.
           NOTE(review): which of the two elements odoc puts the class on has not
           been confirmed here, so both are consulted.
           """
           for tag in (code_tag, pre_tag):
               for css_class in tag.get("class") or []:
                   if css_class.startswith("language-"):
                       language = css_class.removeprefix("language-")
                       if language:
                           return language
           return default


       def _is_inside_list_item(node, boundary):
           """Tell whether *node* has an <li> ancestor below *boundary*."""
           for ancestor in node.parents:
               if ancestor is boundary:
                   return False
               if ancestor.name == "li":
                   return True
           return False


       def _extract_spec_doc(doc_div):
           """Flatten a ``spec-doc`` element into text, or ``None`` if it is empty.

           Parts are emitted grouped by kind (paragraphs, then list items, then
           code blocks), not in document order.
           """
           doc_parts = []

           # Paragraphs nested in a list item are skipped here: their text is
           # already part of the item's own text below, so emitting them too would
           # duplicate it.
           for paragraph in doc_div.find_all("p"):
               if _is_inside_list_item(paragraph, doc_div):
                   continue
               paragraph_text = clean_signature_text(paragraph.get_text()).strip()
               if paragraph_text:
                   doc_parts.append(paragraph_text)

           for bullet_list in doc_div.find_all("ul"):
               for item in bullet_list.find_all("li"):
                   # Items carrying an at-tag span (e.g. raises, returns) are
                   # rendered as "@tag text" instead of a bullet.
                   tag_span = item.find("span", class_="at-tag")
                   if tag_span:
                       tag_name = tag_span.get_text().strip()
                       # Detach the tag so it is not repeated in the item text.
                       tag_span.extract()
                       item_text = clean_signature_text(item.get_text()).strip()
                       doc_parts.append(f"@{tag_name} {item_text}")
                   else:
                       item_text = clean_signature_text(item.get_text()).strip()
                       doc_parts.append(f"- {item_text}")

           for pre in doc_div.find_all("pre"):
               code = pre.find("code")
               if code:
                   lang = _code_block_language(code, pre)
                   # Code text is left as-is so indentation and line breaks survive.
                   doc_parts.append(f"```{lang}\n{code.get_text()}\n```")

           return "\n".join(doc_parts) if doc_parts else None


       def parse_module_signature(content_soup):
           """Parse the OCaml module signature from the HTML content.

           Args:
               content_soup: Parsed page body (a BeautifulSoup tree). At-tag spans
                   inside documentation lists are detached from it as a side
                   effect.

           Returns:
               A list of dicts, one per ``div.odoc-spec`` that has both a keyword
               and signature text, with keys ``id`` (the ``id`` attribute of the
               ``div.spec``, ``""`` when absent), ``type`` (the keyword text, e.g.
               ``val``), ``name`` (from ``extract_signature_name``; may be
               ``None``), ``content`` (cleaned signature text) and ``doc``
               (flattened documentation, or ``None``).
           """
           signatures = []

           for spec in content_soup.find_all("div", class_="odoc-spec"):
               sig_div = spec.find("div", class_="spec")
               if not sig_div:
                   continue

               keyword_span = sig_div.find("span", class_="keyword")
               code_tag = sig_div.find("code")
               if not keyword_span or not code_tag:
                   continue

               sig_type = keyword_span.get_text().strip()
               # get_text() already concatenates the text of all nested spans.
               sig_content = clean_signature_text(code_tag.get_text())
               if not sig_type or not sig_content:
                   continue

               doc_div = spec.find("div", class_="spec-doc")
               doc_content = _extract_spec_doc(doc_div) if doc_div else None

               signatures.append({
                   "id": sig_div.get("id", ""),
                   "type": sig_type,
                   "name": extract_signature_name(sig_content),
                   "content": sig_content,
                   "doc": doc_content,
               })

           return signatures

     

       224
       224
       +
       

     

       225
       225
       +
       

     

       226
       226
       +
       # Signature keywords that get a dedicated section, in output order, paired
       # with the section heading.
       _PRIMARY_SIGNATURE_SECTIONS = (
           ("type", "Types"),
           ("exception", "Exceptions"),
           ("val", "Values"),
           ("module", "Modules"),
           ("class", "Classes"),
       )


       def _append_signature_section(md_lines, heading, sigs):
           """Append one ``## heading`` section listing *sigs* to *md_lines*."""
           md_lines.append(f"## {heading}")
           for sig in sigs:
               md_lines.append("")
               md_lines.append(f"### `{sig['content']}`")
               if sig["doc"]:
                   md_lines.append("")
                   md_lines.append(sig["doc"])
           md_lines.append("")


       def generate_markdown(module_info, signatures):
           """Generate markdown documentation from parsed module information.

           Args:
               module_info: Dict with keys ``type``, ``name``, ``breadcrumbs``
                   (list of str) and ``preamble`` (HTML string, may be empty).
               signatures: List of dicts with keys ``type``, ``content`` and
                   ``doc`` (str or ``None``).

           Returns:
               A Markdown string: a title line, a ``**Path:**`` line, the preamble
               text if there is any, then one section per signature keyword.
               Types, exceptions, values, modules and classes come first in that
               order; any other keyword follows in first-seen order under a
               heading made by capitalising the keyword and appending ``s``.
           """
           md_lines = []

           breadcrumb_path = " > ".join(module_info["breadcrumbs"])
           md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
           md_lines.append(f"**Path:** {breadcrumb_path}")
           md_lines.append("")

           if module_info["preamble"]:
               preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
               preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
               if preamble_text:
                   md_lines.append(preamble_text)
                   md_lines.append("")

           # Group signatures by keyword, keeping first-seen order within each group.
           sig_by_type = defaultdict(list)
           for sig in signatures:
               sig_by_type[sig["type"]].append(sig)

           primary_keywords = set()
           for keyword, heading in _PRIMARY_SIGNATURE_SECTIONS:
               primary_keywords.add(keyword)
               # Membership test (not indexing) so the defaultdict gains no empty keys.
               if keyword in sig_by_type:
                   _append_signature_section(md_lines, heading, sig_by_type[keyword])

           for sig_type, sigs in sig_by_type.items():
               if sig_type not in primary_keywords:
                   _append_signature_section(md_lines, f"{sig_type.capitalize()}s", sigs)

           return "\n".join(md_lines)

     

       329
       329
       +
       

     

       330
       330
       +
       

     

       331
       331
       +
       # File names that are never parsed as documentation pages.
       _SKIPPED_JSON_FILES = frozenset({"sidebar.json", "status.json", "sherlodoc_db.js"})


       def build_module_hierarchy(json_files, root_dir):
           """Build a hierarchical structure from all the JSON files.

           Args:
               json_files: Iterable of paths to candidate JSON files.
               root_dir: Directory the paths are made relative to; the first
                   relative path component is used as the package name.

           Returns:
               A ``defaultdict(list)`` mapping package name to a list of dicts
               with keys ``file``, ``module_info``, ``signatures`` and
               ``path_parts`` (the relative path split on ``os.sep``).

           Files directly under *root_dir* have no package component and are
           skipped without being read. A file that cannot be read or parsed is
           reported on stderr and skipped, so one bad page does not abort the run.
           """
           hierarchy = defaultdict(list)

           for json_file in json_files:
               package_parts = os.path.relpath(json_file, root_dir).split(os.sep)

               if package_parts[-1] in _SKIPPED_JSON_FILES:
                   continue
               # No package directory to group under.
               if len(package_parts) < 2:
                   continue

               try:
                   with open(json_file, 'r', encoding='utf-8') as f:
                       json_content = f.read()

                   module_info = extract_module_info(json_content)
                   signatures = parse_module_signature(module_info["content"])
               except Exception as e:
                   # Deliberately broad: this is a best-effort pass over many files.
                   print(f"Error processing {json_file}: {e}", file=sys.stderr)
                   continue

               hierarchy[package_parts[0]].append({
                   "file": json_file,
                   "module_info": module_info,
                   "signatures": signatures,
                   "path_parts": package_parts,
               })

           return hierarchy

     

       386
       386
       +
       

     

       387
       387
       +
       

     

       388
       388
       +
       def sort_modules_hierarchically(modules):
           """Return the module entries ordered for top-down presentation.

           Each entry is a dict whose ``["module_info"]["breadcrumbs"]`` is a
           sequence. Entries are ordered by breadcrumb count (fewer first) and,
           within the same count, by their last breadcrumb; an entry with no
           breadcrumbs uses the empty string as its name. The input is not
           modified; a new list is returned.
           """
           def _rank(entry):
               # Sort key: (depth in the hierarchy, leaf name).
               crumbs = entry["module_info"]["breadcrumbs"]
               depth = len(crumbs)
               leaf = crumbs[-1] if crumbs else ""
               return (depth, leaf)

           ordered = list(modules)
           ordered.sort(key=_rank)
           return ordered

     

       396
       396
       +
       

     

       397
       397
       +
       

     

       398
       398
       +
       def generate_markdown_library(lib_name, modules):
           """Render every module of one library as a single Markdown string.

           The output starts with a ``# Library: <lib_name>`` heading and a blank
           line. Each module (in the order given by
           ``sort_modules_hierarchically``) is then rendered with
           ``generate_markdown`` and followed by a ``---`` rule. All pieces are
           joined with newlines.
           """
           chunks = [f"# Library: {lib_name}", ""]

           for entry in sort_modules_hierarchically(modules):
               rendered = generate_markdown(entry["module_info"], entry["signatures"])
               chunks.extend((rendered, "\n---\n"))

           return "\n".join(chunks)

     

       414
       414
       +
       

     

       415
       415
       +
       

     

       416
       416
       +
       def main(argv=None):
           """Command-line entry point: convert an odoc HTML/JSON tree to Markdown.

           Walks ``html_dir`` for ``*.html.json`` files, builds the per-library
           module hierarchy, renders either one library (``--package``) or all
           of them, and writes the result to the ``--output`` file.

           Args:
               argv: Optional list of command-line arguments, without the program
                   name. When None, argparse falls back to ``sys.argv[1:]``.

           Exits with status 1 when ``html_dir`` is not a directory, or when
           ``--package`` names a library that was not found.
           """
           parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
           parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
           parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
           parser.add_argument('--package', '-p', help='Focus on a specific package/library')
           parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
           args = parser.parse_args(argv)

           html_dir = Path(args.html_dir)

           # is_dir() is already False for a path that does not exist.
           if not html_dir.is_dir():
               print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
               sys.exit(1)

           # Find all JSON files. os.walk yields entries in filesystem order, so
           # sort the result to make the generated Markdown reproducible.
           json_files = sorted(
               os.path.join(root, name)
               for root, _, names in os.walk(html_dir)
               for name in names
               if name.endswith('.html.json')
           )

           if args.verbose:
               print(f"Found {len(json_files)} JSON files", file=sys.stderr)

           # Build module hierarchy
           hierarchy = build_module_hierarchy(json_files, html_dir)

           if args.verbose:
               print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
               for lib, modules in hierarchy.items():
                   print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

           # Generate markdown for all or specific package
           if args.package:
               # An unknown package used to fall through silently and dump every
               # library; report it instead so the filter is never ignored.
               if args.package not in hierarchy:
                   available = ", ".join(sorted(hierarchy)) or "(none)"
                   print(
                       f"Error: package '{args.package}' not found. "
                       f"Available packages: {available}",
                       file=sys.stderr,
                   )
                   sys.exit(1)
               markdown = generate_markdown_library(args.package, hierarchy[args.package])
           else:
               # Combine all packages
               markdown_parts = []
               for lib_name, modules in sorted(hierarchy.items()):
                   if args.verbose:
                       print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
                   markdown_parts.append(generate_markdown_library(lib_name, modules))
                   markdown_parts.append("\n\n")

               markdown = "\n".join(markdown_parts)

           # Write markdown to output file
           with open(args.output, 'w', encoding='utf-8') as f:
               f.write(markdown)

           print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)

     

       468
       468
       +
       

     

       469
       469
       +
       

     

       470
       470
       +
       # Entry-point guard: run the CLI only when this file is executed as a script.
       if __name__ == "__main__":

     

       471
       471
       +
           main()