at 24.11-pre 3.4 kB view raw
1#! /usr/bin/env nix-shell 2#! nix-shell -i "python3 -I" -p "python3.withPackages(p: with p; [ aiohttp rich structlog ])" 3 4from argparse import ArgumentParser, Namespace 5from collections import defaultdict 6from collections.abc import Mapping, Sequence 7from enum import IntEnum 8from http import HTTPStatus 9from pathlib import Path 10from typing import Optional 11import asyncio, json, logging 12 13import aiohttp, structlog 14from structlog.contextvars import bound_contextvars as log_context 15 16 17LogLevel = IntEnum('LogLevel', { 18 lvl: getattr(logging, lvl) 19 for lvl in ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL') 20}) 21LogLevel.__str__ = lambda self: self.name 22 23 24EXPECTED_STATUS=frozenset(( 25 HTTPStatus.OK, HTTPStatus.FOUND, 26 HTTPStatus.NOT_FOUND, 27)) 28 29async def check(session: aiohttp.ClientSession, manpage: str, url: str) -> HTTPStatus: 30 with log_context(manpage=manpage, url=url): 31 logger.debug("Checking") 32 async with session.head(url) as resp: 33 st = HTTPStatus(resp.status) 34 match st: 35 case HTTPStatus.OK | HTTPStatus.FOUND: 36 logger.debug("OK!") 37 case HTTPStatus.NOT_FOUND: 38 logger.error("Broken link!") 39 case _ if st < 400: 40 logger.info("Unexpected code", status=st) 41 case _ if 400 <= st < 600: 42 logger.warn("Unexpected error", status=st) 43 44 return st 45 46async def main(urls_path: Path) -> Mapping[HTTPStatus, int]: 47 logger.info(f"Parsing {urls_path}") 48 with urls_path.open() as urls_file: 49 urls = json.load(urls_file) 50 51 count: defaultdict[HTTPStatus, int] = defaultdict(lambda: 0) 52 53 logger.info(f"Checking URLs from {urls_path}") 54 async with aiohttp.ClientSession() as session: 55 for status in asyncio.as_completed([ 56 check(session, manpage, url) 57 for manpage, url in urls.items() 58 ]): 59 count[await status]+=1 60 61 ok = count[HTTPStatus.OK] + count[HTTPStatus.FOUND] 62 broken = count[HTTPStatus.NOT_FOUND] 63 unknown = sum(c for st, c in count.items() if st not in EXPECTED_STATUS) 64 logger.info(f"Done: {broken} broken links, " 65 f"{ok} correct links, and {unknown} unexpected status") 66 67 return count 68 69 70def parse_args(args: Optional[Sequence[str]] = None) -> Namespace: 71 parser = ArgumentParser( 72 prog = 'check-manpage-urls', 73 description = 'Check the validity of the manpage URLs linked in the nixpkgs manual', 74 ) 75 parser.add_argument( 76 '-l', '--log-level', 77 default = os.getenv('LOG_LEVEL', 'INFO'), 78 type = lambda s: LogLevel[s], 79 choices = list(LogLevel), 80 ) 81 parser.add_argument( 82 'file', 83 type = Path, 84 nargs = '?', 85 ) 86 87 return parser.parse_args(args) 88 89 90if __name__ == "__main__": 91 import os, sys 92 93 args = parse_args() 94 95 structlog.configure( 96 wrapper_class=structlog.make_filtering_bound_logger(args.log_level), 97 ) 98 logger = structlog.getLogger("check-manpage-urls.py") 99 100 urls_path = args.file 101 if urls_path is None: 102 REPO_ROOT = Path(__file__).parent.parent.parent.parent 103 logger.info(f"Assuming we are in a nixpkgs repo rooted at {REPO_ROOT}") 104 105 urls_path = REPO_ROOT / 'doc' / 'manpage-urls.json' 106 107 count = asyncio.run(main(urls_path)) 108 109 sys.exit(0 if count[HTTPStatus.NOT_FOUND] == 0 else 1)