1#! /usr/bin/env nix-shell
2#! nix-shell -i "python3 -I" -p "python3.withPackages(p: with p; [ aiohttp rich structlog ])"
3
from argparse import ArgumentParser, Namespace
from collections import defaultdict
from collections.abc import Mapping, Sequence
from enum import IntEnum
from http import HTTPStatus
from pathlib import Path
from typing import Optional
import asyncio, json, logging, os, sys

import aiohttp, structlog
from structlog.contextvars import bound_contextvars as log_context
15
16
class LogLevel(IntEnum):
    """Log levels mirroring the stdlib ``logging`` numeric levels.

    Defined as a proper class (rather than the functional API plus a
    monkey-patched lambda ``__str__``) so the pretty name is part of the
    type. ``str()`` yields the bare level name, which argparse uses both
    for ``choices`` display and for matching ``--log-level`` input.
    """
    DEBUG = logging.DEBUG
    INFO = logging.INFO
    WARNING = logging.WARNING
    ERROR = logging.ERROR
    CRITICAL = logging.CRITICAL

    def __str__(self) -> str:
        # IntEnum's default __str__ would print the numeric value.
        return self.name
22
23
# Statuses the checker knows how to classify: success (200), the redirect
# some man-page hosts answer with (302), and a plain broken link (404).
# Anything else is reported as "unexpected".
EXPECTED_STATUS = frozenset({
    HTTPStatus.OK,
    HTTPStatus.FOUND,
    HTTPStatus.NOT_FOUND,
})
28
29async def check(session: aiohttp.ClientSession, manpage: str, url: str) -> HTTPStatus:
30 with log_context(manpage=manpage, url=url):
31 logger.debug("Checking")
32 async with session.head(url) as resp:
33 st = HTTPStatus(resp.status)
34 match st:
35 case HTTPStatus.OK | HTTPStatus.FOUND:
36 logger.debug("OK!")
37 case HTTPStatus.NOT_FOUND:
38 logger.error("Broken link!")
39 case _ if st < 400:
40 logger.info("Unexpected code", status=st)
41 case _ if 400 <= st < 600:
42 logger.warn("Unexpected error", status=st)
43
44 return st
45
async def main(urls_path: Path) -> Mapping[HTTPStatus, int]:
    """Check every URL listed in *urls_path* and tally response statuses.

    Args:
        urls_path: path to a JSON file mapping manpage names to URLs.

    Returns:
        Mapping from each observed HTTPStatus to its occurrence count
        (a defaultdict, so absent statuses read as 0).
    """
    logger.info(f"Parsing {urls_path}")
    with urls_path.open() as urls_file:
        urls = json.load(urls_file)

    # int() yields 0, so every status starts at a zero count.
    count: defaultdict[HTTPStatus, int] = defaultdict(int)

    logger.info(f"Checking URLs from {urls_path}")
    async with aiohttp.ClientSession() as session:
        # Launch all checks concurrently; tally each as it completes.
        for status in asyncio.as_completed([
            check(session, manpage, url)
            for manpage, url in urls.items()
        ]):
            count[await status] += 1

    ok = count[HTTPStatus.OK] + count[HTTPStatus.FOUND]
    broken = count[HTTPStatus.NOT_FOUND]
    unknown = sum(c for st, c in count.items() if st not in EXPECTED_STATUS)
    logger.info(f"Done: {broken} broken links, "
                f"{ok} correct links, and {unknown} unexpected status")

    return count
68
69
def parse_args(args: Optional[Sequence[str]] = None) -> Namespace:
    """Parse command-line arguments.

    Args:
        args: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``log_level`` (a LogLevel, defaulting to the
        LOG_LEVEL environment variable or INFO) and ``file`` (an
        optional Path to the URL JSON file).
    """
    parser = ArgumentParser(
        prog = 'check-manpage-urls',
        description = 'Check the validity of the manpage URLs linked in the nixpkgs manual',
    )
    parser.add_argument(
        '-l', '--log-level',
        # Needs `os` imported at module level so this also works when the
        # file is imported as a module (not only when run as a script).
        default = os.getenv('LOG_LEVEL', 'INFO'),
        # Look the level up by name, e.g. 'DEBUG' -> LogLevel.DEBUG.
        type = LogLevel.__getitem__,
        choices = list(LogLevel),
    )
    parser.add_argument(
        'file',
        type = Path,
        nargs = '?',
    )

    return parser.parse_args(args)
88
89
if __name__ == "__main__":
    import os, sys

    cli_args = parse_args()

    # Configure structlog before the first log call. `logger` is a module
    # global that check() and main() read, so the name must stay `logger`.
    structlog.configure(
        wrapper_class=structlog.make_filtering_bound_logger(cli_args.log_level),
    )
    logger = structlog.getLogger("check-manpage-urls.py")

    urls_path = cli_args.file
    if urls_path is None:
        # No file given: assume the script lives four levels below the
        # nixpkgs checkout root and use the canonical JSON there.
        REPO_ROOT = Path(__file__).parent.parent.parent.parent
        logger.info(f"Assuming we are in a nixpkgs repo rooted at {REPO_ROOT}")

        urls_path = REPO_ROOT / 'doc' / 'manpage-urls.json'

    count = asyncio.run(main(urls_path))

    # Non-zero exit iff at least one link came back 404.
    sys.exit(int(count[HTTPStatus.NOT_FOUND] > 0))