# Package definition for ReadabiliPy: a Python package built from the
# alan-turing-institute/ReadabiliPy GitHub repository, together with the
# Node.js dependencies found under readabilipy/javascript in the same source.
{
  lib,
  beautifulsoup4,
  buildPythonPackage,
  buildNpmPackage,
  fetchFromGitHub,
  html5lib,
  lxml,
  nodejs,
  pytestCheckHook,
  pythonOlder,
  regex,
  setuptools,
  testers,
  readabilipy,
}:

buildPythonPackage rec {
  pname = "readabilipy";
  version = "0.3.0";
  pyproject = true;

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "alan-turing-institute";
    repo = "ReadabiliPy";
    tag = "v${version}";
    hash = "sha256-FYdSbq3rm6fBHm5fDRAB0airX9fNcUGs1wHN4i6mnG0=";
  };

  patches = [
    # Fix test failures with Python 3.13.6
    # https://github.com/alan-turing-institute/ReadabiliPy/pull/116
    ./python3.13.6-compatibility.patch
  ];

  # npm dependencies of the JavaScript part, built as a separate derivation
  # from the readabilipy/javascript subdirectory of the same source tree.
  # Being a plain attribute of the derivation, its store path is available to
  # the build phases below as the shell variable $javascript.
  javascript = buildNpmPackage {
    pname = "readabilipy-javascript";
    inherit version src;

    sourceRoot = "${src.name}/readabilipy/javascript";
    npmDepsHash = "sha256-LiPSCZamkJjivzpawG7H9IEXYjn3uzFeY2vfucyHfUo=";

    # Use the package-lock.json kept next to this expression
    # (presumably because the source does not ship a usable one; verify
    # against upstream before dropping this).
    postPatch = ''
      cp ${./package-lock.json} package-lock.json
    '';

    # No npm build step is run; only the installed node_modules tree is
    # consumed (see postPatch of the Python package below).
    dontNpmBuild = true;
  };

  build-system = [ setuptools ];

  dependencies = [
    beautifulsoup4
    html5lib
    lxml
    regex
  ];

  # Link the prebuilt node_modules into the source tree and overwrite
  # MANIFEST.in so that everything under readabilipy/javascript is included.
  postPatch = ''
    ln -s $javascript/lib/node_modules/ReadabiliPy/node_modules readabilipy/javascript/node_modules
    echo "recursive-include readabilipy/javascript *" >MANIFEST.in
  '';

  # Put the nodejs binaries on the PATH of the installed command-line program.
  postInstall = ''
    wrapProgram $out/bin/readabilipy \
      --prefix PATH : ${lib.makeBinPath [ nodejs ]}
  '';

  nativeCheckInputs = [
    pytestCheckHook
    nodejs
  ];

  pythonImportsCheck = [ "readabilipy" ];

  disabledTestPaths = [
    # Exclude benchmarks
    "tests/test_benchmarking.py"
  ];

  disabledTests = [
    # IndexError: list index out of range
    "test_html_blacklist"
    "test_prune_div_with_one_empty_span"
    "test_prune_div_with_one_whitespace_paragraph"
    "test_empty_page"
    "test_contentless_page"
    "test_extract_title"
    "test_iframe_containing_tags"
    "test_iframe_with_source"
  ];

  passthru = {
    # Checks that the wrapped CLI reports the packaged version and the
    # "Readability.js supported: yes" status string.
    tests.version = testers.testVersion {
      package = readabilipy;
      command = "readabilipy --version";
      version = "${version} (Readability.js supported: yes)";
    };
  };

  meta = {
    description = "HTML content extractor";
    homepage = "https://github.com/alan-turing-institute/ReadabiliPy";
    changelog = "https://github.com/alan-turing-institute/ReadabiliPy/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ fab ];
    mainProgram = "readabilipy";
  };
}