# Package definition for ReadabiliPy, built with buildPythonPackage.
#
# The Python sources ship a JavaScript component under readabilipy/javascript;
# its npm dependencies are built as a separate derivation (`javascript`) and
# symlinked into the source tree before the Python build runs.
{
  lib,
  beautifulsoup4,
  buildPythonPackage,
  buildNpmPackage,
  fetchFromGitHub,
  html5lib,
  lxml,
  nodejs,
  pytestCheckHook,
  pythonOlder,
  regex,
  setuptools,
  testers,
  readabilipy,
}:

buildPythonPackage rec {
  pname = "readabilipy";
  version = "0.3.0";
  pyproject = true;

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "alan-turing-institute";
    repo = "ReadabiliPy";
    tag = "v${version}";
    hash = "sha256-FYdSbq3rm6fBHm5fDRAB0airX9fNcUGs1wHN4i6mnG0=";
  };

  patches = [
    # Fix test failures with Python 3.13.6
    # https://github.com/alan-turing-institute/ReadabiliPy/pull/116
    ./python3.13.6-compatibility.patch
  ];

  # npm dependencies of the JavaScript helper in readabilipy/javascript.
  # Exposed to the builder as $javascript and consumed in postPatch below.
  javascript = buildNpmPackage {
    pname = "readabilipy-javascript";
    inherit version src;

    sourceRoot = "${src.name}/readabilipy/javascript";
    npmDepsHash = "sha256-LiPSCZamkJjivzpawG7H9IEXYjn3uzFeY2vfucyHfUo=";

    # Use the lock file kept next to this expression
    # (presumably because upstream does not ship a usable one -- TODO confirm).
    postPatch = ''
      cp ${./package-lock.json} package-lock.json
    '';

    # Only the installed node_modules are needed; there is no build step to run.
    dontNpmBuild = true;
  };

  build-system = [ setuptools ];

  dependencies = [
    beautifulsoup4
    html5lib
    lxml
    regex
  ];

  postPatch = ''
    # Make the prebuilt node_modules visible where the Python package expects them.
    ln -s $javascript/lib/node_modules/ReadabiliPy/node_modules readabilipy/javascript/node_modules
    # Overwrite MANIFEST.in so everything under readabilipy/javascript is packaged.
    echo "recursive-include readabilipy/javascript *" >MANIFEST.in
  '';

  # Put node on PATH for the installed CLI. Passing the flags through
  # makeWrapperArgs lets buildPythonPackage's own wrapping apply them, instead of
  # wrapping the program by hand in postInstall and having it wrapped again
  # during fixup.
  makeWrapperArgs = [
    "--prefix"
    "PATH"
    ":"
    "${nodejs}/bin"
  ];

  nativeCheckInputs = [
    pytestCheckHook
    # node must be available while the test suite runs
    nodejs
  ];

  pythonImportsCheck = [ "readabilipy" ];

  disabledTestPaths = [
    # Exclude benchmarks
    "tests/test_benchmarking.py"
  ];

  disabledTests = [
    # IndexError: list index out of range
    "test_html_blacklist"
    "test_prune_div_with_one_empty_span"
    "test_prune_div_with_one_whitespace_paragraph"
    "test_empty_page"
    "test_contentless_page"
    "test_extract_title"
    "test_iframe_containing_tags"
    "test_iframe_with_source"
  ];

  passthru = {
    # Checks the CLI's reported version and that it reports Readability.js
    # support, i.e. that the wrapped program can find node and the JS helper.
    tests.version = testers.testVersion {
      package = readabilipy;
      command = "readabilipy --version";
      version = "${version} (Readability.js supported: yes)";
    };
  };

  meta = {
    description = "HTML content extractor";
    homepage = "https://github.com/alan-turing-institute/ReadabiliPy";
    changelog = "https://github.com/alan-turing-institute/ReadabiliPy/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ fab ];
    mainProgram = "readabilipy";
  };
}