# Nix package definition for Scrapy, built from the upstream GitHub
# repository with the hatchling build backend. Besides the Python package
# it installs the man page and the zsh/bash completions shipped in the
# source tree's `extras/` directory.
{
  lib,
  stdenv,
  botocore,
  buildPythonPackage,
  cryptography,
  cssselect,
  defusedxml,
  fetchFromGitHub,
  glibcLocales,
  hatchling,
  installShellFiles,
  itemadapter,
  itemloaders,
  jmespath,
  lxml,
  packaging,
  parsel,
  pexpect,
  protego,
  pydispatcher,
  pyopenssl,
  pytest-xdist,
  pytestCheckHook,
  pythonOlder,
  queuelib,
  service-identity,
  setuptools,
  sybil,
  testfixtures,
  tldextract,
  twisted,
  uvloop,
  w3lib,
  zope-interface,
}:

buildPythonPackage rec {
  pname = "scrapy";
  version = "2.13.3";
  pyproject = true;

  # Upstream dropped Python 3.8 support in Scrapy 2.12, so this release
  # requires Python 3.9 or newer.
  disabled = pythonOlder "3.9";

  src = fetchFromGitHub {
    owner = "scrapy";
    repo = "scrapy";
    tag = version;
    hash = "sha256-M+Lko0O0xsEPHLghvIGHxIv22XBXaZsujJ2+bjBzGZ4=";
  };

  # Ignore the version constraint upstream declares for defusedxml.
  pythonRelaxDeps = [
    "defusedxml"
  ];

  build-system = [
    hatchling
  ];

  nativeBuildInputs = [
    # Provides installManPage / installShellCompletion used in postInstall.
    installShellFiles
    # NOTE(review): the build backend is hatchling, so setuptools looks
    # unnecessary here -- confirm nothing at build/check time needs it
    # before dropping it.
    setuptools
  ];

  # Runtime Python dependencies.
  dependencies = [
    cryptography
    cssselect
    defusedxml
    itemadapter
    itemloaders
    lxml
    packaging
    parsel
    protego
    pydispatcher
    pyopenssl
    queuelib
    service-identity
    tldextract
    twisted
    w3lib
    zope-interface
  ];

  nativeCheckInputs = [
    botocore
    glibcLocales
    jmespath
    pexpect
    pytest-xdist
    pytestCheckHook
    sybil
    testfixtures
    uvloop
  ];

  # Use a UTF-8 locale during the build; glibcLocales (in nativeCheckInputs)
  # presumably supplies it for the test phase.
  env.LC_ALL = "en_US.UTF-8";

  disabledTestPaths = [
    "tests/test_proxy_connect.py"
    "tests/test_utils_display.py"
    "tests/test_command_check.py"

    # ConnectionRefusedError: [Errno 111] Connection refused
    "tests/test_feedexport.py::TestFTPFeedStorage::test_append"
    "tests/test_feedexport.py::TestFTPFeedStorage::test_append_active_mode"
    "tests/test_feedexport.py::TestFTPFeedStorage::test_overwrite"
    "tests/test_feedexport.py::TestFTPFeedStorage::test_overwrite_active_mode"

    # This test checks that the *first* deprecation warning is a specific
    # one, but other deprecation warnings show up before it here. The
    # expected warning is still raised, so this is not a material issue.
    "tests/test_spider_start.py::MainTestCase::test_start_deprecated_super"

    # Don't test the documentation
    "docs"
  ];

  disabledTests = [
    # Requires network access
    "AnonymousFTPTestCase"
    "FTPFeedStorageTest"
    "FeedExportTest"
    "test_custom_asyncio_loop_enabled_true"
    "test_custom_loop_asyncio"
    "test_custom_loop_asyncio_deferred_signal"
    "FileFeedStoragePreFeedOptionsTest" # https://github.com/scrapy/scrapy/issues/5157
    "test_persist"
    "test_timeout_download_from_spider_nodata_rcvd"
    "test_timeout_download_from_spider_server_hangs"
    "test_unbounded_response"
    "CookiesMiddlewareTest"
    # Test fails on Hydra
    "test_start_requests_laziness"
  ]
  ++ lib.optionals stdenv.hostPlatform.isDarwin [
    "test_xmliter_encoding"
    "test_download"
    "test_reactor_default_twisted_reactor_select"
    "URIParamsSettingTest"
    "URIParamsFeedOptionTest"
    # flaky on darwin-aarch64
    # (test_start_requests_laziness is already disabled on every platform
    # in the list above, so it is not repeated here.)
    "test_fixed_delay"
  ];

  # Install the man page and shell completions shipped in extras/.
  postInstall = ''
    installManPage extras/scrapy.1
    installShellCompletion --cmd scrapy \
      --zsh extras/scrapy_zsh_completion \
      --bash extras/scrapy_bash_completion
  '';

  pythonImportsCheck = [ "scrapy" ];

  # Allow local networking inside the Darwin build sandbox.
  __darwinAllowLocalNetworking = true;

  meta = {
    description = "High-level web crawling and web scraping framework";
    mainProgram = "scrapy";
    longDescription = ''
      Scrapy is a fast high-level web crawling and web scraping framework, used to crawl
      websites and extract structured data from their pages. It can be used for a wide
      range of purposes, from data mining to monitoring and automated testing.
    '';
    homepage = "https://scrapy.org/";
    changelog = "https://github.com/scrapy/scrapy/raw/${src.tag}/docs/news.rst";
    license = lib.licenses.bsd3;
    maintainers = with lib.maintainers; [ vinnymeller ];
  };
}