{
  lib,
  stdenv,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  cython,
  distlib,
  grpcio-tools,
  jinja2,
  jsonpickle,
  jsonschema,
  mypy-protobuf,
  redis,
  setuptools,
  yapf,

  # dependencies
  beartype,
  crcmod,
  dill,
  fastavro,
  fasteners,
  grpcio,
  hdfs,
  httplib2,
  numpy,
  objsize,
  orjson,
  proto-plus,
  protobuf,
  pyarrow,
  pydot,
  pymongo,
  python-dateutil,
  pytz,
  regex,
  requests,
  typing-extensions,
  zstandard,

  # tests
  python,
  docstring-parser,
  freezegun,
  hypothesis,
  mock,
  pandas,
  parameterized,
  psycopg2,
  pyhamcrest,
  pytest-xdist,
  pytestCheckHook,
  pyyaml,
  requests-mock,
  scikit-learn,
  sqlalchemy,
  tenacity,
  testcontainers,
  pythonAtLeast,
}:

buildPythonPackage rec {
  pname = "apache-beam";
  version = "2.68.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "apache";
    repo = "beam";
    tag = "v${version}";
    hash = "sha256-ENtvgu9qT1OPsDqFJQzKgIATE7F+S5I+AfoBT2iEL8M=";
  };

  # The Python SDK lives in a subdirectory of the Beam monorepo.
  sourceRoot = "${src.name}/sdks/python";

  postPatch = ''
    # Loosen exact (==) build-requirement pins and drop an upper bound so the
    # package builds against the versions currently packaged in nixpkgs.
    substituteInPlace pyproject.toml \
      --replace-fail "==" ">=" \
      --replace-fail ",<2.3.0" ""

    # copy_tests_from_docs() expects doc files that are not present in the
    # sdks/python source root; remove the call so setup.py can run.
    substituteInPlace setup.py \
      --replace-fail " copy_tests_from_docs()" ""
  '';

  pythonRelaxDeps = [
    "grpcio"
    "jsonpickle"

    # As of apache-beam v2.55.1, the requirement is cloudpickle~=2.2.1, but
    # the current (2024-04-20) nixpkgs's cloudpickle version is 3.0.0.
    "cloudpickle"

    # See https://github.com/NixOS/nixpkgs/issues/156957
    "dill"

    "numpy"

    "protobuf"

    # As of apache-beam v2.45.0, the requirement is pyarrow<10.0.0,>=0.15.1, but
    # the current (2023-02-22) nixpkgs's pyarrow version is 11.0.0.
    "pyarrow"

    "pydot"
    "redis"
  ];

  build-system = [
    cython
    distlib
    grpcio-tools
    jinja2
    jsonpickle
    jsonschema
    mypy-protobuf
    redis
    setuptools
    yapf
  ];

  dependencies = [
    beartype
    crcmod
    dill
    fastavro
    fasteners
    grpcio
    hdfs
    httplib2
    numpy
    objsize
    orjson
    proto-plus
    protobuf
    pyarrow
    pydot
    pymongo
    python-dateutil
    pytz
    regex
    requests
    typing-extensions
    zstandard
  ];

  enableParallelBuilding = true;

  # The test suite spins up local gRPC servers; allow loopback networking in
  # the darwin sandbox.
  __darwinAllowLocalNetworking = true;

  pythonImportsCheck = [ "apache_beam" ];

  nativeCheckInputs = [
    docstring-parser
    freezegun
    hypothesis
    mock
    pandas
    parameterized
    psycopg2
    pyhamcrest
    pytest-xdist
    pytestCheckHook
    pyyaml
    requests-mock
    scikit-learn
    sqlalchemy
    tenacity
    testcontainers
  ];

  # Make sure we're running the tests for the actually installed
  # package, so that cython's .so files are available.
  preCheck = ''
    cd $out/${python.sitePackages}
  '';

  disabledTestPaths = [
    # FileNotFoundError: [Errno 2] No such file or directory:
    # '/nix/store/...-python3.13-apache-beam-2.67.0/lib/python3.13/site-packages/apache_beam/yaml/docs/yaml.md'
    "apache_beam/yaml/examples/testing/examples_test.py"

    # from google.cloud.sql.connector import Connector
    # E ModuleNotFoundError: No module named 'google.cloud'
    "apache_beam/ml/rag/ingestion/cloudsql_it_test.py"

    # Fails with
    # _______ ERROR collecting apache_beam/io/external/xlang_jdbcio_it_test.py _______
    # apache_beam/io/external/xlang_jdbcio_it_test.py:80: in <module>
    #     class CrossLanguageJdbcIOTest(unittest.TestCase):
    # apache_beam/io/external/xlang_jdbcio_it_test.py:99: in CrossLanguageJdbcIOTest
    #     container_init: Callable[[], Union[PostgresContainer, MySqlContainer]],
    # E   NameError: name 'MySqlContainer' is not defined
    #
    "apache_beam/io/external/xlang_jdbcio_it_test.py"

    # These tests depend on the availability of specific servers backends.
    "apache_beam/runners/portability/flink_runner_test.py"
    "apache_beam/runners/portability/samza_runner_test.py"
    "apache_beam/runners/portability/spark_runner_test.py"

    # Fails starting from dill 0.3.6 because it tries to pickle pytest globals:
    # https://github.com/uqfoundation/dill/issues/482#issuecomment-1139017499.
    "apache_beam/transforms/window_test.py"

    # See https://github.com/apache/beam/issues/25390.
    "apache_beam/coders/slow_coders_test.py"
    "apache_beam/dataframe/pandas_doctests_test.py"
    "apache_beam/typehints/typed_pipeline_test.py"
    "apache_beam/coders/fast_coders_test.py"
    "apache_beam/dataframe/schemas_test.py"

    # Fails with TypeError: cannot pickle 'EncodedFile' instances
    # Upstream issue https://github.com/apache/beam/issues/33889
    "apache_beam/options/pipeline_options_validator_test.py"
    "apache_beam/yaml/main_test.py"
    "apache_beam/yaml/programming_guide_test.py"
    "apache_beam/yaml/readme_test.py"
    "apache_beam/yaml/yaml_combine_test.py"
    "apache_beam/yaml/yaml_enrichment_test.py"
    "apache_beam/yaml/yaml_io_test.py"
    "apache_beam/yaml/yaml_join_test.py"
    "apache_beam/yaml/yaml_mapping_test.py"
    "apache_beam/yaml/yaml_ml_test.py"
    "apache_beam/yaml/yaml_provider_unit_test.py"

    # FIXME AttributeError: 'Namespace' object has no attribute 'test_pipeline_options'
    # Upstream issue https://github.com/apache/beam/issues/33853
    "apache_beam/runners/portability/prism_runner_test.py"

    # FIXME ValueError: Unable to run pipeline with requirement: unsupported_requirement
    # Upstream issue https://github.com/apache/beam/issues/33853
    "apache_beam/yaml/yaml_transform_scope_test.py"
    "apache_beam/yaml/yaml_transform_test.py"
    "apache_beam/yaml/yaml_transform_unit_test.py"
    "apache_beam/yaml/yaml_udf_test.py"
    "apache_beam/dataframe/frames_test.py"

    # FIXME Those tests do not terminate due to a grpc error (threading issue)
    # grpc_status:14, grpc_message:"Cancelling all calls"}"
    # Upstream issue https://github.com/apache/beam/issues/33851
    "apache_beam/runners/portability/portable_runner_test.py"
  ]
  ++ lib.optionals (pythonAtLeast "3.13") [
    # > instruction = ofs_table[pc]
    # E KeyError: 18
    "apache_beam/typehints/trivial_inference_test.py"
  ];

  disabledTests = [
    # RuntimeError: This pipeline runs with the pipeline option --update_compatibility_version=2.67.0 or earlier.
    # When running with this option on SDKs 2.68.0 or later, you must ensure dill==0.3.1.1 is installed.
    "test_reshuffle_custom_window_preserves_metadata_1"
    "test_reshuffle_default_window_preserves_metadata_1"

    # AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
    # https://github.com/apache/beam/issues/33854
    "test_runner_overrides_default_pickler"

    # AssertionError: Lists differ
    "test_default_resources"
    "test_files_to_stage"
    "test_main_session_not_staged_when_using_cloudpickle"
    "test_no_main_session"
    "test_populate_requirements_cache_with_local_files"
    "test_requirements_cache_not_populated_when_cache_disabled"
    "test_sdk_location_default"
    "test_sdk_location_http"
    "test_sdk_location_local_directory"
    "test_sdk_location_local_source_file"
    "test_sdk_location_local_wheel_file"
    "test_sdk_location_remote_source_file"
    "test_sdk_location_remote_wheel_file"
    "test_with_extra_packages"
    "test_with_jar_packages"
    "test_with_main_session"
    "test_with_pypi_requirements"
    "test_with_requirements_file"
    "test_with_requirements_file_and_cache"

    # ValueError: SplitAtFraction test completed vacuously: no non-trivial split fractions found
    "test_dynamic_work_rebalancing"

    # fixture 'self' not found
    "test_with_batched_input_exceeds_size_limit"
    "test_with_batched_input_splits_large_batch"

    # IndexError: list index out of range
    "test_only_sample_exceptions"

    # AssertionError: False is not true
    "test_samples_all_with_both_experiments"
  ]
  ++ lib.optionals stdenv.hostPlatform.isDarwin [
    # PermissionError: [Errno 13] Permission denied: '/tmp/...'
    "test_cache_manager_uses_local_ib_cache_root"
    "test_describe_all_recordings"
    "test_find_out_correct_user_pipeline"
    "test_get_cache_manager_creates_cache_manager_if_absent"
    "test_streaming_cache_uses_local_ib_cache_root"
    "test_track_user_pipeline_cleanup_non_inspectable_pipeline"
  ]
  ++ lib.optionals (pythonAtLeast "3.12") [
    # TypeError: Could not determine schema for type hint Any.
    "test_batching_beam_row_input"
    "test_auto_convert"
    "test_unbatching_series"
    "test_batching_beam_row_to_dataframe"

    # AssertionError: Any != <class 'int'>
    "test_pycallable_map"
    "testAlwaysReturnsEarly"

    # TypeError: Expected Iterator in return type annotation
    "test_get_output_batch_type"
  ];

  meta = {
    description = "Unified model for defining both batch and streaming data-parallel processing pipelines";
    homepage = "https://beam.apache.org/";
    changelog = "https://github.com/apache/beam/blob/${src.tag}/CHANGES.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ ndl ];
  };
}