# Apache Beam Python SDK, built from the `sdks/python` subtree of the
# apache/beam monorepo. Most of the complexity here is test curation:
# large parts of the upstream suite require external services or pinned
# dependency versions that nixpkgs does not provide.
{
  lib,
  stdenv,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  cython,
  distlib,
  grpcio-tools,
  jinja2,
  jsonpickle,
  jsonschema,
  mypy-protobuf,
  redis,
  setuptools,
  yapf,

  # dependencies
  beartype,
  crcmod,
  dill,
  fastavro,
  fasteners,
  grpcio,
  hdfs,
  httplib2,
  numpy,
  objsize,
  orjson,
  proto-plus,
  protobuf,
  pyarrow,
  pydot,
  pymongo,
  python-dateutil,
  pytz,
  regex,
  requests,
  typing-extensions,
  zstandard,

  # tests
  python,
  docstring-parser,
  freezegun,
  hypothesis,
  mock,
  pandas,
  parameterized,
  psycopg2,
  pyhamcrest,
  pytest-xdist,
  pytestCheckHook,
  pyyaml,
  requests-mock,
  scikit-learn,
  sqlalchemy,
  tenacity,
  testcontainers,
  pythonAtLeast,
}:

buildPythonPackage rec {
  pname = "apache-beam";
  version = "2.68.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "apache";
    repo = "beam";
    tag = "v${version}";
    hash = "sha256-ENtvgu9qT1OPsDqFJQzKgIATE7F+S5I+AfoBT2iEL8M=";
  };

  # The Python SDK lives in a subdirectory of the beam monorepo.
  sourceRoot = "${src.name}/sdks/python";

  # Loosen upstream's exact pins ("==" -> ">=") and drop an upper bound in
  # pyproject.toml so the build accepts the versions nixpkgs ships; also
  # disable setup.py's copy_tests_from_docs(), which expects files that are
  # not present in this source layout.
  postPatch = ''
    substituteInPlace pyproject.toml \
      --replace-fail "==" ">=" \
      --replace-fail ",<2.3.0" ""

    substituteInPlace setup.py \
      --replace-fail " copy_tests_from_docs()" ""
  '';

  pythonRelaxDeps = [
    "grpcio"
    "jsonpickle"

    # As of apache-beam v2.55.1, the requirement is cloudpickle~=2.2.1, but
    # nixpkgs carries a newer cloudpickle.
    # NOTE(review): the original note here cited "pydot version is 3.0.0",
    # which looks like a copy/paste slip (pydot has its own entry below) —
    # verify the current nixpkgs cloudpickle version when updating.
    "cloudpickle"

    # See https://github.com/NixOS/nixpkgs/issues/156957
    "dill"

    "numpy"

    "protobuf"

    # As of apache-beam v2.45.0, the requirement is pyarrow<10.0.0,>=0.15.1, but
    # the current (2023-02-22) nixpkgs's pyarrow version is 11.0.0.
    "pyarrow"

    "pydot"
    "redis"
  ];

  build-system = [
    cython
    distlib
    grpcio-tools
    jinja2
    jsonpickle
    jsonschema
    mypy-protobuf
    redis
    setuptools
    yapf
  ];

  dependencies = [
    beartype
    crcmod
    dill
    fastavro
    fasteners
    grpcio
    hdfs
    httplib2
    numpy
    objsize
    orjson
    proto-plus
    protobuf
    pyarrow
    pydot
    pymongo
    python-dateutil
    pytz
    regex
    requests
    typing-extensions
    zstandard
  ];

  enableParallelBuilding = true;

  # Some tests open local sockets; allow that inside the darwin sandbox.
  __darwinAllowLocalNetworking = true;

  pythonImportsCheck = [ "apache_beam" ];

  nativeCheckInputs = [
    docstring-parser
    freezegun
    hypothesis
    mock
    pandas
    parameterized
    psycopg2
    pyhamcrest
    pytest-xdist
    pytestCheckHook
    pyyaml
    requests-mock
    scikit-learn
    sqlalchemy
    tenacity
    testcontainers
  ];

  # Make sure we're running the tests for the actually installed
  # package, so that cython's .so files are available.
  preCheck = ''
    cd $out/${python.sitePackages}
  '';

  disabledTestPaths = [
    # FileNotFoundError: [Errno 2] No such file or directory:
    # '/nix/store/...-python3.13-apache-beam-2.67.0/lib/python3.13/site-packages/apache_beam/yaml/docs/yaml.md'
    "apache_beam/yaml/examples/testing/examples_test.py"

    # from google.cloud.sql.connector import Connector
    # E ModuleNotFoundError: No module named 'google.cloud'
    "apache_beam/ml/rag/ingestion/cloudsql_it_test.py"

    # Fails with
    # _______ ERROR collecting apache_beam/io/external/xlang_jdbcio_it_test.py _______
    # apache_beam/io/external/xlang_jdbcio_it_test.py:80: in <module>
    #     class CrossLanguageJdbcIOTest(unittest.TestCase):
    # apache_beam/io/external/xlang_jdbcio_it_test.py:99: in CrossLanguageJdbcIOTest
    #     container_init: Callable[[], Union[PostgresContainer, MySqlContainer]],
    # E   NameError: name 'MySqlContainer' is not defined
    #
    "apache_beam/io/external/xlang_jdbcio_it_test.py"

    # These tests depend on the availability of specific servers backends.
    "apache_beam/runners/portability/flink_runner_test.py"
    "apache_beam/runners/portability/samza_runner_test.py"
    "apache_beam/runners/portability/spark_runner_test.py"

    # Fails starting from dill 0.3.6 because it tries to pickle pytest globals:
    # https://github.com/uqfoundation/dill/issues/482#issuecomment-1139017499.
    "apache_beam/transforms/window_test.py"

    # See https://github.com/apache/beam/issues/25390.
    "apache_beam/coders/slow_coders_test.py"
    "apache_beam/dataframe/pandas_doctests_test.py"
    "apache_beam/typehints/typed_pipeline_test.py"
    "apache_beam/coders/fast_coders_test.py"
    "apache_beam/dataframe/schemas_test.py"

    # Fails with TypeError: cannot pickle 'EncodedFile' instances
    # Upstream issue https://github.com/apache/beam/issues/33889
    "apache_beam/options/pipeline_options_validator_test.py"
    "apache_beam/yaml/main_test.py"
    "apache_beam/yaml/programming_guide_test.py"
    "apache_beam/yaml/readme_test.py"
    "apache_beam/yaml/yaml_combine_test.py"
    "apache_beam/yaml/yaml_enrichment_test.py"
    "apache_beam/yaml/yaml_io_test.py"
    "apache_beam/yaml/yaml_join_test.py"
    "apache_beam/yaml/yaml_mapping_test.py"
    "apache_beam/yaml/yaml_ml_test.py"
    "apache_beam/yaml/yaml_provider_unit_test.py"

    # FIXME AttributeError: 'Namespace' object has no attribute 'test_pipeline_options'
    # Upstream issue https://github.com/apache/beam/issues/33853
    "apache_beam/runners/portability/prism_runner_test.py"

    # FIXME ValueError: Unable to run pipeline with requirement: unsupported_requirement
    # Upstream issue https://github.com/apache/beam/issues/33853
    "apache_beam/yaml/yaml_transform_scope_test.py"
    "apache_beam/yaml/yaml_transform_test.py"
    "apache_beam/yaml/yaml_transform_unit_test.py"
    "apache_beam/yaml/yaml_udf_test.py"
    "apache_beam/dataframe/frames_test.py"

    # FIXME Those tests do not terminate due to a grpc error (threading issue)
    # grpc_status:14, grpc_message:"Cancelling all calls"}"
    # Upstream issue https://github.com/apache/beam/issues/33851
    "apache_beam/runners/portability/portable_runner_test.py"
  ]
  ++ lib.optionals (pythonAtLeast "3.13") [
    # > instruction = ofs_table[pc]
    # E KeyError: 18
    "apache_beam/typehints/trivial_inference_test.py"
  ];

  disabledTests = [
    # RuntimeError: This pipeline runs with the pipeline option --update_compatibility_version=2.67.0 or earlier.
    # When running with this option on SDKs 2.68.0 or later, you must ensure dill==0.3.1.1 is installed. Error
    "test_reshuffle_custom_window_preserves_metadata_1"
    "test_reshuffle_default_window_preserves_metadata_1"

    # AttributeError: 'MaybeReshuffle' object has no attribute 'side_inputs'
    # https://github.com/apache/beam/issues/33854
    "test_runner_overrides_default_pickler"

    # AssertionError: Lists differ
    "test_default_resources"
    "test_files_to_stage"
    "test_main_session_not_staged_when_using_cloudpickle"
    "test_no_main_session"
    "test_populate_requirements_cache_with_local_files"
    "test_requirements_cache_not_populated_when_cache_disabled"
    "test_sdk_location_default"
    "test_sdk_location_http"
    "test_sdk_location_local_directory"
    "test_sdk_location_local_source_file"
    "test_sdk_location_local_wheel_file"
    "test_sdk_location_remote_source_file"
    "test_sdk_location_remote_wheel_file"
    "test_with_extra_packages"
    "test_with_jar_packages"
    "test_with_main_session"
    "test_with_pypi_requirements"
    "test_with_requirements_file"
    "test_with_requirements_file_and_cache"

    # ValueError: SplitAtFraction test completed vacuously: no non-trivial split fractions found
    "test_dynamic_work_rebalancing"

    # fixture 'self' not found
    "test_with_batched_input_exceeds_size_limit"
    "test_with_batched_input_splits_large_batch"

    # IndexError: list index out of range
    "test_only_sample_exceptions"

    # AssertionError: False is not true
    "test_samples_all_with_both_experiments"
  ]
  ++ lib.optionals stdenv.hostPlatform.isDarwin [
    # PermissionError: [Errno 13] Permission denied: '/tmp/...'
    "test_cache_manager_uses_local_ib_cache_root"
    "test_describe_all_recordings"
    "test_find_out_correct_user_pipeline"
    "test_get_cache_manager_creates_cache_manager_if_absent"
    "test_streaming_cache_uses_local_ib_cache_root"
    "test_track_user_pipeline_cleanup_non_inspectable_pipeline"
  ]
  ++ lib.optionals (pythonAtLeast "3.12") [
    # TypeError: Could not determine schema for type hint Any.
    "test_batching_beam_row_input"
    "test_auto_convert"
    "test_unbatching_series"
    "test_batching_beam_row_to_dataframe"

    # AssertionError: Any != <class 'int'>
    "test_pycallable_map"
    "testAlwaysReturnsEarly"

    # TypeError: Expected Iterator in return type annotation
    "test_get_output_batch_type"
  ];

  meta = {
    description = "Unified model for defining both batch and streaming data-parallel processing pipelines";
    homepage = "https://beam.apache.org/";
    changelog = "https://github.com/apache/beam/blob/${src.tag}/CHANGES.md";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ ndl ];
  };
}