at master 7.3 kB view raw
1{ 2 lib, 3 stdenv, 4 buildPythonPackage, 5 fetchFromGitHub, 6 7 # build-system 8 cython, 9 poetry-core, 10 setuptools, 11 12 # dependencies 13 cachetools, 14 click, 15 fsspec, 16 google-auth, 17 mmh3, 18 pydantic, 19 pyparsing, 20 pyroaring, 21 ray, 22 requests, 23 rich, 24 sortedcontainers, 25 strictyaml, 26 tenacity, 27 zstandard, 28 29 # optional-dependencies 30 adlfs, 31 google-cloud-bigquery, 32 # bodo, 33 # daft, 34 duckdb, 35 pyarrow, 36 pyiceberg-core, 37 boto3, 38 huggingface-hub, 39 gcsfs, 40 thrift, 41 kerberos, 42 # thrift-sasl, 43 pandas, 44 # pyiceberg-core, 45 s3fs, 46 python-snappy, 47 psycopg2-binary, 48 sqlalchemy, 49 50 # tests 51 azure-core, 52 azure-storage-blob, 53 datafusion, 54 fastavro, 55 moto, 56 pyspark, 57 pytestCheckHook, 58 pytest-lazy-fixture, 59 pytest-mock, 60 pytest-timeout, 61 requests-mock, 62 pythonAtLeast, 63}: 64 65buildPythonPackage rec { 66 pname = "iceberg-python"; 67 version = "0.10.0"; 68 pyproject = true; 69 70 src = fetchFromGitHub { 71 owner = "apache"; 72 repo = "iceberg-python"; 73 tag = "pyiceberg-${version}"; 74 hash = "sha256-uR8nmKVjYjiArcNaf/Af2kGh14p59VV9g2mKPKmiJnc="; 75 }; 76 77 patches = [ 78 # Build script fails to build the cython extension on python 3.11 (no issues with python 3.12): 79 # distutils.errors.DistutilsSetupError: each element of 'ext_modules' option must be an Extension instance or 2-tuple 80 # This error vanishes if Cython and setuptools imports are swapped 81 # https://stackoverflow.com/a/53356077/11196710 82 ./reorder-imports-in-build-script.patch 83 ]; 84 85 build-system = [ 86 cython 87 poetry-core 88 setuptools 89 ]; 90 91 # Prevents the cython build to fail silently 92 env.CIBUILDWHEEL = "1"; 93 94 pythonRelaxDeps = [ 95 "cachetools" 96 "rich" 97 ]; 98 99 dependencies = [ 100 cachetools 101 click 102 fsspec 103 google-auth 104 mmh3 105 pydantic 106 pyparsing 107 pyroaring 108 ray 109 requests 110 rich 111 sortedcontainers 112 strictyaml 113 tenacity 114 zstandard 115 ]; 116 117 optional-dependencies = { 118 adlfs = [ 119 adlfs 120 ]; 121 bigquery = [ 122 google-cloud-bigquery 123 ]; 124 bodo = [ 125 # bodo 126 ]; 127 daft = [ 128 # daft 129 ]; 130 duckdb = [ 131 duckdb 132 pyarrow 133 ]; 134 dynamodb = [ 135 boto3 136 ]; 137 hf = [ 138 huggingface-hub 139 ]; 140 gcsfs = [ 141 gcsfs 142 ]; 143 glue = [ 144 boto3 145 ]; 146 hive = [ 147 thrift 148 ]; 149 hive-kerberos = [ 150 kerberos 151 thrift 152 # thrift-sasl 153 ]; 154 pandas = [ 155 pandas 156 pyarrow 157 ]; 158 pyarrow = [ 159 pyarrow 160 pyiceberg-core 161 ]; 162 ray = [ 163 pandas 164 pyarrow 165 ray 166 ]; 167 s3fs = [ 168 s3fs 169 ]; 170 snappy = [ 171 python-snappy 172 ]; 173 sql-postgres = [ 174 psycopg2-binary 175 sqlalchemy 176 ]; 177 sql-sqlite = [ 178 sqlalchemy 179 ]; 180 zstandard = [ 181 zstandard 182 ]; 183 }; 184 185 pythonImportsCheck = [ 186 "pyiceberg" 187 # Compiled avro decoder (cython) 188 "pyiceberg.avro.decoder_fast" 189 ]; 190 191 nativeCheckInputs = [ 192 azure-core 193 azure-storage-blob 194 boto3 195 datafusion 196 fastavro 197 moto 198 pyspark 199 pytest-lazy-fixture 200 pytest-mock 201 pytest-timeout 202 pytestCheckHook 203 requests-mock 204 ] 205 ++ optional-dependencies.bigquery 206 ++ optional-dependencies.hive 207 ++ optional-dependencies.pandas 208 ++ optional-dependencies.pyarrow 209 ++ optional-dependencies.s3fs 210 ++ optional-dependencies.sql-sqlite 211 ++ moto.optional-dependencies.server; 212 213 pytestFlags = [ 214 # ResourceWarning: unclosed database in <sqlite3.Connection object at 0x7ffe7c6f4220> 215 "-Wignore::pytest.PytestUnraisableExceptionWarning" 216 ]; 217 218 disabledTestPaths = [ 219 # Several errors: 220 # - FileNotFoundError: [Errno 2] No such file or directory: '/nix/store/...-python3.12-pyspark-3.5.3/lib/python3.12/site-packages/pyspark/./bin/spark-submit' 221 # - requests.exceptions.ConnectionError: HTTPConnectionPool(host='localhost', port=8181): Max retries exceeded with url: /v1/config 222 # - thrift.transport.TTransport.TTransportException: Could not connect to any of [('127.0.0.1', 9083)] 223 "tests/integration" 224 ]; 225 226 disabledTests = [ 227 # KeyError: 'authorization' 228 "test_token_200" 229 "test_token_200_without_optional_fields" 230 "test_token_with_default_scope" 231 "test_token_with_optional_oauth_params" 232 "test_token_with_custom_scope" 233 234 # AttributeError: 'SessionContext' object has no attribute 'register_table_provider' 235 "test_datafusion_register_pyiceberg_tabl" 236 237 # ModuleNotFoundError: No module named 'puresasl' 238 "test_create_hive_client_with_kerberos" 239 "test_create_hive_client_with_kerberos_using_context_manager" 240 241 # botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL 242 "test_checking_if_a_file_exists" 243 "test_closing_a_file" 244 "test_fsspec_file_tell" 245 "test_fsspec_getting_length_of_file" 246 "test_fsspec_pickle_round_trip_s3" 247 "test_fsspec_raise_on_opening_file_not_found" 248 "test_fsspec_read_specified_bytes_for_file" 249 "test_fsspec_write_and_read_file" 250 "test_writing_avro_file" 251 252 # Require unpackaged gcsfs 253 "test_fsspec_converting_an_outputfile_to_an_inputfile_gcs" 254 "test_fsspec_new_input_file_gcs" 255 "test_fsspec_new_output_file_gcs" 256 "test_fsspec_pickle_roundtrip_gcs" 257 258 # Timeout (network access) 259 "test_config_200" 260 "test_fsspec_converting_an_outputfile_to_an_inputfile_adls" 261 "test_fsspec_new_abfss_output_file_adls" 262 "test_fsspec_new_input_file_adls" 263 "test_fsspec_pickle_round_trip_aldfs" 264 "test_partitioned_write" 265 "test_token_200_w_oauth2_server_uri" 266 267 # azure.core.exceptions.ServiceRequestError (network access) 268 "test_converting_an_outputfile_to_an_inputfile_adls" 269 "test_file_tell_adls" 270 "test_getting_length_of_file_adls" 271 "test_new_input_file_adls" 272 "test_new_output_file_adls" 273 "test_raise_on_opening_file_not_found_adls" 274 "test_read_specified_bytes_for_file_adls" 275 "test_write_and_read_file_adls" 276 277 # Hangs forever (from tests/io/test_pyarrow.py) 278 "test_getting_length_of_file_gcs" 279 ] 280 ++ lib.optionals stdenv.hostPlatform.isDarwin [ 281 # ImportError: The pyarrow installation is not built with support for 'GcsFileSystem' 282 "test_converting_an_outputfile_to_an_inputfile_gcs" 283 "test_create_table_with_database_location" 284 "test_drop_table_with_database_location" 285 "test_new_input_file_gcs" 286 "test_new_output_file_gc" 287 288 # PermissionError: [Errno 13] Failed to open local file 289 # '/tmp/iceberg/warehouse/default.db/test_projection_partitions/metadata/00000-6c1c61a1-495f-45d3-903d-a2643431be91.metadata.json' 290 "test_identity_transform_column_projection" 291 "test_identity_transform_columns_projection" 292 "test_in_memory_catalog_context_manager" 293 "test_inspect_partition_for_nested_field" 294 ] 295 ++ lib.optionals (pythonAtLeast "3.13") [ 296 # AssertionError: 297 # assert "Incompatible with StructProtocol: <class 'str'>" in "Unable to initialize struct: <class 'str'>" 298 "test_read_not_struct_type" 299 ]; 300 301 __darwinAllowLocalNetworking = true; 302 303 meta = { 304 description = "Python library for programmatic access to Apache Iceberg"; 305 homepage = "https://github.com/apache/iceberg-python"; 306 changelog = "https://github.com/apache/iceberg-python/releases/tag/pyiceberg-${version}"; 307 license = lib.licenses.asl20; 308 maintainers = with lib.maintainers; [ GaetanLepage ]; 309 }; 310}