1{
2 lib,
3 stdenv,
4 buildPythonPackage,
5 fetchFromGitHub,
6
7 # build-system
8 cython,
9 poetry-core,
10 setuptools,
11
12 # dependencies
13 cachetools,
14 click,
15 fsspec,
16 google-auth,
17 mmh3,
18 pydantic,
19 pyparsing,
20 pyroaring,
21 ray,
22 requests,
23 rich,
24 sortedcontainers,
25 strictyaml,
26 tenacity,
27 zstandard,
28
29 # optional-dependencies
30 adlfs,
31 google-cloud-bigquery,
32 # bodo,
33 # daft,
34 duckdb,
35 pyarrow,
36 pyiceberg-core,
37 boto3,
38 huggingface-hub,
39 gcsfs,
40 thrift,
41 kerberos,
42 # thrift-sasl,
43 pandas,
44 # pyiceberg-core,
45 s3fs,
46 python-snappy,
47 psycopg2-binary,
48 sqlalchemy,
49
50 # tests
51 azure-core,
52 azure-storage-blob,
53 datafusion,
54 fastavro,
55 moto,
56 pyspark,
57 pytestCheckHook,
58 pytest-lazy-fixture,
59 pytest-mock,
60 pytest-timeout,
61 requests-mock,
62 pythonAtLeast,
63}:
64
65buildPythonPackage rec {
66 pname = "iceberg-python";
67 version = "0.10.0";
68 pyproject = true;
69
70 src = fetchFromGitHub {
71 owner = "apache";
72 repo = "iceberg-python";
73 tag = "pyiceberg-${version}";
74 hash = "sha256-uR8nmKVjYjiArcNaf/Af2kGh14p59VV9g2mKPKmiJnc=";
75 };
76
77 patches = [
78 # Build script fails to build the cython extension on python 3.11 (no issues with python 3.12):
79 # distutils.errors.DistutilsSetupError: each element of 'ext_modules' option must be an Extension instance or 2-tuple
80 # This error vanishes if Cython and setuptools imports are swapped
81 # https://stackoverflow.com/a/53356077/11196710
82 ./reorder-imports-in-build-script.patch
83 ];
84
85 build-system = [
86 cython
87 poetry-core
88 setuptools
89 ];
90
91 # Prevents the cython build to fail silently
92 env.CIBUILDWHEEL = "1";
93
94 pythonRelaxDeps = [
95 "cachetools"
96 "rich"
97 ];
98
99 dependencies = [
100 cachetools
101 click
102 fsspec
103 google-auth
104 mmh3
105 pydantic
106 pyparsing
107 pyroaring
108 ray
109 requests
110 rich
111 sortedcontainers
112 strictyaml
113 tenacity
114 zstandard
115 ];
116
117 optional-dependencies = {
118 adlfs = [
119 adlfs
120 ];
121 bigquery = [
122 google-cloud-bigquery
123 ];
124 bodo = [
125 # bodo
126 ];
127 daft = [
128 # daft
129 ];
130 duckdb = [
131 duckdb
132 pyarrow
133 ];
134 dynamodb = [
135 boto3
136 ];
137 hf = [
138 huggingface-hub
139 ];
140 gcsfs = [
141 gcsfs
142 ];
143 glue = [
144 boto3
145 ];
146 hive = [
147 thrift
148 ];
149 hive-kerberos = [
150 kerberos
151 thrift
152 # thrift-sasl
153 ];
154 pandas = [
155 pandas
156 pyarrow
157 ];
158 pyarrow = [
159 pyarrow
160 pyiceberg-core
161 ];
162 ray = [
163 pandas
164 pyarrow
165 ray
166 ];
167 s3fs = [
168 s3fs
169 ];
170 snappy = [
171 python-snappy
172 ];
173 sql-postgres = [
174 psycopg2-binary
175 sqlalchemy
176 ];
177 sql-sqlite = [
178 sqlalchemy
179 ];
180 zstandard = [
181 zstandard
182 ];
183 };
184
185 pythonImportsCheck = [
186 "pyiceberg"
187 # Compiled avro decoder (cython)
188 "pyiceberg.avro.decoder_fast"
189 ];
190
191 nativeCheckInputs = [
192 azure-core
193 azure-storage-blob
194 boto3
195 datafusion
196 fastavro
197 moto
198 pyspark
199 pytest-lazy-fixture
200 pytest-mock
201 pytest-timeout
202 pytestCheckHook
203 requests-mock
204 ]
205 ++ optional-dependencies.bigquery
206 ++ optional-dependencies.hive
207 ++ optional-dependencies.pandas
208 ++ optional-dependencies.pyarrow
209 ++ optional-dependencies.s3fs
210 ++ optional-dependencies.sql-sqlite
211 ++ moto.optional-dependencies.server;
212
213 pytestFlags = [
214 # ResourceWarning: unclosed database in <sqlite3.Connection object at 0x7ffe7c6f4220>
215 "-Wignore::pytest.PytestUnraisableExceptionWarning"
216 ];
217
218 disabledTestPaths = [
219 # Several errors:
220 # - FileNotFoundError: [Errno 2] No such file or directory: '/nix/store/...-python3.12-pyspark-3.5.3/lib/python3.12/site-packages/pyspark/./bin/spark-submit'
221 # - requests.exceptions.ConnectionError: HTTPConnectionPool(host='localhost', port=8181): Max retries exceeded with url: /v1/config
222 # - thrift.transport.TTransport.TTransportException: Could not connect to any of [('127.0.0.1', 9083)]
223 "tests/integration"
224 ];
225
226 disabledTests = [
227 # KeyError: 'authorization'
228 "test_token_200"
229 "test_token_200_without_optional_fields"
230 "test_token_with_default_scope"
231 "test_token_with_optional_oauth_params"
232 "test_token_with_custom_scope"
233
234 # AttributeError: 'SessionContext' object has no attribute 'register_table_provider'
235 "test_datafusion_register_pyiceberg_tabl"
236
237 # ModuleNotFoundError: No module named 'puresasl'
238 "test_create_hive_client_with_kerberos"
239 "test_create_hive_client_with_kerberos_using_context_manager"
240
241 # botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL
242 "test_checking_if_a_file_exists"
243 "test_closing_a_file"
244 "test_fsspec_file_tell"
245 "test_fsspec_getting_length_of_file"
246 "test_fsspec_pickle_round_trip_s3"
247 "test_fsspec_raise_on_opening_file_not_found"
248 "test_fsspec_read_specified_bytes_for_file"
249 "test_fsspec_write_and_read_file"
250 "test_writing_avro_file"
251
252 # Require unpackaged gcsfs
253 "test_fsspec_converting_an_outputfile_to_an_inputfile_gcs"
254 "test_fsspec_new_input_file_gcs"
255 "test_fsspec_new_output_file_gcs"
256 "test_fsspec_pickle_roundtrip_gcs"
257
258 # Timeout (network access)
259 "test_config_200"
260 "test_fsspec_converting_an_outputfile_to_an_inputfile_adls"
261 "test_fsspec_new_abfss_output_file_adls"
262 "test_fsspec_new_input_file_adls"
263 "test_fsspec_pickle_round_trip_aldfs"
264 "test_partitioned_write"
265 "test_token_200_w_oauth2_server_uri"
266
267 # azure.core.exceptions.ServiceRequestError (network access)
268 "test_converting_an_outputfile_to_an_inputfile_adls"
269 "test_file_tell_adls"
270 "test_getting_length_of_file_adls"
271 "test_new_input_file_adls"
272 "test_new_output_file_adls"
273 "test_raise_on_opening_file_not_found_adls"
274 "test_read_specified_bytes_for_file_adls"
275 "test_write_and_read_file_adls"
276
277 # Hangs forever (from tests/io/test_pyarrow.py)
278 "test_getting_length_of_file_gcs"
279 ]
280 ++ lib.optionals stdenv.hostPlatform.isDarwin [
281 # ImportError: The pyarrow installation is not built with support for 'GcsFileSystem'
282 "test_converting_an_outputfile_to_an_inputfile_gcs"
283 "test_create_table_with_database_location"
284 "test_drop_table_with_database_location"
285 "test_new_input_file_gcs"
286 "test_new_output_file_gc"
287
288 # PermissionError: [Errno 13] Failed to open local file
289 # '/tmp/iceberg/warehouse/default.db/test_projection_partitions/metadata/00000-6c1c61a1-495f-45d3-903d-a2643431be91.metadata.json'
290 "test_identity_transform_column_projection"
291 "test_identity_transform_columns_projection"
292 "test_in_memory_catalog_context_manager"
293 "test_inspect_partition_for_nested_field"
294 ]
295 ++ lib.optionals (pythonAtLeast "3.13") [
296 # AssertionError:
297 # assert "Incompatible with StructProtocol: <class 'str'>" in "Unable to initialize struct: <class 'str'>"
298 "test_read_not_struct_type"
299 ];
300
301 __darwinAllowLocalNetworking = true;
302
303 meta = {
304 description = "Python library for programmatic access to Apache Iceberg";
305 homepage = "https://github.com/apache/iceberg-python";
306 changelog = "https://github.com/apache/iceberg-python/releases/tag/pyiceberg-${version}";
307 license = lib.licenses.asl20;
308 maintainers = with lib.maintainers; [ GaetanLepage ];
309 };
310}