# Package expression for the `datafusion` Python module: a pyproject-based
# Python package with a Rust component (Cargo dependencies are vendored and the
# Rust build hooks from `rustPlatform` are used).
#
# Arguments are grouped by the attribute that consumes them below.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  rustPlatform,

  # nativeBuildInputs
  protoc,

  # buildInputs
  protobuf,

  # dependencies
  pyarrow,
  typing-extensions,

  # tests
  numpy,
  pytest-asyncio,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "datafusion";
  version = "49.0.0";
  pyproject = true;

  src = fetchFromGitHub {
    # Fixed store name, so the source path does not depend on the repo name.
    name = "datafusion-source";
    owner = "apache";
    # The repository was renamed from "arrow-datafusion-python"; use the
    # current name instead of relying on GitHub's redirect. The output hash is
    # content-addressed and therefore unaffected by the URL change.
    repo = "datafusion-python";
    tag = version;
    # Fetch arrow-testing and parquet-testing (tests assets)
    fetchSubmodules = true;
    hash = "sha256-U3LRZQMjL8sNa5yQmwfhw9NRGC0299TRODylzZkvFh4=";
  };

  # Vendored Cargo dependencies for the Rust part of the package.
  cargoDeps = rustPlatform.fetchCargoVendor {
    inherit pname src version;
    hash = "sha256-lCbqy6kZK+LSLvr+Odxt167ACnDap2enH/J4ILcPtOc=";
  };

  # Explicit attribute paths instead of `with rustPlatform;`: `protoc` is a
  # top-level argument, not a member of `rustPlatform`, so a surrounding `with`
  # scope would be misleading.
  nativeBuildInputs = [
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    protoc
  ];

  buildInputs = [
    protobuf
  ];

  dependencies = [
    pyarrow
    typing-extensions
  ];

  nativeCheckInputs = [
    numpy
    pytest-asyncio
    pytestCheckHook
  ];

  pythonImportsCheck = [
    "datafusion"
    "datafusion._internal"
  ];

  # Remove the in-tree Python sources before running the tests.
  # NOTE(review): presumably so that pytest imports the installed package
  # (which contains the compiled `_internal` module) rather than the source
  # checkout -- confirm.
  preCheck = ''
    rm -rf python/datafusion
  '';

  disabledTests = [
    # Exception: DataFusion error (requires internet access)
    "test_register_http_csv"
  ];

  meta = {
    description = "Extensible query execution framework";
    longDescription = ''
      DataFusion is an extensible query execution framework, written in Rust,
      that uses Apache Arrow as its in-memory format.
    '';
    homepage = "https://datafusion.apache.org/python/";
    changelog = "https://github.com/apache/datafusion-python/blob/${version}/CHANGELOG.md";
    license = [ lib.licenses.asl20 ];
    maintainers = [ lib.maintainers.cpcloud ];
  };
}