{
  lib,
  buildPythonPackage,
  fetchPypi,
  numpy,
  pandas,
  py4j,
  pyarrow,
  pythonOlder,
}:

buildPythonPackage rec {
  pname = "pyspark";
  version = "3.5.5";
  format = "setuptools";

  # pyspark 3.5.x requires Python 3.8 or newer
  disabled = pythonOlder "3.8";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-bv/Jzpjt8jH01oP9FPcnBim/hFjGKNaiYg3tS7NPPLk=";
  };

  # pypandoc is broken with pandoc2, so we just lose docs.
  postPatch = ''
    sed -i "s/'pypandoc'//" setup.py

    substituteInPlace setup.py \
      --replace py4j== 'py4j>='
  '';

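  # Patch the launcher scripts in $out/bin: find-spark-home must not run the
  # (now shell-wrapped) helper through a Python interpreter, and pyspark's
  # PYTHONPATH needs the directory above SPARK_HOME as well.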
  postFixup = ''
    # find_spark_home.py has been wrapped as a shell script
    substituteInPlace $out/bin/find-spark-home \
      --replace 'export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")' \
        'export SPARK_HOME=$("$FIND_SPARK_HOME_PYTHON_SCRIPT")'
    # patch PYTHONPATH in pyspark so that it properly looks at SPARK_HOME
    substituteInPlace $out/bin/pyspark \
      --replace 'export PYTHONPATH="''${SPARK_HOME}/python/:$PYTHONPATH"' \
        'export PYTHONPATH="''${SPARK_HOME}/..:''${SPARK_HOME}/python/:$PYTHONPATH"'
  '';

  propagatedBuildInputs = [ py4j ];

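  # Optional feature sets, matching the ml, mllib and sql extras that pyspark
  # declares in its setup.py.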
  optional-dependencies = {
    ml = [ numpy ];
    mllib = [ numpy ];
    sql = [
      numpy
      pandas
      pyarrow
    ];
  };

  # Tests assume a running Spark instance
  doCheck = false;

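  # The import check below works without a running Spark cluster.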
  pythonImportsCheck = [ "pyspark" ];

  meta = with lib; {
    description = "Python bindings for Apache Spark";
    homepage = "https://github.com/apache/spark/tree/master/python";
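    # The PyPI sdist ships prebuilt Spark jars alongside the Python sources,
    # hence binaryBytecode in addition to fromSource.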
    sourceProvenance = with sourceTypes; [
      fromSource
      binaryBytecode
    ];
    license = licenses.asl20;
    maintainers = with maintainers; [ shlevy ];
  };
}