{
  lib,
  buildPythonPackage,
  fetchPypi,
  numpy,
  pandas,
  py4j,
  pyarrow,
  pythonOlder,
}:

buildPythonPackage rec {
  pname = "pyspark";
  version = "3.5.5";
  format = "setuptools";

  disabled = pythonOlder "3.7";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-bv/Jzpjt8jH01oP9FPcnBim/hFjGKNaiYg3tS7NPPLk=";
  };

  # pypandoc is broken with pandoc2, so we just lose docs.
  postPatch = ''
    sed -i "s/'pypandoc'//" setup.py

    substituteInPlace setup.py \
      --replace py4j== 'py4j>='
  '';

  postFixup = ''
    # find_python_home.py has been wrapped as a shell script
    substituteInPlace $out/bin/find-spark-home \
      --replace 'export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")' \
        'export SPARK_HOME=$("$FIND_SPARK_HOME_PYTHON_SCRIPT")'
    # patch PYTHONPATH in pyspark so that it properly looks at SPARK_HOME
    substituteInPlace $out/bin/pyspark \
      --replace 'export PYTHONPATH="''${SPARK_HOME}/python/:$PYTHONPATH"' \
        'export PYTHONPATH="''${SPARK_HOME}/..:''${SPARK_HOME}/python/:$PYTHONPATH"'
  '';

  propagatedBuildInputs = [ py4j ];

  optional-dependencies = {
    ml = [ numpy ];
    mllib = [ numpy ];
    sql = [
      numpy
      pandas
      pyarrow
    ];
  };

  # Tests assume running spark instance
  doCheck = false;

  pythonImportsCheck = [ "pyspark" ];

  meta = with lib; {
    description = "Python bindings for Apache Spark";
    homepage = "https://github.com/apache/spark/tree/master/python";
    sourceProvenance = with sourceTypes; [
      fromSource
      binaryBytecode
    ];
    license = licenses.asl20;
    maintainers = with maintainers; [ shlevy ];
  };
}
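# A minimal usage sketch (not part of the packaged file): assuming this
# derivation is exposed as python3Packages.pyspark and that buildPythonPackage
# re-exports optional-dependencies via passthru (its usual behaviour), the SQL
# extras declared above could be pulled into a Python environment like so:
#
#   python3.withPackages (ps: [ ps.pyspark ] ++ ps.pyspark.optional-dependencies.sql)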