commit 13839b0022fee66a1291792c47f6bc2b71b91895 · pyrox.dev/nixpkgs

+28

nixos/tests/spark/default.nix

···

       1
       1
       +
       # NixOS VM test: a two-node standalone Spark cluster (one master, one
       # worker). It checks that the master web UI is served and that a sample
       # PySpark job submitted from the worker completes successfully.
       import ../make-test-python.nix ({...}: {
         name = "spark";

         nodes = {
           worker = { nodes, pkgs, ... }: {
             # Presumably sized to fit the 512m executor requested in testScript.
             virtualisation.memorySize = 1024;
             services.spark.worker = {
               enable = true;
               master = "master:7077";
             };
           };
           master = { config, pkgs, ... }: {
             services.spark.master = {
               enable = true;
               bind = "0.0.0.0";
             };
             # 7077 is the port the worker and spark-submit connect to;
             # 8080 is the web UI fetched by the curl check in testScript.
             networking.firewall.allowedTCPPorts = [ 22 7077 8080 ];
           };
         };

         testScript = ''
           master.wait_for_unit("spark-master.service")
           # The unit may be reported active before the master accepts
           # connections, so wait for both ports explicitly.
           master.wait_for_open_port(7077)
           master.wait_for_open_port(8080)
           worker.wait_for_unit("spark-worker.service")
           worker.copy_from_host( "${./spark_sample.py}", "/spark_sample.py" )
           assert "<title>Spark Master at spark://" in worker.succeed("curl -sSfkL http://master:8080/")
           worker.succeed("spark-submit --master spark://master:7077 --executor-memory 512m --executor-cores 1 /spark_sample.py")
         '';
       })

+40

nixos/tests/spark/spark_sample.py

···

       1
       1
       +
       from pyspark.sql import Row, SparkSession

     

       2
       2
       +
       from pyspark.sql import functions as F

     

       3
       3
       +
       from pyspark.sql.functions import udf

     

       4
       4
       +
       from pyspark.sql.types import *

     

       5
       5
       +
       from pyspark.sql.functions import explode

     

       6
       6
       +
       

     

       7
       7
       +
       def explode_col(weight):
           """Split ``weight`` into chunks of 10.0 plus any non-zero remainder.

           For example, ``25`` becomes ``[10.0, 10.0, 5.0]`` and ``20`` becomes
           ``[10.0, 10.0]``.

           Args:
               weight: A number, or ``None``.

           Returns:
               A list of floats: ``weight // 10`` entries of ``10.0`` followed by
               ``weight % 10`` when that remainder is non-zero. ``None`` is
               returned unchanged so a missing input yields a missing result
               instead of raising ``TypeError``.

           Negative weights get no special handling: the floor-division count
           is negative, so no 10.0 chunks are emitted and only the (positive)
           modulo remainder is returned.
           """
           if weight is None:
               return None
           full_chunks, remainder = divmod(weight, 10)
           pieces = [10.0] * int(full_chunks)
           if remainder != 0:
               # Coerce to float so the list is homogeneous even for int input;
               # the caller registers this function with an array-of-float
               # return type.
               pieces.append(float(remainder))
           return pieces

     

       9
       9
       +
       

     

       10
       10
       +
       # Sample job: build a tiny DataFrame, rescale its bias weights so they sum
       # to a fixed constant, split every weight into chunks of at most 10 via a
       # Python UDF, explode those chunks into rows, and verify the row count.
       spark = SparkSession.builder.getOrCreate()

       dataSchema = [
           StructField("feature_1", FloatType()),
           StructField("feature_2", FloatType()),
           StructField("bias_weight", FloatType()),
       ]

       data = [
           Row(0.1, 0.2, 10.32),
           Row(0.32, 1.43, 12.8),
           Row(1.28, 1.12, 0.23),
       ]

       df = spark.createDataFrame(spark.sparkContext.parallelize(data), StructType(dataSchema))

       # Rescale bias_weight so that the column sums to normalizing_constant.
       normalizing_constant = 100
       sum_bias_weight = df.select(F.sum('bias_weight')).collect()[0][0]
       normalizing_factor = normalizing_constant / sum_bias_weight
       df = df.withColumn('normalized_bias_weight', df.bias_weight * normalizing_factor)
       df = df.drop('bias_weight')
       df = df.withColumnRenamed('normalized_bias_weight', 'bias_weight')

       # explode_col is passed directly; wrapping it in a lambda adds nothing.
       my_udf = udf(explode_col, ArrayType(FloatType()))
       df1 = df.withColumn('explode_val', my_udf(df.bias_weight))
       # One output row per chunk produced by the UDF.
       df1 = df1.withColumn("explode_val_1", explode(df1.explode_val)).drop("explode_val")
       df1 = df1.drop('bias_weight').withColumnRenamed('explode_val_1', 'bias_weight')

       df1.show()

       # The normalized weights are roughly 44.2, 54.8 and 0.99, which split into
       # 5 + 6 + 1 chunks. Raise explicitly so the check cannot be stripped by
       # `python -O` and a failure reports the count that was actually seen.
       expected_rows = 12
       actual_rows = df1.count()
       if actual_rows != expected_rows:
           raise AssertionError(
               "expected {} exploded rows, got {}".format(expected_rows, actual_rows)
           )