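# NixOS VM test for the Slurm workload manager: one control node running
# slurmctld, three compute nodes running slurmd, a submit host with the
# user tools, and an accounting daemon (slurmdbd) backed by MariaDB.
#
# Assuming this file is wired up as `nixosTests.slurm` in nixpkgs, it can
# be run standalone with e.g.:
#
#   nix-build -A nixosTests.slurm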
{ lib, pkgs, ... }:
let
  # Configuration shared by all nodes that participate in the cluster.
  slurmconfig = {
    services.slurm = {
      controlMachine = "control";
      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
      extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };
    environment.systemPackages = [ mpitest ];
    networking.firewall.enable = false;
    # Every node needs the same munge key for cluster authentication.
    systemd.tmpfiles.rules = [
      "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
    ];
  };

  # A minimal MPI hello-world program, used below to check that jobs can be
  # launched across all nodes via PMIx.
  mpitest =
    let
      mpitestC = pkgs.writeText "mpitest.c" ''
        #include <stdio.h>
        #include <stdlib.h>
        #include <mpi.h>

        int
        main (int argc, char *argv[])
        {
          int rank, size, length;
          char name[512];

          MPI_Init (&argc, &argv);
          MPI_Comm_rank (MPI_COMM_WORLD, &rank);
          MPI_Comm_size (MPI_COMM_WORLD, &size);
          MPI_Get_processor_name (name, &length);

          if ( rank == 0 ) printf("size=%d\n", size);

          printf ("%s: hello world from process %d of %d\n", name, rank, size);

          MPI_Finalize ();

          return EXIT_SUCCESS;
        }
      '';
    in
    pkgs.runCommand "mpitest" { } ''
      mkdir -p $out/bin
      ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
    '';

  sbatchOutput = "/tmp/shared/sbatch.log";
  sbatchScript = pkgs.writeText "sbatchScript" ''
    #!${pkgs.runtimeShell}
    #SBATCH --nodes 1
    #SBATCH --ntasks 1
    #SBATCH --output ${sbatchOutput}

    echo "sbatch success"
  '';
in
{
  name = "slurm";

  meta.maintainers = [ lib.maintainers.markuskowa ];

  nodes =
    let
      computeNode =
        { ... }:
        {
          imports = [ slurmconfig ];
          # TODO: the slurmd and slurmctld ports should be configurable and
          # automatically allowed by the firewall.
          services.slurm = {
            client.enable = true;
          };
        };
    in
    {
      # Head node running slurmctld.
      control =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            server.enable = true;
          };
        };

      # Login node providing only the Slurm user tools.
      submit =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            enableStools = true;
          };
        };

      # Accounting daemon (slurmdbd) backed by a local MariaDB instance.
      dbd =
        { pkgs, ... }:
        let
          passFile = pkgs.writeText "dbdpassword" "password123";
        in
        {
          networking.firewall.enable = false;
          systemd.tmpfiles.rules = [
            "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
          ];
          services.slurm.dbdserver = {
            enable = true;
            storagePassFile = "${passFile}";
          };
          services.mysql = {
            enable = true;
            package = pkgs.mariadb;
            initialScript = pkgs.writeText "mysql-init.sql" ''
              CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
              GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
            '';
            ensureDatabases = [ "slurm_acct_db" ];
            ensureUsers = [
              {
                ensurePermissions = {
                  "slurm_acct_db.*" = "ALL PRIVILEGES";
                };
                name = "slurm";
              }
            ];
          };
        };

      node1 = computeNode;
      node2 = computeNode;
      node3 = computeNode;
    };

  testScript = ''
    with subtest("can_start_slurmdbd"):
        dbd.wait_for_unit("slurmdbd.service")
        dbd.wait_for_open_port(6819)

    with subtest("cluster_is_initialized"):
        control.wait_for_unit("multi-user.target")
        control.wait_for_unit("slurmctld.service")
        control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default")

    start_all()

    with subtest("can_start_slurmd"):
        for node in [node1, node2, node3]:
            node.wait_for_unit("slurmd")

    # Test that the cluster works and can distribute jobs.
    submit.wait_for_unit("multi-user.target")

    with subtest("run_distributed_command"):
        # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes).
        # The output must contain 3 different host names.
        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

    with subtest("check_slurm_dbd_job"):
        # Find the srun job from above in the accounting database.
        control.wait_until_succeeds("sacct | grep hostname")

    with subtest("run_PMIx_mpitest"):
        submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")

    with subtest("run_sbatch"):
        submit.succeed("sbatch --wait ${sbatchScript}")
        submit.succeed("grep 'sbatch success' ${sbatchOutput}")
  '';
}