import ./make-test-python.nix (
  { lib, pkgs, ... }:
  let
    slurmconfig = {
      services.slurm = {
        controlMachine = "control";
        nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
        partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
        extraConfig = ''
          AccountingStorageHost=dbd
          AccountingStorageType=accounting_storage/slurmdbd
        '';
      };
      environment.systemPackages = [ mpitest ];
      networking.firewall.enable = false;
      systemd.tmpfiles.rules = [
        "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
      ];
    };

    mpitest =
      let
        mpitestC = pkgs.writeText "mpitest.c" ''
          #include <stdio.h>
          #include <stdlib.h>
          #include <mpi.h>

          int
          main (int argc, char *argv[])
          {
            int rank, size, length;
            char name[512];

            MPI_Init (&argc, &argv);
            MPI_Comm_rank (MPI_COMM_WORLD, &rank);
            MPI_Comm_size (MPI_COMM_WORLD, &size);
            MPI_Get_processor_name (name, &length);

            if ( rank == 0 ) printf("size=%d\n", size);

            printf ("%s: hello world from process %d of %d\n", name, rank, size);

            MPI_Finalize ();

            return EXIT_SUCCESS;
          }
        '';
      in
      pkgs.runCommand "mpitest" { } ''
        mkdir -p $out/bin
        ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
      '';
  in
  {
    name = "slurm";

    meta.maintainers = [ lib.maintainers.markuskowa ];

    nodes =
      let
        computeNode =
          { ... }:
          {
            imports = [ slurmconfig ];
            # TODO slurmd port and slurmctld port should be configurations and
            # automatically allowed by the firewall.
            services.slurm = {
              client.enable = true;
            };
          };
      in
      {
        control =
          { ... }:
          {
            imports = [ slurmconfig ];
            services.slurm = {
              server.enable = true;
            };
          };

        submit =
          { ... }:
          {
            imports = [ slurmconfig ];
            services.slurm = {
              enableStools = true;
            };
          };

        dbd =
          { pkgs, ... }:
          let
            passFile = pkgs.writeText "dbdpassword" "password123";
          in
          {
            networking.firewall.enable = false;
            systemd.tmpfiles.rules = [
              "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
            ];
            services.slurm.dbdserver = {
              enable = true;
              storagePassFile = "${passFile}";
            };
            services.mysql = {
              enable = true;
              package = pkgs.mariadb;
              initialScript = pkgs.writeText "mysql-init.sql" ''
                CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
                GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
              '';
              ensureDatabases = [ "slurm_acct_db" ];
              ensureUsers = [
                {
                  ensurePermissions = {
                    "slurm_acct_db.*" = "ALL PRIVILEGES";
                  };
                  name = "slurm";
                }
              ];
              settings.mysqld = {
                # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
                innodb_buffer_pool_size = "1024M";
                innodb_log_file_size = "64M";
                innodb_lock_wait_timeout = 900;
              };
            };
          };

        node1 = computeNode;
        node2 = computeNode;
        node3 = computeNode;
      };

    testScript = ''
      start_all()

      # Make sure DBD is up after DB initialization
      with subtest("can_start_slurmdbd"):
          dbd.succeed("systemctl restart slurmdbd")
          dbd.wait_for_unit("slurmdbd.service")
          dbd.wait_for_open_port(6819)

      # There needs to be an entry for the current
      # cluster in the database before slurmctld is restarted.
      with subtest("add_account"):
          control.succeed("sacctmgr -i add cluster default")
          # check for the cluster entry
          control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")

      with subtest("can_start_slurmctld"):
          control.succeed("systemctl restart slurmctld")
          control.wait_for_unit("slurmctld.service")

      with subtest("can_start_slurmd"):
          for node in [node1, node2, node3]:
              node.succeed("systemctl restart slurmd.service")
              node.wait_for_unit("slurmd")

      # Test that the cluster works and can distribute jobs:

      with subtest("run_distributed_command"):
          # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes).
          # The output must contain the 3 different host names.
          submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

      with subtest("check_slurm_dbd"):
          # find the srun job from above in the accounting database
          control.succeed("sleep 5")
          control.succeed("sacct | grep hostname")

      with subtest("run_PMIx_mpitest"):
          submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
    '';
  }
)
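# A minimal sketch of how a test module like this is typically run from the
# root of a nixpkgs checkout; the attribute name `nixosTests.slurm` is an
# assumption based on how files under nixos/tests/ are usually registered in
# nixos/tests/all-tests.nix:
#
#   nix-build -A nixosTests.slurm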