import ./make-test-python.nix ({ lib, pkgs, ... }:
let
  slurmconfig = {
    services.slurm = {
      controlMachine = "control";
      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
      extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };
    environment.systemPackages = [ mpitest ];
    networking.firewall.enable = false;
    systemd.tmpfiles.rules = [
      "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
    ];
  };

  mpitest = let
    mpitestC = pkgs.writeText "mpitest.c" ''
      #include <stdio.h>
      #include <stdlib.h>
      #include <mpi.h>

      int
      main (int argc, char *argv[])
      {
        int rank, size, length;
        char name[512];

        MPI_Init (&argc, &argv);
        MPI_Comm_rank (MPI_COMM_WORLD, &rank);
        MPI_Comm_size (MPI_COMM_WORLD, &size);
        MPI_Get_processor_name (name, &length);

        if ( rank == 0 ) printf("size=%d\n", size);

        printf ("%s: hello world from process %d of %d\n", name, rank, size);

        MPI_Finalize ();

        return EXIT_SUCCESS;
      }
    '';
  in pkgs.runCommand "mpitest" {} ''
    mkdir -p $out/bin
    ${pkgs.openmpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
  '';
in {
  name = "slurm";

  meta.maintainers = [ lib.maintainers.markuskowa ];

  nodes =
    let
      computeNode =
        { ... }:
        {
          imports = [ slurmconfig ];
          # TODO slurmd port and slurmctld port should be configurable and
          # automatically allowed by the firewall.
          services.slurm = {
            client.enable = true;
          };
        };
    in {

      control =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            server.enable = true;
          };
        };

      submit =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            enableStools = true;
          };
        };

      dbd =
        { pkgs, ... }:
        let
          passFile = pkgs.writeText "dbdpassword" "password123";
        in {
          networking.firewall.enable = false;
          systemd.tmpfiles.rules = [
            "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
          ];
          services.slurm.dbdserver = {
            enable = true;
            storagePassFile = "${passFile}";
          };
          services.mysql = {
            enable = true;
            package = pkgs.mariadb;
            initialScript = pkgs.writeText "mysql-init.sql" ''
              CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
              GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
            '';
            ensureDatabases = [ "slurm_acct_db" ];
            ensureUsers = [{
              ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
              name = "slurm";
            }];
            settings.mysqld = {
              # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
              innodb_buffer_pool_size = "1024M";
              innodb_log_file_size = "64M";
              innodb_lock_wait_timeout = 900;
            };
          };
        };

      node1 = computeNode;
      node2 = computeNode;
      node3 = computeNode;
    };

  testScript =
  ''
    start_all()

    # Make sure DBD is up after DB initialization
    with subtest("can_start_slurmdbd"):
        dbd.succeed("systemctl restart slurmdbd")
        dbd.wait_for_unit("slurmdbd.service")
        dbd.wait_for_open_port(6819)

    # there needs to be an entry for the current
    # cluster in the database before slurmctld is restarted
    with subtest("add_account"):
        control.succeed("sacctmgr -i add cluster default")
        # check for cluster entry
        control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")

    with subtest("can_start_slurmctld"):
        control.succeed("systemctl restart slurmctld")
        control.wait_for_unit("slurmctld.service")

    with subtest("can_start_slurmd"):
        for node in [node1, node2, node3]:
            node.succeed("systemctl restart slurmd.service")
            node.wait_for_unit("slurmd")

    # Test that the cluster works and can distribute jobs.

    with subtest("run_distributed_command"):
        # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes).
        # The output must contain the 3 different hostnames.
        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

    with subtest("check_slurm_dbd"):
        # find the srun job from above in the database
        control.succeed("sleep 5")
        control.succeed("sacct | grep hostname")

    with subtest("run_PMIx_mpitest"):
        submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
  '';
})