import ./make-test-python.nix ({ lib, pkgs, ... }:
let
  # Configuration shared by every member of the slurm cluster
  # (controller, submit host and the compute nodes).
  slurmconfig = {
    networking.firewall.enable = false;

    services.slurm = {
      controlMachine = "control";
      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
      # Send accounting data to the slurmdbd instance on the "dbd" machine.
      extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };

    environment.systemPackages = [ mpitest ];

    # Every cluster member needs the same munge key for authentication
    # (deliberately weak -- this is only a test fixture).
    systemd.tmpfiles.rules = [
      "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
    ];
  };
19
20 mpitest = let
21 mpitestC = pkgs.writeText "mpitest.c" ''
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <mpi.h>
25
26 int
27 main (int argc, char *argv[])
28 {
29 int rank, size, length;
30 char name[512];
31
32 MPI_Init (&argc, &argv);
33 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
34 MPI_Comm_size (MPI_COMM_WORLD, &size);
35 MPI_Get_processor_name (name, &length);
36
37 if ( rank == 0 ) printf("size=%d\n", size);
38
39 printf ("%s: hello world from process %d of %d\n", name, rank, size);
40
41 MPI_Finalize ();
42
43 return EXIT_SUCCESS;
44 }
45 '';
46 in pkgs.runCommand "mpitest" {} ''
47 mkdir -p $out/bin
48 ${pkgs.openmpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
49 '';
50in {
51 name = "slurm";
52
53 meta.maintainers = [ lib.maintainers.markuskowa ];
54
55 nodes =
56 let
57 computeNode =
58 { ...}:
59 {
60 imports = [ slurmconfig ];
61 # TODO slurmd port and slurmctld port should be configurations and
62 # automatically allowed by the firewall.
63 services.slurm = {
64 client.enable = true;
65 };
66 };
67 in {
68
69 control =
70 { ...}:
71 {
72 imports = [ slurmconfig ];
73 services.slurm = {
74 server.enable = true;
75 };
76 };
77
78 submit =
79 { ...}:
80 {
81 imports = [ slurmconfig ];
82 services.slurm = {
83 enableStools = true;
84 };
85 };
86
87 dbd =
88 { pkgs, ... } :
89 let
90 passFile = pkgs.writeText "dbdpassword" "password123";
91 in {
92 networking.firewall.enable = false;
93 systemd.tmpfiles.rules = [
94 "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
95 ];
96 services.slurm.dbdserver = {
97 enable = true;
98 storagePassFile = "${passFile}";
99 };
100 services.mysql = {
101 enable = true;
102 package = pkgs.mariadb;
103 initialScript = pkgs.writeText "mysql-init.sql" ''
104 CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
105 GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
106 '';
107 ensureDatabases = [ "slurm_acct_db" ];
108 ensureUsers = [{
109 ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
110 name = "slurm";
111 }];
112 settings.mysqld = {
113 # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
114 innodb_buffer_pool_size="1024M";
115 innodb_log_file_size="64M";
116 innodb_lock_wait_timeout=900;
117 };
118 };
119 };
120
121 node1 = computeNode;
122 node2 = computeNode;
123 node3 = computeNode;
124 };
125
126
127 testScript =
128 ''
129 start_all()
130
131 # Make sure DBD is up after DB initialzation
132 with subtest("can_start_slurmdbd"):
133 dbd.succeed("systemctl restart slurmdbd")
134 dbd.wait_for_unit("slurmdbd.service")
135 dbd.wait_for_open_port(6819)
136
137 # there needs to be an entry for the current
138 # cluster in the database before slurmctld is restarted
139 with subtest("add_account"):
140 control.succeed("sacctmgr -i add cluster default")
141 # check for cluster entry
142 control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")
143
144 with subtest("can_start_slurmctld"):
145 control.succeed("systemctl restart slurmctld")
146 control.wait_for_unit("slurmctld.service")
147
148 with subtest("can_start_slurmd"):
149 for node in [node1, node2, node3]:
150 node.succeed("systemctl restart slurmd.service")
151 node.wait_for_unit("slurmd")
152
153 # Test that the cluster works and can distribute jobs;
154
155 with subtest("run_distributed_command"):
156 # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
157 # The output must contain the 3 different names
158 submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")
159
160 with subtest("check_slurm_dbd"):
161 # find the srun job from above in the database
162 control.succeed("sleep 5")
163 control.succeed("sacct | grep hostname")
164
165 with subtest("run_PMIx_mpitest"):
166 submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
167 '';
168})