import ./make-test-python.nix (
  { lib, pkgs, ... }:
  let
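    # Configuration shared by every node in the cluster: the control machine,
    # the list of compute nodes and the default partition, accounting routed
    # to slurmdbd on the "dbd" host, and a pre-generated (deliberately weak)
    # munge key used for authentication between the nodes.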
    slurmconfig = {
      services.slurm = {
        controlMachine = "control";
        nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
        partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
        extraConfig = ''
          AccountingStorageHost=dbd
          AccountingStorageType=accounting_storage/slurmdbd
        '';
      };
      environment.systemPackages = [ mpitest ];
      networking.firewall.enable = false;
      systemd.tmpfiles.rules = [
        "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
      ];
    };

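    # A minimal MPI "hello world" program compiled with mpicc from pkgs.mpi.
    # Rank 0 prints the communicator size, which the test script greps for to
    # verify that a PMIx-launched job really ran on all three compute nodes.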
    mpitest =
      let
        mpitestC = pkgs.writeText "mpitest.c" ''
          #include <stdio.h>
          #include <stdlib.h>
          #include <mpi.h>

          int
          main (int argc, char *argv[])
          {
            int rank, size, length;
            char name[512];

            MPI_Init (&argc, &argv);
            MPI_Comm_rank (MPI_COMM_WORLD, &rank);
            MPI_Comm_size (MPI_COMM_WORLD, &size);
            MPI_Get_processor_name (name, &length);

            if ( rank == 0 ) printf("size=%d\n", size);

            printf ("%s: hello world from process %d of %d\n", name, rank, size);

            MPI_Finalize ();

            return EXIT_SUCCESS;
          }
        '';
      in
      pkgs.runCommand "mpitest" { } ''
        mkdir -p $out/bin
        ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
      '';
  in
  {
    name = "slurm";

    meta.maintainers = [ lib.maintainers.markuskowa ];

    nodes =
      let
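        # Compute nodes only run the slurmd client daemon.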
        computeNode =
          { ... }:
          {
            imports = [ slurmconfig ];
            # TODO: the slurmd and slurmctld ports should be configurable and
            # automatically allowed through the firewall.
            services.slurm = {
              client.enable = true;
            };
          };
      in
      {
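        # The control machine runs slurmctld.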
        control =
          { ... }:
          {
            imports = [ slurmconfig ];
            services.slurm = {
              server.enable = true;
            };
          };

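        # The submit host only gets the user-facing Slurm tools (srun, sacct, ...);
        # no daemons are enabled here.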
        submit =
          { ... }:
          {
            imports = [ slurmconfig ];
            services.slurm = {
              enableStools = true;
            };
          };

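        # Accounting node: runs slurmdbd backed by a local MariaDB database.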
        dbd =
          { pkgs, ... }:
          let
            passFile = pkgs.writeText "dbdpassword" "password123";
          in
          {
            networking.firewall.enable = false;
            systemd.tmpfiles.rules = [
              "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
            ];
            services.slurm.dbdserver = {
              enable = true;
              storagePassFile = "${passFile}";
            };
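            # slurmdbd stores its accounting records in a local MariaDB
            # instance; the slurm user needs full access to slurm_acct_db.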
            services.mysql = {
              enable = true;
              package = pkgs.mariadb;
              initialScript = pkgs.writeText "mysql-init.sql" ''
                CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
                GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
              '';
              ensureDatabases = [ "slurm_acct_db" ];
              ensureUsers = [
                {
                  ensurePermissions = {
                    "slurm_acct_db.*" = "ALL PRIVILEGES";
                  };
                  name = "slurm";
                }
              ];
              settings.mysqld = {
                # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
                innodb_buffer_pool_size = "1024M";
                innodb_log_file_size = "64M";
                innodb_lock_wait_timeout = 900;
              };
            };
          };

        node1 = computeNode;
        node2 = computeNode;
        node3 = computeNode;
      };

    testScript = ''
      start_all()

      # Make sure the DBD is up after DB initialization
      with subtest("can_start_slurmdbd"):
          dbd.succeed("systemctl restart slurmdbd")
          dbd.wait_for_unit("slurmdbd.service")
          dbd.wait_for_open_port(6819)

      # There needs to be an entry for the current
      # cluster in the database before slurmctld is restarted.
      with subtest("add_account"):
          control.succeed("sacctmgr -i add cluster default")
          # check for the cluster entry
          control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")

      with subtest("can_start_slurmctld"):
          control.succeed("systemctl restart slurmctld")
          control.wait_for_unit("slurmctld.service")

      with subtest("can_start_slurmd"):
          for node in [node1, node2, node3]:
              node.succeed("systemctl restart slurmd.service")
              node.wait_for_unit("slurmd")

      # Test that the cluster works and can distribute jobs.

      with subtest("run_distributed_command"):
          # Run `hostname` on 3 nodes of the partition (so on all 3 nodes).
          # The output must contain the 3 different hostnames.
          submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

      with subtest("check_slurm_dbd"):
          # find the srun job from above in the database
          control.succeed("sleep 5")
          control.succeed("sacct | grep hostname")

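      # Run the MPI test program across the three nodes via PMIx; rank 0
      # prints the communicator size, which must be 3.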
      with subtest("run_PMIx_mpitest"):
          submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
    '';
  }
)