{ lib, pkgs, ... }:
let
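  # Configuration shared by every machine in the cluster. Job accounting is
  # sent to the slurmdbd instance on the "dbd" machine.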
  slurmconfig = {
    services.slurm = {
      controlMachine = "control";
      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
      extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };
    environment.systemPackages = [ mpitest ];
    networking.firewall.enable = false;
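    # MUNGE authenticates the Slurm daemons to one another, so every machine
    # needs an identical key. A hard-coded, deliberately weak key is fine for
    # this self-contained test, but never for a real deployment.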
    systemd.tmpfiles.rules = [
      "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
    ];
  };

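  # A minimal MPI hello world, compiled at build time. Every rank prints its
  # processor name, and rank 0 additionally prints the world size, which the
  # test script greps for later.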
  mpitest =
    let
      mpitestC = pkgs.writeText "mpitest.c" ''
        #include <stdio.h>
        #include <stdlib.h>
        #include <mpi.h>

        int
        main (int argc, char *argv[])
        {
          int rank, size, length;
          char name[512];

          MPI_Init (&argc, &argv);
          MPI_Comm_rank (MPI_COMM_WORLD, &rank);
          MPI_Comm_size (MPI_COMM_WORLD, &size);
          MPI_Get_processor_name (name, &length);

          if ( rank == 0 ) printf("size=%d\n", size);

          printf ("%s: hello world from process %d of %d\n", name, rank, size);

          MPI_Finalize ();

          return EXIT_SUCCESS;
        }
      '';
    in
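    # Compile the test program with the MPI compiler wrapper; lib.getDev
    # selects the dev output of pkgs.mpi (falling back to the package
    # itself), which is where mpicc is found.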
    pkgs.runCommand "mpitest" { } ''
      mkdir -p $out/bin
      ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
    '';

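  # A trivial batch job for sbatch. The job itself runs on one of the
  # compute nodes while the test later greps its output file from the
  # submit machine, so the output path is assumed to be visible to both.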
  sbatchOutput = "/tmp/shared/sbatch.log";
  sbatchScript = pkgs.writeText "sbatchScript" ''
    #!${pkgs.runtimeShell}
    #SBATCH --nodes 1
    #SBATCH --ntasks 1
    #SBATCH --output ${sbatchOutput}

    echo "sbatch success"
  '';
in
{
  name = "slurm";

  meta.maintainers = [ lib.maintainers.markuskowa ];

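  # Cluster layout: "control" runs slurmctld, "dbd" runs slurmdbd backed by
  # MariaDB, "submit" carries only the client tools, and node1-3 run slurmd
  # as compute nodes.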
  nodes =
    let
      computeNode =
        { ... }:
        {
          imports = [ slurmconfig ];
          # TODO slurmd port and slurmctld port should be configurable and
          # automatically allowed by the firewall.
          services.slurm = {
            client.enable = true;
          };
        };
    in
    {
      control =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            server.enable = true;
          };
        };

      submit =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            enableStools = true;
          };
        };

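      # The accounting daemon: slurmdbd backed by a local MariaDB instance.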
      dbd =
        { pkgs, ... }:
        let
          passFile = pkgs.writeText "dbdpassword" "password123";
        in
        {
          networking.firewall.enable = false;
          systemd.tmpfiles.rules = [
            "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
          ];
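          # storagePassFile must match the password granted to the 'slurm'
          # MariaDB user below. writeText puts the file in the world-readable
          # Nix store, which is acceptable only inside a test.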
          services.slurm.dbdserver = {
            enable = true;
            storagePassFile = "${passFile}";
          };
          services.mysql = {
            enable = true;
            package = pkgs.mariadb;
            initialScript = pkgs.writeText "mysql-init.sql" ''
              CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
              GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
            '';
            ensureDatabases = [ "slurm_acct_db" ];
            ensureUsers = [
              {
                ensurePermissions = {
                  "slurm_acct_db.*" = "ALL PRIVILEGES";
                };
                name = "slurm";
              }
            ];
          };
        };

      node1 = computeNode;
      node2 = computeNode;
      node3 = computeNode;
    };

  testScript = ''
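    # dbd and control are booted implicitly by the first action invoked on
    # them; the remaining machines are only started by start_all() further
    # down, once the controller is known to be up.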
    with subtest("can_start_slurmdbd"):
        dbd.wait_for_unit("slurmdbd.service")
        dbd.wait_for_open_port(6819)

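    # Wait until the cluster (the module's clusterName defaults to
    # "default") has been registered in the accounting database.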
    with subtest("cluster_is_initialized"):
        control.wait_for_unit("multi-user.target")
        control.wait_for_unit("slurmctld.service")
        control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default")

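    # Boot the remaining machines (submit and the compute nodes) now that
    # the controller and the accounting daemon are up.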
    start_all()

    with subtest("can_start_slurmd"):
        for node in [node1, node2, node3]:
            node.wait_for_unit("slurmd")

    # Test that the cluster works and can distribute jobs.
    submit.wait_for_unit("multi-user.target")

    with subtest("run_distributed_command"):
        # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes).
        # The output must contain 3 different hostnames.
        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

    with subtest("check_slurm_dbd_job"):
        # Find the srun job from above in the accounting database.
        control.wait_until_succeeds("sacct | grep hostname")

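    # Rank 0 of mpitest prints the world size; with three ranks this must be
    # "size=3". --mpi=pmix makes srun launch the MPI ranks itself via PMIx.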
    with subtest("run_PMIx_mpitest"):
        submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")

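    # --wait makes sbatch block until the job has terminated, so the output
    # file is complete by the time it is grepped.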
    with subtest("run_sbatch"):
        submit.succeed("sbatch --wait ${sbatchScript}")
        submit.succeed("grep 'sbatch success' ${sbatchOutput}")
  '';
}