{ lib, ... }:

let
  # Settings for both servers and agents
  webUi = true;
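  # Presumably chosen so the test cluster converges quickly: retry failed
  # joins every second and use the most aggressive Raft timing (multiplier 1).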
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
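    # 8300: server RPC, 8301: Serf LAN, 8302: Serf WAN,
    # 8500: HTTP API (and web UI), 8600: DNS.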
    allowedTCPPorts = [
      8301
      8302
      8600
      8500
      8300
    ];
    allowedUDPPorts = [
      8301
      8302
      8600
    ];
  };

  client =
    index:
    { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
    {
      environment.systemPackages = [ pkgs.consul ];

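      # mkOverride 0 is the highest priority, so this address replaces the
      # default one the test framework assigns to eth1.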
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        {
          address = ip;
          prefixLength = 16;
        }
      ];
      networking.firewall = firewallSettings;

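      # Consul is marked unfree in nixpkgs (BUSL license), so it has to be
      # allowed explicitly.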
      nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

      services.consul = {
        enable = true;
        inherit webUi;
        extraConfig = defaultExtraConfig // {
          server = false;
          retry_join = allConsensusServerHosts;
          bind_addr = ip;
        };
      };
    };

  server =
    index:
    { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
    {
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        {
          address = ip;
          prefixLength = 16;
        }
      ];
      networking.firewall = firewallSettings;

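      # As for the clients: Consul is unfree in nixpkgs, so allow it explicitly.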
      nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

      services.consul =
        assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
        {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = true;
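            # Only bootstrap (elect the first leader) once all expected
            # servers have joined.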
            bootstrap_expect = numConsensusServers;
            # Tell Consul that we never intend to drop below this many servers.
            # This ensures that consensus is not permanently lost after a
            # temporary loss of servers.
            # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
            autopilot.min_quorum = numConsensusServers;
            retry_join =
              # If there's only one server in the network, allow self-join;
              # otherwise a server must not try to join itself and joins
              # only the other servers.
              # See https://github.com/hashicorp/consul/issues/2868
              if numConsensusServers == 1 then
                allConsensusServerHosts
              else
                builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
            bind_addr = ip;
          };
        };
    };
in
{
  name = "consul";

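  # The nodes set per-node nixpkgs config (allowUnfreePredicate above), which
  # is only possible when the test framework's read-only pkgs is disabled.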
  node.pkgsReadOnly = false;

  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
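    # The test driver exposes one machine object per node defined above.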
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` relies on the fact that `Voter` is currently
        #       the only boolean column in output like:
        #           # consul operator raft list-peers
        #           Node     ID   Address           State     Voter  RaftProtocol
        #           server3  ...  192.168.1.3:8300  leader    true   3
        #           server2  ...  192.168.1.2:8300  follower  true   3
        #           server1  ...  192.168.1.1:8300  follower  false  3
        #       Change this to the more reliable way that
        #       https://github.com/hashicorp/consul/issues/8118 will define,
        #       once that ticket is closed.
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf "alive" does not mean Raft-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")


    def rolling_restart_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate the failure of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """

        for server in servers:
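            # Block the server's network before stopping Consul, presumably so
            # the outage looks like an unclean failure to the rest of the cluster.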
            server.block()
            server.systemctl("stop consul")

            # Make sure the stopped peer is recognized as being down
            client1.wait_until_succeeds(
                f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
            )

            # Wait until each client has a connection again
            # (via `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            server.unblock()
            server.systemctl("start consul")

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure, see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """

        for server in servers:
            server.block()
            server.systemctl("stop --no-block consul")

        for server in servers:
            # --no-block is async, so ensure it has been stopped by now
            server.wait_until_fails("systemctl is-active --quiet consul")
            server.unblock()
            server.systemctl("start consul")

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with the servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    # Run the tests.

    print("rolling_restart_test()")
    rolling_restart_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_restart_test(proper_rolling_procedure=False)")
    rolling_restart_test(proper_rolling_procedure=False)
  '';
}