import ./make-test-python.nix (
  { pkgs, lib, ... }:

  let
    # Settings for both servers and agents
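    # retry_interval shortens the pause between cluster join attempts, and
    # raft_multiplier = 1 selects Consul's most aggressive Raft timing, so
    # the cluster converges quickly inside the test VMs.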
    webUi = true;
    retry_interval = "1s";
    raft_multiplier = 1;

    defaultExtraConfig = {
      inherit retry_interval;
      performance = {
        inherit raft_multiplier;
      };
    };

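    # Static addresses for the server and client nodes. The server IPs double
    # as the identities that the other nodes use for retry_join below.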
    allConsensusServerHosts = [
      "192.168.1.1"
      "192.168.1.2"
      "192.168.1.3"
    ];

    allConsensusClientHosts = [
      "192.168.2.1"
      "192.168.2.2"
    ];

    firewallSettings = {
      # See https://www.consul.io/docs/install/ports.html
      allowedTCPPorts = [
        8301
        8302
        8600
        8500
        8300
      ];
      allowedUDPPorts = [
        8301
        8302
        8600
      ];
    };

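    # Node configuration for a Consul agent running in client (non-server)
    # mode, using the static IP at the given index of allConsensusClientHosts.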
    client =
      index:
      { pkgs, ... }:
      let
        ip = builtins.elemAt allConsensusClientHosts index;
      in
      {
        environment.systemPackages = [ pkgs.consul ];

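        # mkOverride 0 gives this static address the highest priority so it
        # replaces the address the test framework assigns to eth1 by default.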
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;

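        # Consul is marked as unfree in nixpkgs (BUSL license), so it must be
        # allowed explicitly for this test to evaluate.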
        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul = {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = false;
            retry_join = allConsensusServerHosts;
            bind_addr = ip;
          };
        };
      };

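    # Node configuration for a Consul server, identified by its index into
    # allConsensusServerHosts.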
    server =
      index:
      { pkgs, ... }:
      let
        numConsensusServers = builtins.length allConsensusServerHosts;
        thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
        ip = thisConsensusServerHost; # since we already use IPs to identify servers
      in
      {
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;

        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul =
          assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
          {
            enable = true;
            inherit webUi;
            extraConfig = defaultExtraConfig // {
              server = true;
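              # Only bootstrap the Raft cluster once this many servers have joined.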
              bootstrap_expect = numConsensusServers;
              # Tell Consul that we never intend to drop below this many servers.
              # This ensures that consensus is not permanently lost after a
              # temporary loss of servers.
              # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
              autopilot.min_quorum = numConsensusServers;
              retry_join =
                # If there's only 1 node in the network, we allow self-join;
                # otherwise, a node must not try to join itself, only the other servers.
                # See https://github.com/hashicorp/consul/issues/2868
                if numConsensusServers == 1 then
                  allConsensusServerHosts
                else
                  builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
              bind_addr = ip;
            };
          };
      };
  in
  {
    name = "consul";

    nodes = {
      server1 = server 0;
      server2 = server 1;
      server3 = server 2;

      client1 = client 0;
      client2 = client 1;
    };

    testScript = ''
      servers = [server1, server2, server3]
      machines = [server1, server2, server3, client1, client2]

      for m in machines:
          m.wait_for_unit("consul.service")


      def wait_for_healthy_servers():
          # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
          # for why the `Voter` column of `list-peers` has that info.
          # TODO: The `grep true` relies on the fact that currently, in
          #       output like
          #           # consul operator raft list-peers
          #           Node     ID   Address           State     Voter  RaftProtocol
          #           server3  ...  192.168.1.3:8300  leader    true   3
          #           server2  ...  192.168.1.2:8300  follower  true   3
          #           server1  ...  192.168.1.1:8300  follower  false  3
          #       `Voter` is the only boolean column.
          #       Change this to the more reliable method to be defined by
          #       https://github.com/hashicorp/consul/issues/8118
          #       once that ticket is closed.
          for m in machines:
              m.wait_until_succeeds(
                  "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
              )


      def wait_for_all_machines_alive():
          """
          Note that Serf-"alive" does not mean "Raft"-healthy;
          see `wait_for_healthy_servers()` for that instead.
          """
          for m in machines:
              m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


      wait_for_healthy_servers()
      # Also wait for clients to be alive.
      wait_for_all_machines_alive()

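      # Sanity check: a key written via one client must be readable via the other.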
      client1.succeed("consul kv put testkey 42")
      client2.succeed("[ $(consul kv get testkey) == 42 ]")


      def rolling_restart_test(proper_rolling_procedure=True):
          """
          Tests that the cluster can tolerate the failure of any single server,
          following the recommended rolling upgrade procedure from
          https://www.consul.io/docs/upgrading#standard-upgrades.

          Optionally, `proper_rolling_procedure=False` can be given
          to wait only for each server to be back `Healthy`, not `Stable`
          in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
          https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
          """

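          # Take the servers down one at a time: cut each server's network
          # link and stop its Consul service, then bring it back before
          # moving on to the next server.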
          for server in servers:
              server.block()
              server.systemctl("stop consul")

              # Make sure the stopped peer is recognized as being down
              client1.wait_until_succeeds(
                  f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
              )

              # For each client, wait until they have connection again
              # using `kv get -recurse` before issuing commands.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions while one server is down.
              client1.succeed("consul kv put testkey 43")
              client2.succeed("[ $(consul kv get testkey) == 43 ]")
              client2.succeed("consul kv delete testkey")

              server.unblock()
              server.systemctl("start consul")

              if proper_rolling_procedure:
                  # Wait for recovery.
                  wait_for_healthy_servers()
              else:
                  # NOT proper rolling upgrade procedure, see above.
                  wait_for_all_machines_alive()

              # Wait for client connections.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions with server back up.
              client1.succeed("consul kv put testkey 44")
              client2.succeed("[ $(consul kv get testkey) == 44 ]")
              client2.succeed("consul kv delete testkey")


      def all_servers_crash_simultaneously_test():
          """
          Tests that the cluster will eventually come back after all
          servers crash simultaneously.
          """

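          # Cut the network link and stop Consul on every server at once to
          # simulate a simultaneous crash of all servers.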
          for server in servers:
              server.block()
              server.systemctl("stop --no-block consul")

          for server in servers:
              # --no-block is async, so ensure it has been stopped by now
              server.wait_until_fails("systemctl is-active --quiet consul")
              server.unblock()
              server.systemctl("start consul")

          # Wait for recovery.
          wait_for_healthy_servers()

          # Wait for client connections.
          client1.wait_until_succeeds("consul kv get -recurse")
          client2.wait_until_succeeds("consul kv get -recurse")

          # Do some consul actions with servers back up.
          client1.succeed("consul kv put testkey 44")
          client2.succeed("[ $(consul kv get testkey) == 44 ]")
          client2.succeed("consul kv delete testkey")


      # Run the tests.

      print("rolling_restart_test()")
      rolling_restart_test()

      print("all_servers_crash_simultaneously_test()")
      all_servers_crash_simultaneously_test()

      print("rolling_restart_test(proper_rolling_procedure=False)")
      rolling_restart_test(proper_rolling_procedure=False)
    '';
  }
)