# NixOS VM test for a Consul cluster: 3 servers + 2 client agents.
# Exercises Raft consensus health, KV replication, rolling restarts,
# and recovery from a simultaneous crash of all servers.
import ./make-test-python.nix ({pkgs, lib, ...}:

let
  # Settings for both servers and agents
  webUi = true;
  # Aggressive retry/raft settings to speed up leader election and
  # re-join during the test (not production values).
  retry_interval = "1s";
  raft_multiplier = 1;

  # Consul `extraConfig` shared by every node (servers and clients).
  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  # Static IPs; for servers these double as the node's identity
  # (see the `server` builder below).
  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  # Ports Consul needs: Serf LAN/WAN (8301/8302), DNS (8600),
  # HTTP API (8500), server RPC (8300).
  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
    allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };
32
33 client = index: { pkgs, ... }:
34 let
35 ip = builtins.elemAt allConsensusClientHosts index;
36 in
37 {
38 environment.systemPackages = [ pkgs.consul ];
39
40 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
41 { address = ip; prefixLength = 16; }
42 ];
43 networking.firewall = firewallSettings;
44
45 services.consul = {
46 enable = true;
47 inherit webUi;
48 extraConfig = defaultExtraConfig // {
49 server = false;
50 retry_join = allConsensusServerHosts;
51 bind_addr = ip;
52 };
53 };
54 };
55
56 server = index: { pkgs, ... }:
57 let
58 numConsensusServers = builtins.length allConsensusServerHosts;
59 thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
60 ip = thisConsensusServerHost; # since we already use IPs to identify servers
61 in
62 {
63 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
64 { address = ip; prefixLength = 16; }
65 ];
66 networking.firewall = firewallSettings;
67
68 services.consul =
69 assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
70 {
71 enable = true;
72 inherit webUi;
73 extraConfig = defaultExtraConfig // {
74 server = true;
75 bootstrap_expect = numConsensusServers;
76 # Tell Consul that we never intend to drop below this many servers.
77 # Ensures to not permanently lose consensus after temporary loss.
78 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
79 autopilot.min_quorum = numConsensusServers;
80 retry_join =
81 # If there's only 1 node in the network, we allow self-join;
82 # otherwise, the node must not try to join itself, and join only the other servers.
83 # See https://github.com/hashicorp/consul/issues/2868
84 if numConsensusServers == 1
85 then allConsensusServerHosts
86 else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
87 bind_addr = ip;
88 };
89 };
90 };
91in {
92 name = "consul";
93
  # Three servers (quorum size 2) and two client agents; the index argument
  # selects each machine's static IP from the lists above.
  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };
102
103 testScript = ''
104 servers = [server1, server2, server3]
105 machines = [server1, server2, server3, client1, client2]
106
107 for m in machines:
108 m.wait_for_unit("consul.service")
109
110
111 def wait_for_healthy_servers():
112 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
113 # for why the `Voter` column of `list-peers` has that info.
114 # TODO: The `grep true` relies on the fact that currently in
115 # the output like
116 # # consul operator raft list-peers
117 # Node ID Address State Voter RaftProtocol
118 # server3 ... 192.168.1.3:8300 leader true 3
119 # server2 ... 192.168.1.2:8300 follower true 3
120 # server1 ... 192.168.1.1:8300 follower false 3
121 # `Voter`is the only boolean column.
122 # Change this to the more reliable way to be defined by
123 # https://github.com/hashicorp/consul/issues/8118
124 # once that ticket is closed.
125 for m in machines:
126 m.wait_until_succeeds(
127 "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
128 )
129
130
131 def wait_for_all_machines_alive():
132 """
133 Note that Serf-"alive" does not mean "Raft"-healthy;
134 see `wait_for_healthy_servers()` for that instead.
135 """
136 for m in machines:
137 m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
138
139
140 wait_for_healthy_servers()
141 # Also wait for clients to be alive.
142 wait_for_all_machines_alive()
143
144 client1.succeed("consul kv put testkey 42")
145 client2.succeed("[ $(consul kv get testkey) == 42 ]")
146
147
148 def rolling_restart_test(proper_rolling_procedure=True):
149 """
150 Tests that the cluster can tolearate failures of any single server,
151 following the recommended rolling upgrade procedure from
152 https://www.consul.io/docs/upgrading#standard-upgrades.
153
154 Optionally, `proper_rolling_procedure=False` can be given
155 to wait only for each server to be back `Healthy`, not `Stable`
156 in the Raft consensus, see Consul setting `ServerStabilizationTime` and
157 https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
158 """
159
160 for server in servers:
161 server.block()
162 server.systemctl("stop consul")
163
164 # Make sure the stopped peer is recognized as being down
165 client1.wait_until_succeeds(
166 f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
167 )
168
169 # For each client, wait until they have connection again
170 # using `kv get -recurse` before issuing commands.
171 client1.wait_until_succeeds("consul kv get -recurse")
172 client2.wait_until_succeeds("consul kv get -recurse")
173
174 # Do some consul actions while one server is down.
175 client1.succeed("consul kv put testkey 43")
176 client2.succeed("[ $(consul kv get testkey) == 43 ]")
177 client2.succeed("consul kv delete testkey")
178
179 server.unblock()
180 server.systemctl("start consul")
181
182 if proper_rolling_procedure:
183 # Wait for recovery.
184 wait_for_healthy_servers()
185 else:
186 # NOT proper rolling upgrade procedure, see above.
187 wait_for_all_machines_alive()
188
189 # Wait for client connections.
190 client1.wait_until_succeeds("consul kv get -recurse")
191 client2.wait_until_succeeds("consul kv get -recurse")
192
193 # Do some consul actions with server back up.
194 client1.succeed("consul kv put testkey 44")
195 client2.succeed("[ $(consul kv get testkey) == 44 ]")
196 client2.succeed("consul kv delete testkey")
197
198
199 def all_servers_crash_simultaneously_test():
200 """
201 Tests that the cluster will eventually come back after all
202 servers crash simultaneously.
203 """
204
205 for server in servers:
206 server.block()
207 server.systemctl("stop --no-block consul")
208
209 for server in servers:
210 # --no-block is async, so ensure it has been stopped by now
211 server.wait_until_fails("systemctl is-active --quiet consul")
212 server.unblock()
213 server.systemctl("start consul")
214
215 # Wait for recovery.
216 wait_for_healthy_servers()
217
218 # Wait for client connections.
219 client1.wait_until_succeeds("consul kv get -recurse")
220 client2.wait_until_succeeds("consul kv get -recurse")
221
222 # Do some consul actions with servers back up.
223 client1.succeed("consul kv put testkey 44")
224 client2.succeed("[ $(consul kv get testkey) == 44 ]")
225 client2.succeed("consul kv delete testkey")
226
227
228 # Run the tests.
229
230 print("rolling_restart_test()")
231 rolling_restart_test()
232
233 print("all_servers_crash_simultaneously_test()")
234 all_servers_crash_simultaneously_test()
235
236 print("rolling_restart_test(proper_rolling_procedure=False)")
237 rolling_restart_test(proper_rolling_procedure=False)
238 '';
239})