import ./make-test-python.nix ({pkgs, lib, ...}:

let
  # Settings for both servers and agents
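  # A short retry_interval and raft_multiplier = 1 keep join retries and Raft
  # leader elections fast, so the test does not spend time waiting on the
  # default production timeouts.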
  webUi = true;
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
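    # 8300: server RPC, 8301: Serf LAN, 8302: Serf WAN, 8500: HTTP API / web UI, 8600: DNS.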
    allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };

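  # Builds the NixOS configuration for a Consul client agent. `index` selects
  # its fixed IP from allConsensusClientHosts; the agent discovers the cluster
  # by retrying joins against all server addresses.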
  client = index: { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
    {
      environment.systemPackages = [ pkgs.consul ];

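      # Replace the test driver's automatically assigned eth1 address with our
      # fixed IP (mkOverride 0 takes precedence over every other definition).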
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul = {
        enable = true;
        inherit webUi;
        extraConfig = defaultExtraConfig // {
          server = false;
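          # Client agents keep retrying the join until one of the servers answers.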
          retry_join = allConsensusServerHosts;
          bind_addr = ip;
        };
      };
    };

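  # Builds the NixOS configuration for a Consul server. Each server gets one
  # address from allConsensusServerHosts; together the three servers form the
  # Raft quorum.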
  server = index: { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
    {
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul =
        assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
        {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = true;
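            # Do not bootstrap (elect the first leader) until all expected
            # servers have joined.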
            bootstrap_expect = numConsensusServers;
            # Tell Consul that we never intend to drop below this many servers.
            # This ensures that consensus is not permanently lost after a
            # temporary loss of quorum.
            # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
            autopilot.min_quorum = numConsensusServers;
            retry_join =
              # If there's only 1 node in the network, we allow self-join;
              # otherwise, the node must not try to join itself, and should
              # join only the other servers.
              # See https://github.com/hashicorp/consul/issues/2868
              if numConsensusServers == 1
              then allConsensusServerHosts
              else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
            bind_addr = ip;
          };
        };
    };
in {
  name = "consul";

  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

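  # The test script waits for the cluster to become healthy, exercises the KV
  # store through both clients, and then checks recovery from rolling and
  # simultaneous server crashes.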
  testScript = ''
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` carries this health information.
        # TODO: The `grep true` relies on the fact that currently, in
        # output like
        #     # consul operator raft list-peers
        #     Node     ID   Address           State     Voter  RaftProtocol
        #     server3  ...  192.168.1.3:8300  leader    true   3
        #     server2  ...  192.168.1.2:8300  follower  true   3
        #     server1  ...  192.168.1.1:8300  follower  false  3
        # `Voter` is the only boolean column.
        # Change this to the more reliable way to be defined by
        # https://github.com/hashicorp/consul/issues/8118
        # once that ticket is closed.
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

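    # A value written through one client agent must be readable through the
    # other, showing that both agents talk to the same replicated KV store.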
    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")


    def rolling_reboot_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate the failure of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """

        for server in servers:
            server.crash()

            # For each client, wait until it has a connection again
            # (using `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            # Restart the crashed machine.
            server.start()

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure, see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """

        for server in servers:
            server.crash()

        for server in servers:
            server.start()

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    # Run the tests.

    print("rolling_reboot_test()")
    rolling_reboot_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_reboot_test(proper_rolling_procedure=False)")
    rolling_reboot_test(proper_rolling_procedure=False)
  '';
})