# NixOS VM test for a Consul cluster: 3 servers (static IPs, Raft consensus)
# and 2 client agents. Exercises KV reads/writes through the clients while
# servers are restarted one-by-one (rolling restart) and all at once
# (simultaneous crash), verifying the cluster regains consensus each time.
import ./make-test-python.nix (
  { pkgs, lib, ... }:

  let
    # Settings for both servers and agents
    webUi = true;
    retry_interval = "1s";
    raft_multiplier = 1;

    defaultExtraConfig = {
      inherit retry_interval;
      performance = {
        inherit raft_multiplier;
      };
    };

    allConsensusServerHosts = [
      "192.168.1.1"
      "192.168.1.2"
      "192.168.1.3"
    ];

    allConsensusClientHosts = [
      "192.168.2.1"
      "192.168.2.2"
    ];

    firewallSettings = {
      # See https://www.consul.io/docs/install/ports.html
      allowedTCPPorts = [
        8301
        8302
        8600
        8500
        8300
      ];
      allowedUDPPorts = [
        8301
        8302
        8600
      ];
    };

    # Client agent node config; `index` selects this node's static IP.
    client =
      index:
      { pkgs, ... }:
      let
        ip = builtins.elemAt allConsensusClientHosts index;
      in
      {
        environment.systemPackages = [ pkgs.consul ];

        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;

        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul = {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = false;
            retry_join = allConsensusServerHosts;
            bind_addr = ip;
          };
        };
      };

    # Server node config; `index` selects this node's static IP, which also
    # serves as its identity within the cluster.
    server =
      index:
      { pkgs, ... }:
      let
        numConsensusServers = builtins.length allConsensusServerHosts;
        thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
        ip = thisConsensusServerHost; # since we already use IPs to identify servers
      in
      {
        networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];
        networking.firewall = firewallSettings;

        nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

        services.consul =
          assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
          {
            enable = true;
            inherit webUi;
            extraConfig = defaultExtraConfig // {
              server = true;
              bootstrap_expect = numConsensusServers;
              # Tell Consul that we never intend to drop below this many servers.
              # Ensures to not permanently lose consensus after temporary loss.
              # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
              autopilot.min_quorum = numConsensusServers;
              retry_join =
                # If there's only 1 node in the network, we allow self-join;
                # otherwise, the node must not try to join itself, and join only the other servers.
                # See https://github.com/hashicorp/consul/issues/2868
                if numConsensusServers == 1 then
                  allConsensusServerHosts
                else
                  builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
              bind_addr = ip;
            };
          };
      };
  in
  {
    name = "consul";

    nodes = {
      server1 = server 0;
      server2 = server 1;
      server3 = server 2;

      client1 = client 0;
      client2 = client 1;
    };

    testScript = ''
      servers = [server1, server2, server3]
      machines = [server1, server2, server3, client1, client2]

      for m in machines:
          m.wait_for_unit("consul.service")


      def wait_for_healthy_servers():
          # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
          # for why the `Voter` column of `list-peers` has that info.
          # TODO: The `grep true` relies on the fact that currently in
          #       the output like
          #           # consul operator raft list-peers
          #           Node     ID   Address           State     Voter  RaftProtocol
          #           server3  ...  192.168.1.3:8300  leader    true   3
          #           server2  ...  192.168.1.2:8300  follower  true   3
          #           server1  ...  192.168.1.1:8300  follower  false  3
          #       `Voter` is the only boolean column.
          #       Change this to the more reliable way to be defined by
          #       https://github.com/hashicorp/consul/issues/8118
          #       once that ticket is closed.
          for m in machines:
              m.wait_until_succeeds(
                  "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
              )


      def wait_for_all_machines_alive():
          """
          Note that Serf-"alive" does not mean "Raft"-healthy;
          see `wait_for_healthy_servers()` for that instead.
          """
          for m in machines:
              m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


      wait_for_healthy_servers()
      # Also wait for clients to be alive.
      wait_for_all_machines_alive()

      client1.succeed("consul kv put testkey 42")
      client2.succeed("[ $(consul kv get testkey) == 42 ]")


      def rolling_restart_test(proper_rolling_procedure=True):
          """
          Tests that the cluster can tolerate failures of any single server,
          following the recommended rolling upgrade procedure from
          https://www.consul.io/docs/upgrading#standard-upgrades.

          Optionally, `proper_rolling_procedure=False` can be given
          to wait only for each server to be back `Healthy`, not `Stable`
          in the Raft consensus, see Consul setting `ServerStabilizationTime` and
          https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
          """

          for server in servers:
              server.block()
              server.systemctl("stop consul")

              # Make sure the stopped peer is recognized as being down
              client1.wait_until_succeeds(
                  f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
              )

              # For each client, wait until they have connection again
              # using `kv get -recurse` before issuing commands.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions while one server is down.
              client1.succeed("consul kv put testkey 43")
              client2.succeed("[ $(consul kv get testkey) == 43 ]")
              client2.succeed("consul kv delete testkey")

              server.unblock()
              server.systemctl("start consul")

              if proper_rolling_procedure:
                  # Wait for recovery.
                  wait_for_healthy_servers()
              else:
                  # NOT proper rolling upgrade procedure, see above.
                  wait_for_all_machines_alive()

              # Wait for client connections.
              client1.wait_until_succeeds("consul kv get -recurse")
              client2.wait_until_succeeds("consul kv get -recurse")

              # Do some consul actions with server back up.
              client1.succeed("consul kv put testkey 44")
              client2.succeed("[ $(consul kv get testkey) == 44 ]")
              client2.succeed("consul kv delete testkey")


      def all_servers_crash_simultaneously_test():
          """
          Tests that the cluster will eventually come back after all
          servers crash simultaneously.
          """

          for server in servers:
              server.block()
              server.systemctl("stop --no-block consul")

          for server in servers:
              # --no-block is async, so ensure it has been stopped by now
              server.wait_until_fails("systemctl is-active --quiet consul")
              server.unblock()
              server.systemctl("start consul")

          # Wait for recovery.
          wait_for_healthy_servers()

          # Wait for client connections.
          client1.wait_until_succeeds("consul kv get -recurse")
          client2.wait_until_succeeds("consul kv get -recurse")

          # Do some consul actions with servers back up.
          client1.succeed("consul kv put testkey 44")
          client2.succeed("[ $(consul kv get testkey) == 44 ]")
          client2.succeed("consul kv delete testkey")


      # Run the tests.

      print("rolling_restart_test()")
      rolling_restart_test()

      print("all_servers_crash_simultaneously_test()")
      all_servers_crash_simultaneously_test()

      print("rolling_restart_test(proper_rolling_procedure=False)")
      rolling_restart_test(proper_rolling_procedure=False)
    '';
  }
)