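# A typical way to run this test from a nixpkgs checkout (assuming the file
# lives at nixos/tests/consul.nix and is wired into the test set under the
# attribute `nixosTests.consul`, as NixOS tests usually are):
#
#   nix-build -A nixosTests.consul
#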
import ./make-test-python.nix ({ pkgs, lib, ... }:

let
  # Settings for both servers and agents
  webUi = true;
  # Retry join attempts every second so the cluster forms quickly in the test.
  retry_interval = "1s";
  # A multiplier of 1 selects the fastest Raft timings, so leader elections
  # and failure detection settle quickly.
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
    allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };

  client = index: { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
    {
      environment.systemPackages = [ pkgs.consul ];

      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul = {
        enable = true;
        inherit webUi;
        extraConfig = defaultExtraConfig // {
          server = false;
          retry_join = allConsensusServerHosts;
          bind_addr = ip;
        };
      };
    };

  server = index: { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
    {
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { address = ip; prefixLength = 16; }
      ];
      networking.firewall = firewallSettings;

      services.consul =
        assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
        {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = true;
            bootstrap_expect = numConsensusServers;
            # Tell Consul that we never intend to drop below this many servers.
            # This ensures that consensus is not permanently lost after a
            # temporary loss of servers.
            # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
            autopilot.min_quorum = numConsensusServers;
            retry_join =
              # If there's only 1 node in the network, we allow self-join;
              # otherwise, the node must not try to join itself, and join
              # only the other servers.
              # See https://github.com/hashicorp/consul/issues/2868
              if numConsensusServers == 1
              then allConsensusServerHosts
              else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
            bind_addr = ip;
          };
        };
    };
in {
  name = "consul";

  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` relies on the fact that `Voter` is currently
        #       the only boolean column in output like:
        #
        #           # consul operator raft list-peers
        #           Node     ID   Address           State     Voter  RaftProtocol
        #           server3  ...  192.168.1.3:8300  leader    true   3
        #           server2  ...  192.168.1.2:8300  follower  true   3
        #           server1  ...  192.168.1.1:8300  follower  false  3
        #
        #       Change this to the more reliable way to be defined by
        #       https://github.com/hashicorp/consul/issues/8118
        #       once that ticket is closed.
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")


    def rolling_reboot_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate failures of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """

        for server in servers:
            server.crash()

            # For each client, wait until they have a connection again
            # using `kv get -recurse` before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            # Restart the crashed machine.
            server.start()

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure, see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """

        for server in servers:
            server.crash()

        for server in servers:
            server.start()

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with the servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    # Run the tests.

    print("rolling_reboot_test()")
    rolling_reboot_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_reboot_test(proper_rolling_procedure=False)")
    rolling_reboot_test(proper_rolling_procedure=False)
  '';
})