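# NixOS VM test for a Consul cluster: three Consul servers plus two client
# agents, exercising key/value operations during rolling restarts of the
# servers and after a simultaneous crash of all servers.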
{ lib, ... }:

let
  # Settings for both servers and agents
  webUi = true;
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
    allowedTCPPorts = [
      8301 # Serf LAN
      8302 # Serf WAN
      8600 # DNS
      8500 # HTTP API
      8300 # server RPC
    ];
    allowedUDPPorts = [
      8301 # Serf LAN
      8302 # Serf WAN
      8600 # DNS
    ];
  };

  client =
    index:
    { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
    {
      environment.systemPackages = [ pkgs.consul ];

      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        {
          address = ip;
          prefixLength = 16;
        }
      ];
      networking.firewall = firewallSettings;

      nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

      services.consul = {
        enable = true;
        inherit webUi;
        extraConfig = defaultExtraConfig // {
          server = false;
          retry_join = allConsensusServerHosts;
          bind_addr = ip;
        };
      };
    };

  server =
    index:
    { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
    {
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        {
          address = ip;
          prefixLength = 16;
        }
      ];
      networking.firewall = firewallSettings;

      nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

      services.consul =
        assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
        {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = true;
            bootstrap_expect = numConsensusServers;
            # Tell Consul that we never intend to drop below this many servers.
            # This ensures that consensus is not permanently lost after a
            # temporary loss of quorum.
            # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
            autopilot.min_quorum = numConsensusServers;
            retry_join =
              # If there's only 1 node in the network, we allow self-join;
              # otherwise, the node must not try to join itself, and join only the other servers.
              # See https://github.com/hashicorp/consul/issues/2868
              if numConsensusServers == 1 then
                allConsensusServerHosts
              else
                builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
            bind_addr = ip;
          };
        };
    };
in
{
  name = "consul";

  node.pkgsReadOnly = false;

  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` relies on the fact that `Voter` is currently
        # the only boolean column in output like:
        #     # consul operator raft list-peers
        #     Node     ID   Address           State     Voter  RaftProtocol
        #     server3  ...  192.168.1.3:8300  leader    true   3
        #     server2  ...  192.168.1.2:8300  follower  true   3
        #     server1  ...  192.168.1.1:8300  follower  false  3
        # Change this to the more reliable way to be defined by
        # https://github.com/hashicorp/consul/issues/8118
        # once that ticket is closed.
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")


    def rolling_restart_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate failures of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """

        for server in servers:
            server.block()
            server.systemctl("stop consul")

            # Make sure the stopped peer is recognized as being down.
            client1.wait_until_succeeds(
                f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
            )

            # For each client, wait until it has a connection again
            # (using `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            server.unblock()
            server.systemctl("start consul")

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure, see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """

        for server in servers:
            server.block()
            server.systemctl("stop --no-block consul")

        for server in servers:
            # --no-block is async, so ensure it has been stopped by now.
            server.wait_until_fails("systemctl is-active --quiet consul")
            server.unblock()
            server.systemctl("start consul")

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    # Run the tests.

    print("rolling_restart_test()")
    rolling_restart_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_restart_test(proper_rolling_procedure=False)")
    rolling_restart_test(proper_rolling_procedure=False)
  '';
}
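# A minimal way to run this test, assuming this file is wired into the usual
# `nixosTests` attribute set of a nixpkgs checkout (the attribute name
# matching `name = "consul"` above):
#
#   nix-build -A nixosTests.consul
#
# To poke at the cluster by hand (e.g. `consul members` on a node), the test
# framework's interactive driver can be used instead:
#
#   nix-build -A nixosTests.consul.driverInteractive
#   ./result/bin/nixos-test-driver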