# NixOS VM test for a Consul cluster: 3 servers + 2 client agents.
# Exercises Raft consensus health, KV replication, rolling restarts,
# and recovery from a simultaneous crash of all servers.
import ./make-test-python.nix ({pkgs, lib, ...}:

let
  # Settings for both servers and agents
  webUi = true;
  # Aggressive retry/raft settings to speed up leader election and
  # re-join during the test (not production values).
  retry_interval = "1s";
  raft_multiplier = 1;

  # Consul `extraConfig` shared by every node (servers and clients).
  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  # Static IPs; for servers these double as the node's identity
  # (see the `server` builder below).
  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  # Ports Consul needs: Serf LAN/WAN (8301/8302), DNS (8600),
  # HTTP API (8500), server RPC (8300).
  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
    allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
    allowedUDPPorts = [ 8301 8302 8600 ];
  };
32
33 client = index: { pkgs, ... }:
34 let
35 ip = builtins.elemAt allConsensusClientHosts index;
36 in
37 {
38 environment.systemPackages = [ pkgs.consul ];
39
40 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
41 { address = ip; prefixLength = 16; }
42 ];
43 networking.firewall = firewallSettings;
44
45 services.consul = {
46 enable = true;
47 inherit webUi;
48 extraConfig = defaultExtraConfig // {
49 server = false;
50 retry_join = allConsensusServerHosts;
51 bind_addr = ip;
52 };
53 };
54 };
55
56 server = index: { pkgs, ... }:
57 let
58 numConsensusServers = builtins.length allConsensusServerHosts;
59 thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
60 ip = thisConsensusServerHost; # since we already use IPs to identify servers
61 in
62 {
63 networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
64 { address = ip; prefixLength = 16; }
65 ];
66 networking.firewall = firewallSettings;
67
68 services.consul =
69 assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
70 {
71 enable = true;
72 inherit webUi;
73 extraConfig = defaultExtraConfig // {
74 server = true;
75 bootstrap_expect = numConsensusServers;
76 # Tell Consul that we never intend to drop below this many servers.
77 # Ensures to not permanently lose consensus after temporary loss.
78 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
79 autopilot.min_quorum = numConsensusServers;
80 retry_join =
81 # If there's only 1 node in the network, we allow self-join;
82 # otherwise, the node must not try to join itself, and join only the other servers.
83 # See https://github.com/hashicorp/consul/issues/2868
84 if numConsensusServers == 1
85 then allConsensusServerHosts
86 else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
87 bind_addr = ip;
88 };
89 };
90 };
91in {
92 name = "consul";
93
  # Three servers (quorum size 2) and two client agents; the index argument
  # selects each machine's static IP from the lists above.
  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };
102
103 testScript = ''
104 servers = [server1, server2, server3]
105 machines = [server1, server2, server3, client1, client2]
106
107 for m in machines:
108 m.wait_for_unit("consul.service")
109
110
111 def wait_for_healthy_servers():
112 # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
113 # for why the `Voter` column of `list-peers` has that info.
114 # TODO: The `grep true` relies on the fact that currently in
115 # the output like
116 # # consul operator raft list-peers
117 # Node ID Address State Voter RaftProtocol
118 # server3 ... 192.168.1.3:8300 leader true 3
119 # server2 ... 192.168.1.2:8300 follower true 3
120 # server1 ... 192.168.1.1:8300 follower false 3
121 # `Voter`is the only boolean column.
122 # Change this to the more reliable way to be defined by
123 # https://github.com/hashicorp/consul/issues/8118
124 # once that ticket is closed.
125 for m in machines:
126 m.wait_until_succeeds(
127 "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
128 )
129
130
131 def wait_for_all_machines_alive():
132 """
133 Note that Serf-"alive" does not mean "Raft"-healthy;
134 see `wait_for_healthy_servers()` for that instead.
135 """
136 for m in machines:
137 m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
138
139
140 wait_for_healthy_servers()
141 # Also wait for clients to be alive.
142 wait_for_all_machines_alive()
143
144 client1.succeed("consul kv put testkey 42")
145 client2.succeed("[ $(consul kv get testkey) == 42 ]")
146
147
148 def rolling_restart_test(proper_rolling_procedure=True):
149 """
150 Tests that the cluster can tolearate failures of any single server,
151 following the recommended rolling upgrade procedure from
152 https://www.consul.io/docs/upgrading#standard-upgrades.
153
154 Optionally, `proper_rolling_procedure=False` can be given
155 to wait only for each server to be back `Healthy`, not `Stable`
156 in the Raft consensus, see Consul setting `ServerStabilizationTime` and
157 https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
158 """
159
160 for server in servers:
161 server.block()
162 server.systemctl("stop consul")
163
164 # Make sure the stopped peer is recognized as being down
165 client1.wait_until_succeeds(
166 f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
167 )
168
169 # For each client, wait until they have connection again
170 # using `kv get -recurse` before issuing commands.
171 client1.wait_until_succeeds("consul kv get -recurse")
172 client2.wait_until_succeeds("consul kv get -recurse")
173
174 # Do some consul actions while one server is down.
175 client1.succeed("consul kv put testkey 43")
176 client2.succeed("[ $(consul kv get testkey) == 43 ]")
177 client2.succeed("consul kv delete testkey")
178
179 server.unblock()
180 server.systemctl("start consul")
181
182 if proper_rolling_procedure:
183 # Wait for recovery.
184 wait_for_healthy_servers()
185 else:
186 # NOT proper rolling upgrade procedure, see above.
187 wait_for_all_machines_alive()
188
189 # Wait for client connections.
190 client1.wait_until_succeeds("consul kv get -recurse")
191 client2.wait_until_succeeds("consul kv get -recurse")
192
193 # Do some consul actions with server back up.
194 client1.succeed("consul kv put testkey 44")
195 client2.succeed("[ $(consul kv get testkey) == 44 ]")
196 client2.succeed("consul kv delete testkey")
197
198
199 def all_servers_crash_simultaneously_test():
200 """
201 Tests that the cluster will eventually come back after all
202 servers crash simultaneously.
203 """
204
205 for server in servers:
206 server.block()
207 server.systemctl("stop --no-block consul")
208
209 for server in servers:
210 # --no-block is async, so ensure it has been stopped by now
211 server.wait_until_fails("systemctl is-active --quiet consul")
212 server.unblock()
213 server.systemctl("start consul")
214
215 # Wait for recovery.
216 wait_for_healthy_servers()
217
218 # Wait for client connections.
219 client1.wait_until_succeeds("consul kv get -recurse")
220 client2.wait_until_succeeds("consul kv get -recurse")
221
222 # Do some consul actions with servers back up.
223 client1.succeed("consul kv put testkey 44")
224 client2.succeed("[ $(consul kv get testkey) == 44 ]")
225 client2.succeed("consul kv delete testkey")
226
227
228 # Run the tests.
229
230 print("rolling_restart_test()")
231 rolling_restart_test()
232
233 print("all_servers_crash_simultaneously_test()")
234 all_servers_crash_simultaneously_test()
235
236 print("rolling_restart_test(proper_rolling_procedure=False)")
237 rolling_restart_test(proper_rolling_procedure=False)
238 '';
239})