{ lib, ... }:

let
  # Settings for both servers and agents
  webUi = true;
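  # Presumably chosen so the test cluster converges quickly: retry failed
  # joins every second and use the most aggressive Raft timing (multiplier 1).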
  retry_interval = "1s";
  raft_multiplier = 1;

  defaultExtraConfig = {
    inherit retry_interval;
    performance = {
      inherit raft_multiplier;
    };
  };

  allConsensusServerHosts = [
    "192.168.1.1"
    "192.168.1.2"
    "192.168.1.3"
  ];

  allConsensusClientHosts = [
    "192.168.2.1"
    "192.168.2.2"
  ];

  firewallSettings = {
    # See https://www.consul.io/docs/install/ports.html
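    # 8300: server RPC, 8301: Serf LAN, 8302: Serf WAN,
    # 8500: HTTP API (and web UI), 8600: DNS.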
    allowedTCPPorts = [
      8301
      8302
      8600
      8500
      8300
    ];
    allowedUDPPorts = [
      8301
      8302
      8600
    ];
  };

  client =
    index:
    { pkgs, ... }:
    let
      ip = builtins.elemAt allConsensusClientHosts index;
    in
    {
      environment.systemPackages = [ pkgs.consul ];

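      # mkOverride 0 is the highest priority, so this address replaces the
      # default one the test framework assigns to eth1.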
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        {
          address = ip;
          prefixLength = 16;
        }
      ];
      networking.firewall = firewallSettings;

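      # Consul is marked unfree in nixpkgs (BUSL license), so it has to be
      # allowed explicitly.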
      nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

      services.consul = {
        enable = true;
        inherit webUi;
        extraConfig = defaultExtraConfig // {
          server = false;
          retry_join = allConsensusServerHosts;
          bind_addr = ip;
        };
      };
    };

  server =
    index:
    { pkgs, ... }:
    let
      numConsensusServers = builtins.length allConsensusServerHosts;
      thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
      ip = thisConsensusServerHost; # since we already use IPs to identify servers
    in
    {
      networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        {
          address = ip;
          prefixLength = 16;
        }
      ];
      networking.firewall = firewallSettings;

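      # As for the clients: Consul is unfree in nixpkgs, so allow it explicitly.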
      nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

      services.consul =
        assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
        {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            server = true;
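            # Only bootstrap (elect the first leader) once all expected
            # servers have joined.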
            bootstrap_expect = numConsensusServers;
            # Tell Consul that we never intend to drop below this many servers.
            # This ensures that consensus is not permanently lost after a
            # temporary loss of servers.
            # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
            autopilot.min_quorum = numConsensusServers;
            retry_join =
              # If there's only one server in the network, allow self-join;
              # otherwise a server must not try to join itself and joins
              # only the other servers.
              # See https://github.com/hashicorp/consul/issues/2868
              if numConsensusServers == 1 then
                allConsensusServerHosts
              else
                builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
            bind_addr = ip;
          };
        };
    };
in
{
  name = "consul";

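  # The nodes set per-node nixpkgs config (allowUnfreePredicate above), which
  # is only possible when the test framework's read-only pkgs is disabled.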
  node.pkgsReadOnly = false;

  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
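    # The test driver exposes one machine object per node defined above.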
    servers = [server1, server2, server3]
    machines = [server1, server2, server3, client1, client2]

    for m in machines:
        m.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` relies on the fact that `Voter` is currently
        #       the only boolean column in output like:
        #           # consul operator raft list-peers
        #           Node     ID   Address           State     Voter  RaftProtocol
        #           server3  ...  192.168.1.3:8300  leader    true   3
        #           server2  ...  192.168.1.2:8300  follower  true   3
        #           server1  ...  192.168.1.1:8300  follower  false  3
        #       Change this to the more reliable way that
        #       https://github.com/hashicorp/consul/issues/8118 will define,
        #       once that ticket is closed.
        for m in machines:
            m.wait_until_succeeds(
                "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
            )


    def wait_for_all_machines_alive():
        """
        Note that Serf "alive" does not mean Raft-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
            m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")


    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")


    def rolling_restart_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate the failure of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus; see the Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """

        for server in servers:
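            # Block the server's network before stopping Consul, presumably so
            # the outage looks like an unclean failure to the rest of the cluster.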
            server.block()
            server.systemctl("stop consul")

            # Make sure the stopped peer is recognized as being down
            client1.wait_until_succeeds(
                f"[ $(consul members | grep {server.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
            )

            # Wait until each client has a connection again
            # (via `kv get -recurse`) before issuing commands.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            server.unblock()
            server.systemctl("start consul")

            if proper_rolling_procedure:
                # Wait for recovery.
                wait_for_healthy_servers()
            else:
                # NOT the proper rolling upgrade procedure, see above.
                wait_for_all_machines_alive()

            # Wait for client connections.
            client1.wait_until_succeeds("consul kv get -recurse")
            client2.wait_until_succeeds("consul kv get -recurse")

            # Do some consul actions with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """

        for server in servers:
            server.block()
            server.systemctl("stop --no-block consul")

        for server in servers:
            # --no-block is async, so ensure it has been stopped by now
            server.wait_until_fails("systemctl is-active --quiet consul")
            server.unblock()
            server.systemctl("start consul")

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        client1.wait_until_succeeds("consul kv get -recurse")
        client2.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with the servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    # Run the tests.

    print("rolling_restart_test()")
    rolling_restart_test()

    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    print("rolling_restart_test(proper_rolling_procedure=False)")
    rolling_restart_test(proper_rolling_procedure=False)
  '';
}