# NixOS integration test for the Patroni service.
import ./make-test-python.nix (
  { pkgs, lib, ... }:

  let
    # Static addresses of the three Patroni nodes; list position N belongs to
    # the machine named "node<N+1>".
    nodesIps = [
      "192.168.1.1"
      "192.168.1.2"
      "192.168.1.3"
    ];

    # Address of the etcd machine. Kept in one place because both the Patroni
    # nodes (etcd3.host) and the etcd machine itself (interface address and
    # client listen URL) must agree on it.
    etcdIp = "192.168.1.4";

    # Builds the NixOS module for the Patroni node at position `index` in
    # `nodesIps`.
    createNode =
      index:
      { pkgs, ... }:
      let
        ip = builtins.elemAt nodesIps index; # since we already use IPs to identify servers
      in
      {
        networking.interfaces.eth1.ipv4.addresses = lib.mkOverride 0 [
          {
            address = ip;
            prefixLength = 16;
          }
        ];

        # 5432 and 8008 are the ports the client's HAProxy connects to and
        # health-checks (see the `server ... check port 8008` lines below).
        networking.firewall.allowedTCPPorts = [
          5432
          8008
          5010
        ];

        # jq is used by the test script to inspect `patronictl list -f json`.
        environment.systemPackages = [ pkgs.jq ];

        services.patroni = {

          enable = true;

          postgresqlPackage = pkgs.postgresql_14.withPackages (p: [ p.pg_safeupdate ]);

          scope = "cluster1";
          name = "node${toString (index + 1)}";
          nodeIp = ip;
          # Every address in nodesIps except this node's own.
          otherNodesIps = builtins.filter (h: h != ip) nodesIps;
          softwareWatchdog = true;

          settings = {
            bootstrap = {
              dcs = {
                ttl = 30;
                loop_wait = 10;
                retry_timeout = 10;
                maximum_lag_on_failover = 1048576;
              };
              initdb = [
                { encoding = "UTF8"; }
                "data-checksums"
              ];
            };

            postgresql = {
              use_pg_rewind = true;
              use_slots = true;
              authentication = {
                replication = {
                  username = "replicator";
                };
                superuser = {
                  username = "postgres";
                };
                rewind = {
                  username = "rewind";
                };
              };
              parameters = {
                listen_addresses = ip;
                wal_level = "replica";
                hot_standby_feedback = "on";
                unix_socket_directories = "/tmp";
              };
              pg_hba = [
                "host replication replicator 192.168.1.0/24 md5"
                # Unsafe, do not use for anything other than tests
                "host all all 0.0.0.0/0 trust"
              ];
            };

            etcd3 = {
              host = "${etcdIp}:2379";
            };
          };

          # Passwords for the three users declared under
          # settings.postgresql.authentication; test-only values.
          environmentFiles = {
            PATRONI_REPLICATION_PASSWORD = pkgs.writeText "replication-password" "postgres";
            PATRONI_SUPERUSER_PASSWORD = pkgs.writeText "superuser-password" "postgres";
            PATRONI_REWIND_PASSWORD = pkgs.writeText "rewind-password" "postgres";
          };
        };

        # We always want to restart so the tests never hang
        systemd.services.patroni.serviceConfig.StartLimitIntervalSec = 0;
      };
  in
  {
    name = "patroni";

    nodes = {
      node1 = createNode 0;
      node2 = createNode 1;
      node3 = createNode 2;

      # Single etcd machine that the Patroni nodes point at via etcd3.host.
      etcd =
        { pkgs, ... }:
        {

          networking.interfaces.eth1.ipv4.addresses = lib.mkOverride 0 [
            {
              address = etcdIp;
              prefixLength = 16;
            }
          ];

          services.etcd = {
            enable = true;
            listenClientUrls = [ "http://${etcdIp}:2379" ];
          };

          networking.firewall.allowedTCPPorts = [ 2379 ];
        };

      # Machine that runs psql against a local HAProxy, which in turn forwards
      # to the Patroni nodes.
      client =
        { pkgs, ... }:
        {
          environment.systemPackages = [ pkgs.postgresql_14 ];

          networking.interfaces.eth1.ipv4.addresses = lib.mkOverride 0 [
            {
              address = "192.168.2.1";
              prefixLength = 16;
            }
          ];

          # HAProxy listens on 127.0.0.1:5432 and has one backend per entry of
          # nodesIps. Each backend is health-checked over HTTP on port 8008
          # and must answer with status 200 to receive traffic.
          # NOTE(review): presumably only the current leader answers 200 on
          # that port - confirm against the Patroni REST API documentation.
          services.haproxy = {
            enable = true;
            config = ''
              global
                  maxconn 100

              defaults
                  log global
                  mode tcp
                  retries 2
                  timeout client 30m
                  timeout connect 4s
                  timeout server 30m
                  timeout check 5s

              listen cluster1
                  bind 127.0.0.1:5432
                  option httpchk
                  http-check expect status 200
                  default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
                  ${builtins.concatStringsSep "\n" (
                    map (ip: "server postgresql_${ip}_5432 ${ip}:5432 maxconn 100 check port 8008") nodesIps
                  )}
            '';
          };
        };
    };

    # Test flow:
    #   1. wait until the cluster reports one running leader and two
    #      streaming replicas, and the client can query through HAProxy;
    #   2. create and query a table through HAProxy;
    #   3. crash and restart all nodes, then wait for the cluster again;
    #   4. crash each node in turn (current leader last), running queries
    #      while it is down and again after it is back.
    testScript = ''
      nodes = [node1, node2, node3]

      def wait_for_all_nodes_ready(expected_replicas=2):
          booted_nodes = filter(lambda node: node.booted, nodes)
          for node in booted_nodes:
              print(node.succeed("patronictl list cluster1"))
              node.wait_until_succeeds(f"[ $(patronictl list -f json cluster1 | jq 'length') == {expected_replicas + 1} ]")
              node.wait_until_succeeds("[ $(patronictl list -f json cluster1 | jq 'map(select(.Role | test(\"^Leader$\"))) | map(select(.State | test(\"^running$\"))) | length') == 1 ]")
              node.wait_until_succeeds(f"[ $(patronictl list -f json cluster1 | jq 'map(select(.Role | test(\"^Replica$\"))) | map(select(.State | test(\"^streaming$\"))) | length') == {expected_replicas} ]")
              print(node.succeed("patronictl list cluster1"))
          client.wait_until_succeeds("psql -h 127.0.0.1 -U postgres --command='select 1;'")

      def run_dummy_queries():
          client.succeed("psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='insert into dummy(val) values (101);'")
          client.succeed("test $(psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='select val from dummy where val = 101;') -eq 101")
          client.succeed("psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='delete from dummy where val = 101;'")

      start_all()

      etcd.wait_for_unit("etcd.service")

      with subtest("should bootstrap a new patroni cluster"):
          wait_for_all_nodes_ready()

      with subtest("should be able to insert and select"):
          client.succeed("psql -h 127.0.0.1 -U postgres --command='create table dummy as select * from generate_series(1, 100) as val;'")
          client.succeed("test $(psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='select count(distinct val) from dummy;') -eq 100")

      with subtest("should restart after all nodes are crashed"):
          for node in nodes:
              node.crash()
          for node in nodes:
              node.start()
          wait_for_all_nodes_ready()

      with subtest("should be able to run queries while any one node is crashed"):
          masterNodeName = node1.succeed("patronictl list -f json cluster1 | jq '.[] | select(.Role | test(\"^Leader$\")) | .Member' -r").strip()
          masterNodeIndex = int(masterNodeName[-1]) - 1

          # Move master node at the end of the list to avoid multiple failovers (makes the test faster and more consistent)
          nodes.append(nodes.pop(masterNodeIndex))

          for node in nodes:
              node.crash()
              wait_for_all_nodes_ready(1)

              # Execute some queries while a node is down.
              run_dummy_queries()

              # Restart crashed node.
              node.start()
              wait_for_all_nodes_ready()

              # Execute some queries with the node back up.
              run_dummy_queries()
    '';
  }
)