1import ./make-test-python.nix (
2 { pkgs, lib, ... }:
3
4 let
# Addresses of the three Patroni nodes on eth1, in node order
# (element 0 belongs to node1, element 1 to node2, ...).
nodesIps = map (hostNumber: "192.168.1.${toString hostNumber}") [
  1
  2
  3
];
10
# Build the NixOS module for one Patroni cluster member.
#
# `index` is the zero-based position of the node in `nodesIps`. It selects
# the address assigned to eth1 and yields the Patroni member name
# "node<index + 1>"; every other entry of `nodesIps` is passed to Patroni
# as a peer.
createNode =
  index:
  { pkgs, ... }:
  let
    ip = builtins.elemAt nodesIps index; # since we already use IPs to identify servers
  in
  {
    networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
      {
        address = ip;
        prefixLength = 16;
      }
    ];

    # 5432 and 8008 are the ports the client's HAProxy forwards to and
    # health-checks (`server ... :5432 ... check port 8008`).
    networking.firewall.allowedTCPPorts = [
      5432
      8008
      5010
    ];

    # jq is used by the test script to inspect `patronictl list -f json`.
    environment.systemPackages = [ pkgs.jq ];

    services.patroni = {

      enable = true;

      postgresqlPackage = pkgs.postgresql_14.withPackages (p: [ p.pg_safeupdate ]);

      scope = "cluster1";
      name = "node${toString (index + 1)}";
      nodeIp = ip;
      otherNodesIps = builtins.filter (h: h != ip) nodesIps;
      softwareWatchdog = true;

      settings = {
        bootstrap = {
          dcs = {
            ttl = 30;
            loop_wait = 10;
            retry_timeout = 10;
            maximum_lag_on_failover = 1048576;
          };
          initdb = [
            { encoding = "UTF8"; }
            "data-checksums"
          ];
        };

        postgresql = {
          use_pg_rewind = true;
          use_slots = true;
          authentication = {
            replication = {
              username = "replicator";
            };
            superuser = {
              username = "postgres";
            };
            rewind = {
              username = "rewind";
            };
          };
          parameters = {
            listen_addresses = ip;
            wal_level = "replica";
            hot_standby_feedback = "on";
            unix_socket_directories = "/tmp";
          };
          pg_hba = [
            "host replication replicator 192.168.1.0/24 md5"
            # Unsafe, do not use for anything other than tests
            "host all all 0.0.0.0/0 trust"
          ];
        };

        # The etcd node of this test (192.168.1.4) serves as the DCS.
        etcd3 = {
          host = "192.168.1.4:2379";
        };
      };

      environmentFiles = {
        PATRONI_REPLICATION_PASSWORD = pkgs.writeText "replication-password" "postgres";
        PATRONI_SUPERUSER_PASSWORD = pkgs.writeText "superuser-password" "postgres";
        PATRONI_REWIND_PASSWORD = pkgs.writeText "rewind-password" "postgres";
      };
    };

    # We always want to restart so the tests never hang.
    # StartLimitIntervalSec= is a [Unit] setting; systemd does not honour it
    # when written to [Service] via `serviceConfig`, so use the dedicated
    # NixOS option, which emits it into the [Unit] section.
    systemd.services.patroni.startLimitIntervalSec = 0;
  };
101 in
102 {
103 name = "patroni";
104
105 nodes = {
106 node1 = createNode 0;
107 node2 = createNode 1;
108 node3 = createNode 2;
109
# Single etcd instance; the Patroni nodes point `settings.etcd3.host`
# at this machine's address and client port.
etcd =
  { pkgs, ... }:
  let
    etcdIp = "192.168.1.4";
    clientPort = 2379;
  in
  {
    networking = {
      interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        {
          address = etcdIp;
          prefixLength = 16;
        }
      ];

      # Let the Patroni nodes reach the client API.
      firewall.allowedTCPPorts = [ clientPort ];
    };

    services.etcd = {
      enable = true;
      listenClientUrls = [ "http://${etcdIp}:${toString clientPort}" ];
    };
  };
128
# Client machine: ships psql and a local HAProxy that listens on
# 127.0.0.1:5432 and forwards to the Patroni nodes, using an HTTP check
# against port 8008 on each node to decide which servers are up.
client =
  { pkgs, ... }:
  let
    # One HAProxy `server` line per Patroni node, newline-separated.
    backendServers = builtins.concatStringsSep "\n" (
      map (ip: "server postgresql_${ip}_5432 ${ip}:5432 maxconn 100 check port 8008") nodesIps
    );
  in
  {
    environment.systemPackages = [ pkgs.postgresql_14 ];

    networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
      {
        address = "192.168.2.1";
        prefixLength = 16;
      }
    ];

    services.haproxy = {
      enable = true;
      config = ''
        global
            maxconn 100

        defaults
            log global
            mode tcp
            retries 2
            timeout client 30m
            timeout connect 4s
            timeout server 30m
            timeout check 5s

        listen cluster1
            bind 127.0.0.1:5432
            option httpchk
            http-check expect status 200
            default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
            ${backendServers}
      '';
    };
  };
167 };
168
testScript = ''
  nodes = [node1, node2, node3]


  def psql(sql, quiet=False):
      """Build the psql command line that runs `sql` through the client's HAProxy."""
      flags = " --pset='pager=off' --tuples-only" if quiet else ""
      return f"psql -h 127.0.0.1 -U postgres{flags} --command='{sql}'"


  def role_state_count(role, state):
      """jq program counting cluster members with exactly this Role and State."""
      return f"map(select(.Role | test(\"^{role}$\"))) | map(select(.State | test(\"^{state}$\"))) | length"


  def cluster_check(jq_program, expected):
      """Shell test comparing a jq result over `patronictl list` with `expected`."""
      return f"[ $(patronictl list -f json cluster1 | jq '{jq_program}') == {expected} ]"


  def wait_for_all_nodes_ready(expected_replicas=2):
      """Wait until every booted node sees one running leader plus the expected
      number of streaming replicas, then until the client can query via HAProxy."""
      for member in (candidate for candidate in nodes if candidate.booted):
          print(member.succeed("patronictl list cluster1"))
          member.wait_until_succeeds(cluster_check("length", expected_replicas + 1))
          member.wait_until_succeeds(cluster_check(role_state_count("Leader", "running"), 1))
          member.wait_until_succeeds(cluster_check(role_state_count("Replica", "streaming"), expected_replicas))
          print(member.succeed("patronictl list cluster1"))
      client.wait_until_succeeds(psql("select 1;"))


  def run_dummy_queries():
      """Insert, read back and delete one row of the `dummy` table from the client."""
      insert_row = psql("insert into dummy(val) values (101);", quiet=True)
      select_row = psql("select val from dummy where val = 101;", quiet=True)
      delete_row = psql("delete from dummy where val = 101;", quiet=True)
      client.succeed(insert_row)
      client.succeed(f"test $({select_row}) -eq 101")
      client.succeed(delete_row)


  start_all()

  etcd.wait_for_unit("etcd.service")

  with subtest("should bootstrap a new patroni cluster"):
      wait_for_all_nodes_ready()

  with subtest("should be able to insert and select"):
      count_rows = psql("select count(distinct val) from dummy;", quiet=True)
      client.succeed(psql("create table dummy as select * from generate_series(1, 100) as val;"))
      client.succeed(f"test $({count_rows}) -eq 100")

  with subtest("should restart after all nodes are crashed"):
      for member in nodes:
          member.crash()
      for member in nodes:
          member.start()
      wait_for_all_nodes_ready()

  with subtest("should be able to run queries while any one node is crashed"):
      # Member names are "node<N>", so the last character gives the 1-based index.
      leader_name = node1.succeed("patronictl list -f json cluster1 | jq '.[] | select(.Role | test(\"^Leader$\")) | .Member' -r").strip()
      leader_index = int(leader_name[-1]) - 1

      # Move master node at the end of the list to avoid multiple failovers (makes the test faster and more consistent)
      nodes.append(nodes.pop(leader_index))

      for victim in nodes:
          victim.crash()
          wait_for_all_nodes_ready(1)

          # Execute some queries while a node is down.
          run_dummy_queries()

          # Restart crashed node.
          victim.start()
          wait_for_all_nodes_ready()

          # Execute some queries with the node back up.
          run_dummy_queries()
'';
226 }
227)