1import ./make-test-python.nix (
2 { pkgs, lib, ... }:
3 rec {
# Identifier of this VM test and the maintainer responsible for it.
name = "pacemaker";
meta.maintainers = [ pkgs.lib.maintainers.astro ];
8
# The cluster members. All three machines share one configuration template
# and differ only in the last octet of their eth1 address.
nodes =
  let
    # Build the NixOS configuration for cluster member number `i`,
    # which gets the address 192.168.0.<i>/24 on eth1.
    node = i: {
      networking.interfaces.eth1.ipv4.addresses = [
        {
          address = "192.168.0.${toString i}";
          prefixLength = 24;
        }
      ];

      services.corosync = {
        enable = true;
        clusterName = "zentralwerk-network";
        # Every member lists all members. `imap1` numbers the (sorted)
        # attribute names of `nodes` starting at 1, and each entry uses the
        # first eth1 address of the corresponding machine as ring address.
        nodelist = lib.imap1 (nodeid: name: {
          inherit nodeid name;
          ring_addrs = [
            (builtins.head nodes.${name}.networking.interfaces.eth1.ipv4.addresses).address
          ];
        }) (builtins.attrNames nodes);
      };

      # Shared cluster key, identical on every node because it is a fixed
      # store file. Only acceptable because this is a throw-away test VM.
      environment.etc."corosync/authkey" = {
        source =
          builtins.toFile "authkey"
            # minimum length: 128 bytes
            "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
        mode = "0400";
      };

      services.pacemaker.enable = true;

      # used for pacemaker resource: no `wantedBy` is set here, the unit is
      # referenced by the `systemd` class primitive in the test script.
      systemd.services.ha-cat = {
        description = "Highly available netcat";
        serviceConfig.ExecStart = "${pkgs.netcat}/bin/nc -l discard";
      };
    };
  in
  {
    node1 = node 1;
    node2 = node 2;
    node3 = node 3;
  };
52
# Sets up pacemaker with a resource configuration, then crashes the node
# that hosts the resource and waits for it to be restarted on another node.
testScript =
  let
    # CIB <resources> section: one primitive `cat` backed by the `ha-cat`
    # systemd unit, with explicit stop, start and monitor operations.
    resources = builtins.toFile "cib-resources.xml" ''
      <resources>
        <primitive id="cat" class="systemd" type="ha-cat">
          <operations>
            <op id="stop-cat" name="stop" interval="0" timeout="1s"/>
            <op id="start-cat" name="start" interval="0" timeout="1s"/>
            <op id="monitor-cat" name="monitor" interval="1s" timeout="1s"/>
          </operations>
        </primitive>
      </resources>
    '';
  in
  ''
    import re
    import time

    start_all()

    # One pair of wait statements is generated per cluster member.
    ${lib.concatMapStrings (node: ''
      ${node}.wait_until_succeeds("corosync-quorumtool")
      ${node}.wait_for_unit("pacemaker.service")
    '') (builtins.attrNames nodes)}

    # No STONITH device
    node1.succeed("crm_attribute -t crm_config -n stonith-enabled -v false")
    # Configure the cat resource
    node1.succeed("cibadmin --replace --scope resources --xml-file ${resources}")

    # wait until the service is started
    while True:
        output = node1.succeed("crm_resource -r cat --locate")
        match = re.search("is running on: (.+)", output)
        if match:
            # map the reported host name back to its machine object
            for machine in machines:
                if machine.name == match.group(1):
                    current_node = machine
            break
        time.sleep(1)

    current_node.log("Service running here!")
    current_node.crash()

    # pick another node that's still up
    for machine in machines:
        if machine.booted:
            check_node = machine
    # find where the service has been started next
    while True:
        output = check_node.succeed("crm_resource -r cat --locate")
        match = re.search("is running on: (.+)", output)
        # output will remain the old current_node until the crash is detected by pacemaker
        if match and match.group(1) != current_node.name:
            for machine in machines:
                if machine.name == match.group(1):
                    next_node = machine
            break
        time.sleep(1)

    next_node.log("Service migrated here!")
  '';
117 }
118)