# NixOS VM test for a three-node Corosync/Pacemaker cluster.
#
# Every node runs corosync and pacemaker and ships a plain systemd unit
# ("ha-cat") that pacemaker manages as the cluster resource "cat". The test
# script configures that resource, crashes the node it is running on and then
# waits until the resource is reported as running on a different node.
{ pkgs, lib, ... }:
rec {
  name = "pacemaker";
  meta.maintainers = with lib.maintainers; [ astro ];

  nodes =
    let
      # Configuration of cluster member number `i`. The number is only used
      # for the last octet of the node's address on eth1.
      node = i: {
        networking.interfaces.eth1.ipv4.addresses = [
          {
            address = "192.168.0.${toString i}";
            prefixLength = 24;
          }
        ];

        services.corosync = {
          enable = true;
          clusterName = "zentralwerk-network";
          # One entry per attribute of `nodes` (this is why the enclosing set
          # is `rec`). `imap1` counts from 1, so node ids start at 1. The ring
          # address is read back from the node's own first eth1 address.
          nodelist = lib.imap1 (nodeid: nodeName: {
            inherit nodeid;
            name = nodeName;
            ring_addrs = [
              (builtins.head nodes.${nodeName}.networking.interfaces.eth1.ipv4.addresses).address
            ];
          }) (builtins.attrNames nodes);
        };

        # Fixed shared key for the test cluster; identical on all nodes.
        environment.etc."corosync/authkey" = {
          source =
            builtins.toFile "authkey"
              # minimum length: 128 bytes
              "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
          mode = "0400";
        };

        services.pacemaker.enable = true;

        # used for pacemaker resource
        # (no `wantedBy`: the unit is referenced by the "cat" primitive in the
        # test script's CIB resources)
        systemd.services.ha-cat = {
          description = "Highly available netcat";
          serviceConfig.ExecStart = "${pkgs.netcat}/bin/nc -l discard";
        };
      };
    in
    {
      node1 = node 1;
      node2 = node 2;
      node3 = node 3;
    };

  # sets up pacemaker with resources configuration, then crashes a
  # node and waits for service restart on another node
  testScript =
    let
      # CIB <resources> section declaring the systemd unit "ha-cat" as the
      # primitive "cat" with one stop, one start and one monitor operation.
      resources = builtins.toFile "cib-resources.xml" ''
        <resources>
          <primitive id="cat" class="systemd" type="ha-cat">
            <operations>
              <op id="stop-cat" name="stop" interval="0" timeout="1s"/>
              <op id="start-cat" name="start" interval="0" timeout="1s"/>
              <op id="monitor-cat" name="monitor" interval="1s" timeout="1s"/>
            </operations>
          </primitive>
        </resources>
      '';
    in
    ''
      import re
      import time


      def locate_cat(via, exclude=None):
          # Ask the cluster (through machine `via`) once per second where the
          # "cat" resource runs. Returns the matching machine object as soon
          # as the reported host is a known machine whose name differs from
          # `exclude`; keeps polling otherwise.
          while True:
              output = via.succeed("crm_resource -r cat --locate")
              match = re.search("is running on: (.+)", output)
              if match and match.group(1) != exclude:
                  for machine in machines:
                      if machine.name == match.group(1):
                          return machine
              time.sleep(1)


      start_all()

      ${lib.concatMapStrings (machineName: ''
        ${machineName}.wait_until_succeeds("corosync-quorumtool")
        ${machineName}.wait_for_unit("pacemaker.service")
      '') (builtins.attrNames nodes)}

      # No STONITH device
      node1.succeed("crm_attribute -t crm_config -n stonith-enabled -v false")
      # Configure the cat resource
      node1.succeed("cibadmin --replace --scope resources --xml-file ${resources}")

      # wait until the service is started
      current_node = locate_cat(node1)

      current_node.log("Service running here!")
      current_node.crash()

      # pick another node that's still up
      check_node = next(machine for machine in machines if machine.booted)

      # find where the service has been started next; the reported location
      # remains the old current_node until the crash is detected by pacemaker
      next_node = locate_cat(check_node, exclude=current_node.name)

      next_node.log("Service migrated here!")
    '';
}