# Snapshot taken at master (3.6 kB).
# NixOS VM test for a three-node Corosync/Pacemaker cluster.
#
# Every node runs corosync and pacemaker and defines a systemd unit
# (`ha-cat`) that is managed as a Pacemaker resource.  The test script
# loads the resource configuration, crashes the node the resource is
# running on, and waits until the resource is reported on another node.
{ pkgs, lib, ... }:
rec {
  name = "pacemaker";
  meta = with pkgs.lib.maintainers; {
    maintainers = [ astro ];
  };

  nodes =
    let
      # Build the configuration of one cluster member.  `index` becomes the
      # last octet of the node's address on eth1.
      node = index: {
        networking.interfaces.eth1.ipv4.addresses = [
          {
            address = "192.168.0.${toString index}";
            prefixLength = 24;
          }
        ];

        services.corosync = {
          enable = true;
          clusterName = "zentralwerk-network";
          # One nodelist entry per attribute of `nodes`; imap1 numbers the
          # entries starting at 1, which is used as the corosync nodeid.
          nodelist = lib.imap1 (nodeid: name: {
            inherit nodeid name;
            ring_addrs = [
              (builtins.head nodes.${name}.networking.interfaces.eth1.ipv4.addresses).address
            ];
          }) (builtins.attrNames nodes);
        };
        environment.etc."corosync/authkey" = {
          source =
            builtins.toFile "authkey"
              # minimum length: 128 bytes
              "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
          mode = "0400";
        };

        services.pacemaker.enable = true;

        # used for pacemaker resource
        systemd.services.ha-cat = {
          description = "Highly available netcat";
          serviceConfig.ExecStart = "${pkgs.netcat}/bin/nc -l discard";
        };
      };
    in
    {
      node1 = node 1;
      node2 = node 2;
      node3 = node 3;
    };

  # sets up pacemaker with resources configuration, then crashes a
  # node and waits for service restart on another node
  testScript =
    let
      # CIB <resources> section: a single systemd-class primitive wrapping
      # the ha-cat unit, with one operation each for stop, start and monitor.
      resources = builtins.toFile "cib-resources.xml" ''
        <resources>
          <primitive id="cat" class="systemd" type="ha-cat">
            <operations>
              <op id="stop-cat" name="stop" interval="0" timeout="1s"/>
              <op id="start-cat" name="start" interval="0" timeout="1s"/>
              <op id="monitor-cat" name="monitor" interval="1s" timeout="1s"/>
            </operations>
          </primitive>
        </resources>
      '';
    in
    ''
      import re
      import time

      start_all()

      ${lib.concatMapStrings (node: ''
        ${node}.wait_until_succeeds("corosync-quorumtool")
        ${node}.wait_for_unit("pacemaker.service")
      '') (builtins.attrNames nodes)}

      # No STONITH device
      node1.succeed("crm_attribute -t crm_config -n stonith-enabled -v false")
      # Configure the cat resource
      node1.succeed("cibadmin --replace --scope resources --xml-file ${resources}")

      # wait until the service is started
      while True:
          output = node1.succeed("crm_resource -r cat --locate")
          match = re.search("is running on: (.+)", output)
          if match:
              for machine in machines:
                  if machine.name == match.group(1):
                      current_node = machine
              # leave the polling loop once a location has been reported
              break
          time.sleep(1)

      current_node.log("Service running here!")
      current_node.crash()

      # pick another node that's still up
      for machine in machines:
          if machine.booted:
              check_node = machine
      # find where the service has been started next
      while True:
          output = check_node.succeed("crm_resource -r cat --locate")
          match = re.search("is running on: (.+)", output)
          # output will remain the old current_node until the crash is detected by pacemaker
          if match and match.group(1) != current_node.name:
              for machine in machines:
                  if machine.name == match.group(1):
                      next_node = machine
              # leave the polling loop once the new location has been reported
              break
          time.sleep(1)

      next_node.log("Service migrated here!")
    '';
}