at master 2.9 kB view raw
# NixOS VM test for the kthxbye service.
#
# A single machine runs Prometheus, Alertmanager and kthxbye. The test script
# creates a 20-second silence with amtool and then checks that a silence
# matching 'ACK' is still listed after 21 seconds.
{ lib, pkgs, ... }:
{
  name = "kthxbye";

  meta.maintainers = [ lib.maintainers.nukaduka ];

  nodes.server =
    { ... }:
    {
      # Presumably what provides the `amtool` command used by the test script.
      environment.systemPackages = [ pkgs.prometheus-alertmanager ];

      services = {
        prometheus = {
          enable = true;

          # Short intervals so the test does not have to wait long.
          globalConfig = {
            scrape_interval = "5s";
            scrape_timeout = "5s";
            evaluation_interval = "5s";
          };

          # One scrape job pointed at localhost:9090.
          scrapeConfigs = [
            {
              job_name = "prometheus";
              scrape_interval = "5s";
              static_configs = [
                { targets = [ "localhost:9090" ]; }
              ];
            }
          ];

          # Alert rule with the expression `up != 0`; the silence created by
          # the test script matches it by alertname.
          rules = [
            ''
              groups:
                - name: test
                  rules:
                    - alert: node_up
                      expr: up != 0
                      for: 5s
                      labels:
                        severity: bottom of the barrel
                      annotations:
                        summary: node is fine
            ''
          ];

          # Alerts are sent to the Alertmanager address also used by amtool below.
          alertmanagers = [
            {
              static_configs = [
                { targets = [ "localhost:9093" ]; }
              ];
            }
          ];

          alertmanager = {
            enable = true;
            openFirewall = true;

            configuration = {
              # Everything is routed to the single "test" receiver.
              route = {
                receiver = "test";
                group_wait = "5s";
                group_interval = "5s";
                group_by = [ "..." ];
              };

              receivers = [
                {
                  name = "test";
                  webhook_configs = [
                    { url = "http://localhost:1234"; }
                  ];
                }
              ];
            };
          };
        };

        # Service under test. The test script expects the 20s silence to
        # outlive its original duration while this service is running.
        kthxbye = {
          enable = true;
          openFirewall = true;
          extendIfExpiringIn = "30s";
          logJSON = true;
          maxDuration = "15m";
          interval = "5s";
        };
      };
    };

  testScript = ''
    with subtest("start the server"):
        start_all()
        server.wait_for_unit("prometheus.service")
        server.wait_for_unit("alertmanager.service")
        server.wait_for_unit("kthxbye.service")

        server.sleep(2) # wait for units to settle
        server.systemctl("restart kthxbye.service") # make sure kthxbye comes up after alertmanager
        server.sleep(2)

    with subtest("set up test silence which expires in 20s"):
        server.succeed('amtool --alertmanager.url "http://localhost:9093" silence add alertname="node_up" -a "nixosTest" -d "20s" -c "ACK! this server is fine!!"')

    with subtest("wait for 21 seconds and check if the silence is still active"):
        server.sleep(21)
        server.systemctl("status kthxbye.service")
        server.succeed("amtool --alertmanager.url 'http://localhost:9093' silence | grep 'ACK'")
  '';
}