# NixOS VM test for the kthxbye service.
#
# A single machine runs Prometheus, Alertmanager and kthxbye. The test adds a
# short-lived silence to Alertmanager and then checks that the silence is still
# listed after its original lifetime has elapsed.
{ lib, pkgs, ... }:
{
  name = "kthxbye";

  meta.maintainers = with lib.maintainers; [ nukaduka ];

  nodes.server =
    { ... }:
    {
      # Provides the `amtool` CLI used by the test script.
      environment.systemPackages = with pkgs; [ prometheus-alertmanager ];

      services.prometheus = {
        enable = true;

        # Short intervals keep the overall test runtime low.
        globalConfig = {
          scrape_interval = "5s";
          scrape_timeout = "5s";
          evaluation_interval = "5s";
        };

        # Prometheus scrapes itself so that the `up` series exists.
        scrapeConfigs = [
          {
            job_name = "prometheus";
            scrape_interval = "5s";
            static_configs = [
              {
                targets = [ "localhost:9090" ];
              }
            ];
          }
        ];

        # The alert condition is `up != 0`, i.e. it is active while the scrape
        # target is up. This gives the test an alert to silence.
        rules = [
          ''
            groups:
              - name: test
                rules:
                  - alert: node_up
                    expr: up != 0
                    for: 5s
                    labels:
                      severity: bottom of the barrel
                    annotations:
                      summary: node is fine
          ''
        ];

        # Send alerts to the Alertmanager instance configured below.
        alertmanagers = [
          {
            static_configs = [
              {
                targets = [
                  "localhost:9093"
                ];
              }
            ];
          }
        ];

        alertmanager = {
          enable = true;
          openFirewall = true;
          configuration.route = {
            receiver = "test";
            group_wait = "5s";
            group_interval = "5s";
            group_by = [ "..." ];
          };
          configuration.receivers = [
            {
              name = "test";
              # NOTE(review): nothing in this test sets up a listener on this
              # port; it appears to be a placeholder receiver.
              webhook_configs = [
                {
                  url = "http://localhost:1234";
                }
              ];
            }
          ];
        };
      };

      services.kthxbye = {
        enable = true;
        openFirewall = true;
        # Larger than the 20s silence created by the test script, so that
        # silence is within this window from the moment it is created.
        extendIfExpiringIn = "30s";
        logJSON = true;
        maxDuration = "15m";
        interval = "5s";
      };
    };

  testScript = ''
    with subtest("start the server"):
        start_all()
        server.wait_for_unit("prometheus.service")
        server.wait_for_unit("alertmanager.service")
        server.wait_for_unit("kthxbye.service")

        # Wait until Alertmanager actually accepts connections instead of
        # sleeping for a fixed amount of time.
        server.wait_for_open_port(9093)

        # Make sure kthxbye comes up after alertmanager; fail the test if the
        # restart itself fails.
        server.succeed("systemctl restart kthxbye.service")
        server.wait_for_unit("kthxbye.service")

    with subtest("wait for the test alert to reach alertmanager"):
        # The silence below only lives for 20s, so make sure the alert it
        # matches is already known to Alertmanager before creating it.
        server.wait_until_succeeds(
            "amtool --alertmanager.url 'http://localhost:9093' alert query alertname=node_up | grep 'node_up'",
            timeout=120,
        )

    with subtest("set up test silence which expires in 20s"):
        server.succeed('amtool --alertmanager.url "http://localhost:9093" silence add alertname="node_up" -a "nixosTest" -d "20s" -c "ACK! this server is fine!!"')

    with subtest("wait for 21 seconds and check if the silence is still active"):
        # This sleep is intentional: it has to outlast the silence's original
        # 20s lifetime so that only an extended silence can still be listed.
        server.sleep(21)
        # Diagnostic only: record the unit status in the test log.
        server.log(server.systemctl("status kthxbye.service")[1])
        server.succeed("amtool --alertmanager.url 'http://localhost:9093' silence | grep 'ACK'")
  '';
}