let
  grpcPort = 19090;
  queryPort = 9090;
  minioPort = 9000;
  pushgwPort = 9091;
  frontPort = 9092;

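  # Static credentials for the MinIO instance on the "s3" node below; they are
  # only used inside these throw-away test VMs.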
  s3 = {
    accessKey = "BKIKJAA5BMMU2RHO6IBB";
    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
  };

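  # Object storage configuration shared (via `inherit objstore`) by the Thanos
  # sidecar, store and compact services. It points at the MinIO bucket on the
  # "s3" node; the generated config file is referenced again in the test script
  # through `services.thanos.store.objstore.config-file`.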
  objstore.config = {
    type = "S3";
    config = {
      bucket = "thanos-bucket";
      endpoint = "s3:${toString minioPort}";
      region = "us-east-1";
      access_key = s3.accessKey;
      secret_key = s3.secretKey;
      insecure = true;
      signature_version2 = false;
      put_user_metadata = {};
      http_config = {
        idle_conn_timeout = "0s";
        insecure_skip_verify = false;
      };
      trace = {
        enable = false;
      };
    };
  };

in import ./make-test-python.nix {
  name = "prometheus";

  nodes = {
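    # Runs Prometheus itself, the pushgateway and the Thanos sidecar. The
    # sidecar's gRPC port is opened in the firewall so that the query node can
    # reach it.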
    prometheus = { pkgs, ... }: {
      virtualisation.diskSize = 2 * 1024;
      virtualisation.memorySize = 2048;
      environment.systemPackages = [ pkgs.jq ];
      networking.firewall.allowedTCPPorts = [ grpcPort ];
      services.prometheus = {
        enable = true;
        enableReload = true;
        scrapeConfigs = [
          {
            job_name = "prometheus";
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString queryPort}" ];
                labels = { instance = "localhost"; };
              }
            ];
          }
          {
            job_name = "pushgateway";
            scrape_interval = "1s";
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString pushgwPort}" ];
              }
            ];
          }
        ];
        rules = [
          ''
            groups:
              - name: test
                rules:
                  - record: testrule
                    expr: count(up{job="prometheus"})
          ''
        ];
        globalConfig = {
          external_labels = {
            some_label = "required by thanos";
          };
        };
        extraFlags = [
          # Required by thanos
          "--storage.tsdb.min-block-duration=5s"
          "--storage.tsdb.max-block-duration=5s"
        ];
      };
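      # Pushgateway scraped every second by the "pushgateway" job above; its
      # metric persistence is checked further down in the test script.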
      services.prometheus.pushgateway = {
        enable = true;
        web.listen-address = ":${toString pushgwPort}";
        persistMetrics = true;
        persistence.interval = "1s";
        stateDir = "prometheus-pushgateway";
      };
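      # The sidecar exposes the local Prometheus data over gRPC and uploads its
      # TSDB blocks to the S3 bucket configured above; both paths are exercised
      # by the test script.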
      services.thanos = {
        sidecar = {
          enable = true;
          grpc-address = "0.0.0.0:${toString grpcPort}";
          inherit objstore;
        };

        # TODO: Add some tests for these services:
        #rule = {
        #  enable = true;
        #  http-address = "0.0.0.0:19194";
        #  grpc-address = "0.0.0.0:19193";
        #  query.addresses = [
        #    "localhost:19191"
        #  ];
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
        #
        #receive = {
        #  http-address = "0.0.0.0:19195";
        #  enable = true;
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
      };
      # Adds a "specialisation" of the above config which allows us to
      # "switch" to it and see if the services.prometheus.enableReload
      # functionality actually reloads the prometheus service instead of
      # restarting it.
      specialisation = {
        "prometheus-config-change" = {
          configuration = {
            environment.systemPackages = [ pkgs.yq ];

            # This configuration just adds a new prometheus job
            # to scrape the node_exporter metrics of the s3 machine.
            services.prometheus = {
              scrapeConfigs = [
                {
                  job_name = "s3-node_exporter";
                  static_configs = [
                    {
                      targets = [ "s3:9100" ];
                    }
                  ];
                }
              ];
            };
          };
        };
      };
    };

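    # Runs the Thanos query and query-frontend services; queries are fanned out
    # to the sidecar on the prometheus node over gRPC.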
    query = { pkgs, ... }: {
      environment.systemPackages = [ pkgs.jq ];
      services.thanos.query = {
        enable = true;
        http-address = "0.0.0.0:${toString queryPort}";
        endpoints = [
          "prometheus:${toString grpcPort}"
        ];
      };
      services.thanos.query-frontend = {
        enable = true;
        http-address = "0.0.0.0:${toString frontPort}";
        query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
      };
    };

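    # Runs the Thanos store gateway and compactor against the same S3 bucket,
    # plus a local query service used to verify that blocks uploaded by the
    # sidecar can be read back from object storage.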
    store = { pkgs, ... }: {
      virtualisation.diskSize = 2 * 1024;
      virtualisation.memorySize = 2048;
      environment.systemPackages = with pkgs; [ jq thanos ];
      services.thanos.store = {
        enable = true;
        http-address = "0.0.0.0:10902";
        grpc-address = "0.0.0.0:${toString grpcPort}";
        inherit objstore;
        sync-block-duration = "1s";
      };
      services.thanos.compact = {
        enable = true;
        http-address = "0.0.0.0:10903";
        inherit objstore;
        consistency-delay = "5s";
      };
      services.thanos.query = {
        enable = true;
        http-address = "0.0.0.0:${toString queryPort}";
        endpoints = [
          "localhost:${toString grpcPort}"
        ];
      };
    };

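    # Provides the MinIO object store acting as the S3 backend for Thanos, plus
    # a node exporter that the "prometheus-config-change" specialisation
    # scrapes.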
    s3 = { pkgs, ... } : {
      # Minio requires at least 1GiB of free disk space to run.
      virtualisation = {
        diskSize = 2 * 1024;
      };
      networking.firewall.allowedTCPPorts = [ minioPort ];

      services.minio = {
        enable = true;
        inherit (s3) accessKey secretKey;
      };

      environment.systemPackages = [ pkgs.minio-client ];

      services.prometheus.exporters.node = {
        enable = true;
        openFirewall = true;
      };
    };
  };

  testScript = { nodes, ... } : ''
    import json

    # Before starting the other machines we first make sure that our S3 service is online
    # and has a bucket added for thanos:
    s3.start()
    s3.wait_for_unit("minio.service")
    s3.wait_for_open_port(${toString minioPort})
    s3.succeed(
        "mc config host add minio "
        + "http://localhost:${toString minioPort} "
        + "${s3.accessKey} ${s3.secretKey} --api s3v4",
        "mc mb minio/thanos-bucket",
    )

    # Now that s3 has started we can start the other machines:
    for machine in prometheus, query, store:
        machine.start()

    # Check if prometheus responds to requests:
    prometheus.wait_for_unit("prometheus.service")

    prometheus.wait_for_open_port(${toString queryPort})
    prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")

    # Let's test if pushing a metric to the pushgateway succeeds:
    prometheus.wait_for_unit("pushgateway.service")
    prometheus.succeed(
        "echo 'some_metric 3.14' | "
        + "curl -f --data-binary \@- "
        + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
    )

    # Now check whether that metric gets ingested by prometheus.
    # Since we'll check for the metric several times on different machines
    # we abstract the test using the following function:

    # Function to check if the metric "some_metric" has been received and returns the correct value.
    def wait_for_metric(machine):
        return machine.wait_until_succeeds(
            "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
            + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
        )


    wait_for_metric(prometheus)

    # Let's test if the pushgateway persists metrics to the configured location.
    prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")

    # Test thanos
    prometheus.wait_for_unit("thanos-sidecar.service")

    # Test if the Thanos query service can correctly retrieve the metric that was sent above.
    query.wait_for_unit("thanos-query.service")
    wait_for_metric(query)

    # Test Thanos query frontend service
    query.wait_for_unit("thanos-query-frontend.service")
    query.succeed("curl -sS http://localhost:${toString frontPort}/-/healthy")

    # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
    # Thanos storage service has correctly downloaded it from S3 and if the Thanos
    # query service running on $store can correctly retrieve the metric:
    store.wait_for_unit("thanos-store.service")
    wait_for_metric(store)

    store.wait_for_unit("thanos-compact.service")

    # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
    # and check if the blocks have the correct labels:
    store.succeed(
        "thanos tools bucket ls "
        + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
        + "--output=json | "
        + "jq .thanos.labels.some_label | "
        + "grep 'required by thanos'"
    )

    # Check if switching to a NixOS configuration that changes the prometheus
    # configuration reloads (instead of restarts) prometheus before the switch
    # finishes successfully:
    with subtest("config change reloads prometheus"):
        # We check if prometheus has finished reloading by looking for the message
        # "Completed loading of configuration file" in the journal between the start
        # and finish of switching to the new NixOS configuration.
        #
        # To mark the start we record the journal cursor before starting the switch:
        cursor_before_switching = json.loads(
            prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR")
        )["__CURSOR"]

        # Now we switch:
        prometheus_config_change = prometheus.succeed(
            "readlink /run/current-system/specialisation/prometheus-config-change"
        ).strip()
        prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test")

        # Next we retrieve all logs since the start of switching:
        logs_after_starting_switching = prometheus.succeed(
            """
              journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE
            """.format(
                cursor_before_switching=cursor_before_switching
            )
        )

        # Finally we check if the message "Completed loading of configuration file"
        # occurs before the "finished switching to system configuration" message:
        finished_switching_msg = (
            "finished switching to system configuration " + prometheus_config_change
        )
        reloaded_before_switching_finished = False
        finished_switching = False
        for log_line in logs_after_starting_switching.split("\n"):
            msg = json.loads(log_line)["MESSAGE"]
            if "Completed loading of configuration file" in msg:
                reloaded_before_switching_finished = True
            if msg == finished_switching_msg:
                finished_switching = True
                break

        assert reloaded_before_switching_finished
        assert finished_switching

        # Check if the reloaded config includes the new s3-node_exporter job:
        prometheus.succeed(
            """
              curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \
                | jq -r .data.yaml \
                | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \
                | grep true
            """
        )
  '';
}