let
  # Port assignments shared between the node configs and the test script below.
  grpcPort = 19090;  # Thanos StoreAPI (gRPC) port used by sidecar and store.
  queryPort = 9090;  # HTTP port for both Prometheus and the Thanos query frontends.
  minioPort = 9000;  # S3-compatible MinIO endpoint on the "s3" machine.
  pushgwPort = 9091; # Prometheus pushgateway listen port.

  # Static credentials for the test MinIO instance; referenced both by
  # services.minio on the "s3" node and by the Thanos object-store config below.
  s3 = {
    accessKey = "BKIKJAA5BMMU2RHO6IBB";
    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
  };

  # Thanos object storage configuration, shared (via `inherit objstore`) by the
  # sidecar, store and compact services so they all talk to the same bucket.
  objstore.config = {
    type = "S3";
    config = {
      bucket = "thanos-bucket";
      # "s3" is the hostname of the MinIO machine in the test network.
      endpoint = "s3:${toString minioPort}";
      region = "us-east-1";
      access_key = s3.accessKey;
      secret_key = s3.secretKey;
      insecure = true; # plain HTTP; the test network is private
      signature_version2 = false;
      put_user_metadata = {};
      http_config = {
        idle_conn_timeout = "0s";
        insecure_skip_verify = false;
      };
      trace = {
        enable = false;
      };
    };
  };
32
33in import ./make-test-python.nix {
34 name = "prometheus";
35
36 nodes = {
    prometheus = { pkgs, ... }: {
      # Thanos/Prometheus TSDB data needs more room than the VM test defaults.
      virtualisation.diskSize = 2 * 1024;
      virtualisation.memorySize = 2048;
      # jq is used by the test script to inspect query results.
      environment.systemPackages = [ pkgs.jq ];
      # Expose the sidecar's StoreAPI so the "query" machine can reach it.
      networking.firewall.allowedTCPPorts = [ grpcPort ];
      services.prometheus = {
        enable = true;
        # Reload (instead of restart) prometheus on config changes; the
        # "config change reloads prometheus" subtest below depends on this.
        enableReload = true;
        scrapeConfigs = [
          {
            # Prometheus scraping its own /metrics endpoint.
            job_name = "prometheus";
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString queryPort}" ];
                labels = { instance = "localhost"; };
              }
            ];
          }
          {
            # Scrape the pushgateway every second so the pushed test metric
            # shows up quickly.
            job_name = "pushgateway";
            scrape_interval = "1s";
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString pushgwPort}" ];
              }
            ];
          }
        ];
        rules = [
          ''
            groups:
              - name: test
                rules:
                  - record: testrule
                    expr: count(up{job="prometheus"})
          ''
        ];
        globalConfig = {
          external_labels = {
            # The test script later checks that uploaded blocks carry this label.
            some_label = "required by thanos";
          };
        };
        extraFlags = [
          # Required by thanos
          "--storage.tsdb.min-block-duration=5s"
          "--storage.tsdb.max-block-duration=5s"
        ];
      };
      services.prometheus.pushgateway = {
        enable = true;
        web.listen-address = ":${toString pushgwPort}";
        # Persist pushed metrics to disk; the test checks that the metrics
        # file appears under /var/lib/prometheus-pushgateway.
        persistMetrics = true;
        persistence.interval = "1s";
        stateDir = "prometheus-pushgateway";
      };
      services.thanos = {
        sidecar = {
          enable = true;
          grpc-address = "0.0.0.0:${toString grpcPort}";
          # Upload the local Prometheus TSDB blocks to the shared S3 bucket.
          inherit objstore;
        };

        # TODO: Add some tests for these services:
        #rule = {
        #  enable = true;
        #  http-address = "0.0.0.0:19194";
        #  grpc-address = "0.0.0.0:19193";
        #  query.addresses = [
        #    "localhost:19191"
        #  ];
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
        #
        #receive = {
        #  http-address = "0.0.0.0:19195";
        #  enable = true;
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
      };
      # Adds a "specialisation" of the above config which allows us to
      # "switch" to it and see if the services.prometheus.enableReload
      # functionality actually reloads the prometheus service instead of
      # restarting it.
      specialisation = {
        "prometheus-config-change" = {
          configuration = {
            # yq is used by the test script to check the reloaded config.
            environment.systemPackages = [ pkgs.yq ];

            # This configuration just adds a new prometheus job
            # to scrape the node_exporter metrics of the s3 machine.
            services.prometheus = {
              scrapeConfigs = [
                {
                  job_name = "s3-node_exporter";
                  static_configs = [
                    {
                      targets = [ "s3:9100" ];
                    }
                  ];
                }
              ];
            };
          };
        };
      };
    };
149
150 query = { pkgs, ... }: {
151 environment.systemPackages = [ pkgs.jq ];
152 services.thanos.query = {
153 enable = true;
154 http-address = "0.0.0.0:${toString queryPort}";
155 store.addresses = [
156 "prometheus:${toString grpcPort}"
157 ];
158 };
159 };
160
    store = { pkgs, ... }: {
      # The store gateway downloads TSDB blocks locally; give it extra disk
      # and memory compared to the VM test defaults.
      virtualisation.diskSize = 2 * 1024;
      virtualisation.memorySize = 2048;
      # jq for inspecting query results; thanos for the `thanos tools bucket ls`
      # invocation in the test script.
      environment.systemPackages = with pkgs; [ jq thanos ];
      services.thanos.store = {
        enable = true;
        http-address = "0.0.0.0:10902";
        grpc-address = "0.0.0.0:${toString grpcPort}";
        # Serve blocks from the same bucket the sidecar uploads to.
        inherit objstore;
        # Poll the bucket for new blocks every second so the test does not
        # have to wait long for the pushed metric to become queryable.
        sync-block-duration = "1s";
      };
      services.thanos.compact = {
        enable = true;
        http-address = "0.0.0.0:10903";
        inherit objstore;
        consistency-delay = "5s";
      };
      # Local query frontend talking to the store gateway above.
      services.thanos.query = {
        enable = true;
        http-address = "0.0.0.0:${toString queryPort}";
        store.addresses = [
          "localhost:${toString grpcPort}"
        ];
      };
    };
186
    s3 = { pkgs, ... } : {
      # Minio requires at least 1GiB of free disk space to run.
      virtualisation = {
        diskSize = 2 * 1024;
      };
      # Let the other machines reach the S3 endpoint.
      networking.firewall.allowedTCPPorts = [ minioPort ];

      services.minio = {
        enable = true;
        # Credentials from the shared `s3` let-binding at the top of the file.
        inherit (s3) accessKey secretKey;
      };

      # mc (minio-client) is used by the test script to create the bucket.
      environment.systemPackages = [ pkgs.minio-client ];

      # Scraped by the "s3-node_exporter" job added in the
      # prometheus-config-change specialisation.
      services.prometheus.exporters.node = {
        enable = true;
        openFirewall = true;
      };
    };
206 };
207
208 testScript = { nodes, ... } : ''
209 import json
210
211 # Before starting the other machines we first make sure that our S3 service is online
212 # and has a bucket added for thanos:
213 s3.start()
214 s3.wait_for_unit("minio.service")
215 s3.wait_for_open_port(${toString minioPort})
216 s3.succeed(
217 "mc config host add minio "
218 + "http://localhost:${toString minioPort} "
219 + "${s3.accessKey} ${s3.secretKey} --api s3v4",
220 "mc mb minio/thanos-bucket",
221 )
222
223 # Now that s3 has started we can start the other machines:
224 for machine in prometheus, query, store:
225 machine.start()
226
227 # Check if prometheus responds to requests:
228 prometheus.wait_for_unit("prometheus.service")
229
230 prometheus.wait_for_open_port(${toString queryPort})
231 prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")
232
233 # Let's test if pushing a metric to the pushgateway succeeds:
234 prometheus.wait_for_unit("pushgateway.service")
235 prometheus.succeed(
236 "echo 'some_metric 3.14' | "
237 + "curl -f --data-binary \@- "
238 + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
239 )
240
241 # Now check whether that metric gets ingested by prometheus.
242 # Since we'll check for the metric several times on different machines
243 # we abstract the test using the following function:
244
245 # Function to check if the metric "some_metric" has been received and returns the correct value.
246 def wait_for_metric(machine):
247 return machine.wait_until_succeeds(
248 "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
249 + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
250 )
251
252
253 wait_for_metric(prometheus)
254
255 # Let's test if the pushgateway persists metrics to the configured location.
256 prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")
257
258 # Test thanos
259 prometheus.wait_for_unit("thanos-sidecar.service")
260
261 # Test if the Thanos query service can correctly retrieve the metric that was send above.
262 query.wait_for_unit("thanos-query.service")
263 wait_for_metric(query)
264
265 # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
266 # Thanos storage service has correctly downloaded it from S3 and if the Thanos
267 # query service running on $store can correctly retrieve the metric:
268 store.wait_for_unit("thanos-store.service")
269 wait_for_metric(store)
270
271 store.wait_for_unit("thanos-compact.service")
272
273 # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
274 # and check if the blocks have the correct labels:
275 store.succeed(
276 "thanos tools bucket ls "
277 + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
278 + "--output=json | "
279 + "jq .thanos.labels.some_label | "
280 + "grep 'required by thanos'"
281 )
282
283 # Check if switching to a NixOS configuration that changes the prometheus
284 # configuration reloads (instead of restarts) prometheus before the switch
285 # finishes successfully:
286 with subtest("config change reloads prometheus"):
287 # We check if prometheus has finished reloading by looking for the message
288 # "Completed loading of configuration file" in the journal between the start
289 # and finish of switching to the new NixOS configuration.
290 #
291 # To mark the start we record the journal cursor before starting the switch:
292 cursor_before_switching = json.loads(
293 prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR")
294 )["__CURSOR"]
295
296 # Now we switch:
297 prometheus_config_change = prometheus.succeed(
298 "readlink /run/current-system/specialisation/prometheus-config-change"
299 ).strip()
300 prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test")
301
302 # Next we retrieve all logs since the start of switching:
303 logs_after_starting_switching = prometheus.succeed(
304 """
305 journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE
306 """.format(
307 cursor_before_switching=cursor_before_switching
308 )
309 )
310
311 # Finally we check if the message "Completed loading of configuration file"
312 # occurs before the "finished switching to system configuration" message:
313 finished_switching_msg = (
314 "finished switching to system configuration " + prometheus_config_change
315 )
316 reloaded_before_switching_finished = False
317 finished_switching = False
318 for log_line in logs_after_starting_switching.split("\n"):
319 msg = json.loads(log_line)["MESSAGE"]
320 if "Completed loading of configuration file" in msg:
321 reloaded_before_switching_finished = True
322 if msg == finished_switching_msg:
323 finished_switching = True
324 break
325
326 assert reloaded_before_switching_finished
327 assert finished_switching
328
329 # Check if the reloaded config includes the new s3-node_exporter job:
330 prometheus.succeed(
331 """
332 curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \
333 | jq -r .data.yaml \
334 | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \
335 | grep true
336 """
337 )
338 '';
339}