let
  # TCP ports used throughout the test; kept in one place so the node
  # configurations and the test script stay in sync.
  grpcPort = 19090; # Thanos sidecar/store gRPC (StoreAPI)
  queryPort = 9090; # Prometheus web UI / Thanos query HTTP
  minioPort = 9000; # MinIO S3-compatible object store
  pushgwPort = 9091; # Prometheus pushgateway
  frontPort = 9092; # Thanos query-frontend HTTP

  # Static credentials shared by the MinIO server (s3 node) and every
  # Thanos component that talks to the bucket.
  s3 = {
    accessKey = "BKIKJAA5BMMU2RHO6IBB";
    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
  };

  # Object-storage client configuration handed (via `inherit objstore`)
  # to the Thanos sidecar, store and compact services below.
  objstore.config = {
    type = "S3";
    config = {
      bucket = "thanos-bucket";
      endpoint = "s3:${toString minioPort}"; # "s3" is the MinIO node's hostname
      region = "us-east-1";
      access_key = s3.accessKey;
      secret_key = s3.secretKey;
      insecure = true; # plain HTTP inside the test network
      signature_version2 = false;
      put_user_metadata = { };
      http_config = {
        idle_conn_timeout = "0s";
        insecure_skip_verify = false;
      };
      trace = {
        enable = false;
      };
    };
  };

34in
35import ./make-test-python.nix {
36 name = "prometheus";
37
38 nodes = {
    prometheus =
      { pkgs, ... }:
      {
        virtualisation.diskSize = 2 * 1024;
        virtualisation.memorySize = 2048;
        # jq is used by the test script to inspect query results.
        environment.systemPackages = [ pkgs.jq ];
        # Expose the Thanos sidecar's gRPC StoreAPI to the query machine.
        networking.firewall.allowedTCPPorts = [ grpcPort ];
        services.prometheus = {
          enable = true;
          # Reload instead of restart on config changes; exercised by the
          # "prometheus-config-change" specialisation below.
          enableReload = true;
          scrapeConfigs = [
            # Prometheus scrapes its own /metrics endpoint ...
            {
              job_name = "prometheus";
              static_configs = [
                {
                  targets = [ "127.0.0.1:${toString queryPort}" ];
                  labels = {
                    instance = "localhost";
                  };
                }
              ];
            }
            # ... and the pushgateway, through which the test script
            # pushes the "some_metric" sample.
            {
              job_name = "pushgateway";
              scrape_interval = "1s";
              static_configs = [
                {
                  targets = [ "127.0.0.1:${toString pushgwPort}" ];
                }
              ];
            }
          ];
          # A minimal recording rule, so rule loading is covered too.
          rules = [
            ''
              groups:
              - name: test
                rules:
                - record: testrule
                  expr: count(up{job="prometheus"})
            ''
          ];
          globalConfig = {
            # Thanos needs at least one external label to identify this
            # Prometheus instance; the test script greps for this value
            # in the uploaded block metadata at the very end.
            external_labels = {
              some_label = "required by thanos";
            };
          };
          extraFlags = [
            # Required by thanos
            "--storage.tsdb.min-block-duration=5s"
            "--storage.tsdb.max-block-duration=5s"
          ];
        };
        services.prometheus.pushgateway = {
          enable = true;
          web.listen-address = ":${toString pushgwPort}";
          # Persist pushed metrics to disk every second; the test checks
          # that /var/lib/prometheus-pushgateway/metrics appears.
          persistMetrics = true;
          persistence.interval = "1s";
          stateDir = "prometheus-pushgateway";
        };
        services.thanos = {
          # The sidecar serves Prometheus' TSDB over gRPC and uploads
          # finished blocks to the S3 bucket.
          sidecar = {
            enable = true;
            grpc-address = "0.0.0.0:${toString grpcPort}";
            inherit objstore;
          };

          # TODO: Add some tests for these services:
          #rule = {
          #  enable = true;
          #  http-address = "0.0.0.0:19194";
          #  grpc-address = "0.0.0.0:19193";
          #  query.addresses = [
          #    "localhost:19191"
          #  ];
          #  labels = {
          #    just = "some";
          #    nice = "labels";
          #  };
          #};
          #
          #receive = {
          #  http-address = "0.0.0.0:19195";
          #  enable = true;
          #  labels = {
          #    just = "some";
          #    nice = "labels";
          #  };
          #};
        };
        # Adds a "specialisation" of the above config which allows us to
        # "switch" to it and see if the services.prometheus.enableReload
        # functionality actually reloads the prometheus service instead of
        # restarting it.
        specialisation = {
          "prometheus-config-change" = {
            configuration = {
              environment.systemPackages = [ pkgs.yq ];

              # This configuration just adds a new prometheus job
              # to scrape the node_exporter metrics of the s3 machine.
              services.prometheus = {
                scrapeConfigs = [
                  {
                    job_name = "s3-node_exporter";
                    static_configs = [
                      {
                        targets = [ "s3:9100" ];
                      }
                    ];
                  }
                ];
              };
            };
          };
        };
      };
155
156 query =
157 { pkgs, ... }:
158 {
159 environment.systemPackages = [ pkgs.jq ];
160 services.thanos.query = {
161 enable = true;
162 http-address = "0.0.0.0:${toString queryPort}";
163 endpoints = [
164 "prometheus:${toString grpcPort}"
165 ];
166 };
167 services.thanos.query-frontend = {
168 enable = true;
169 http-address = "0.0.0.0:${toString frontPort}";
170 query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
171 };
172 };
173
174 store =
175 { pkgs, ... }:
176 {
177 virtualisation.diskSize = 2 * 1024;
178 virtualisation.memorySize = 2048;
179 environment.systemPackages = with pkgs; [
180 jq
181 thanos
182 ];
183 services.thanos.store = {
184 enable = true;
185 http-address = "0.0.0.0:10902";
186 grpc-address = "0.0.0.0:${toString grpcPort}";
187 inherit objstore;
188 sync-block-duration = "1s";
189 };
190 services.thanos.compact = {
191 enable = true;
192 http-address = "0.0.0.0:10903";
193 inherit objstore;
194 consistency-delay = "5s";
195 };
196 services.thanos.query = {
197 enable = true;
198 http-address = "0.0.0.0:${toString queryPort}";
199 endpoints = [
200 "localhost:${toString grpcPort}"
201 ];
202 };
203 };
204
205 s3 =
206 { pkgs, ... }:
207 {
208 # Minio requires at least 1GiB of free disk space to run.
209 virtualisation = {
210 diskSize = 2 * 1024;
211 };
212 networking.firewall.allowedTCPPorts = [ minioPort ];
213
214 services.minio = {
215 enable = true;
216 inherit (s3) accessKey secretKey;
217 };
218
219 environment.systemPackages = [ pkgs.minio-client ];
220
221 services.prometheus.exporters.node = {
222 enable = true;
223 openFirewall = true;
224 };
225 };
226 };
227
  # End-to-end test script (Python, executed by the NixOS test driver).
  # Flow: bring up MinIO and create the bucket; boot the other machines;
  # push a metric through the pushgateway and verify it is ingested by
  # Prometheus, served via the sidecar to the query node, uploaded to S3
  # and served back from S3 by the store node; finally check the block
  # labels in the bucket with `thanos tools bucket ls`.
  # NOTE: the script body below is a runtime string and is left unchanged.
  testScript =
    { nodes, ... }:
    ''
      # Before starting the other machines we first make sure that our S3 service is online
      # and has a bucket added for thanos:
      s3.start()
      s3.wait_for_unit("minio.service")
      s3.wait_for_open_port(${toString minioPort})
      s3.succeed(
          "mc config host add minio "
          + "http://localhost:${toString minioPort} "
          + "${s3.accessKey} ${s3.secretKey} --api s3v4",
          "mc mb minio/thanos-bucket",
      )

      # Now that s3 has started we can start the other machines:
      for machine in prometheus, query, store:
          machine.start()

      # Check if prometheus responds to requests:
      prometheus.wait_for_unit("prometheus.service")

      prometheus.wait_for_open_port(${toString queryPort})
      prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")

      # Let's test if pushing a metric to the pushgateway succeeds:
      prometheus.wait_for_unit("pushgateway.service")
      prometheus.succeed(
          "echo 'some_metric 3.14' | "
          + "curl -f --data-binary \@- "
          + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
      )

      # Now check whether that metric gets ingested by prometheus.
      # Since we'll check for the metric several times on different machines
      # we abstract the test using the following function:

      # Function to check if the metric "some_metric" has been received and returns the correct value.
      def wait_for_metric(machine):
          return machine.wait_until_succeeds(
              "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
              + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
          )


      wait_for_metric(prometheus)

      # Let's test if the pushgateway persists metrics to the configured location.
      prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")

      # Test thanos
      prometheus.wait_for_unit("thanos-sidecar.service")

      # Test if the Thanos query service can correctly retrieve the metric that was send above.
      query.wait_for_unit("thanos-query.service")
      wait_for_metric(query)

      # Test Thanos query frontend service
      query.wait_for_unit("thanos-query-frontend.service")
      query.succeed("curl -sS http://localhost:${toString frontPort}/-/healthy")

      # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
      # Thanos storage service has correctly downloaded it from S3 and if the Thanos
      # query service running on $store can correctly retrieve the metric:
      store.wait_for_unit("thanos-store.service")
      wait_for_metric(store)

      store.wait_for_unit("thanos-compact.service")

      # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
      # and check if the blocks have the correct labels:
      store.succeed(
          "thanos tools bucket ls "
          + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
          + "--output=json | "
          + "jq .thanos.labels.some_label | "
          + "grep 'required by thanos'"
      )
    '';
307}