1{ ... }:
2
3let
# TCP ports shared by the node definitions and the test script.
frontPort = 9092; # Thanos query-frontend HTTP listener (query node)
grpcPort = 19090; # gRPC listener of the Thanos sidecar and of the Thanos store
minioPort = 9000; # port the S3 endpoint on the s3 node is reached on
pushgwPort = 9091; # Prometheus pushgateway listener
queryPort = 9090; # HTTP port queried on Prometheus and on Thanos query
9
# Static credentials used only inside the test VMs: they are configured on
# the MinIO node, written into the Thanos object-store config and handed to
# `mc` by the test script.
s3.accessKey = "BKIKJAA5BMMU2RHO6IBB";
s3.secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
14
# Object-store settings inherited (via `inherit objstore;`) by the Thanos
# sidecar, store and compact services defined below.
objstore = {
  config = {
    type = "S3";
    config = {
      # Credentials are the ones the MinIO node is configured with.
      access_key = s3.accessKey;
      secret_key = s3.secretKey;

      # `s3` is the host name of the node running MinIO.
      endpoint = "s3:${toString minioPort}";
      bucket = "thanos-bucket";
      region = "us-east-1";

      insecure = true;
      signature_version2 = false;
      put_user_metadata = { };

      http_config.idle_conn_timeout = "0s";
      http_config.insecure_skip_verify = false;

      trace.enable = false;
    };
  };
};
35in
36{
37 name = "thanos";
38
39 nodes = {
# Node running Prometheus, the pushgateway and the Thanos sidecar.
prometheus =
  { pkgs, ... }:
  {
    virtualisation = {
      diskSize = 2 * 1024;
      memorySize = 2048;
    };

    # Command line tools the test script runs on this machine.
    environment.systemPackages = with pkgs; [
      grpc-health-probe
      jq
    ];

    # The sidecar's gRPC endpoint is contacted by the `query` node.
    networking.firewall.allowedTCPPorts = [ grpcPort ];

    services = {
      prometheus = {
        enable = true;
        enableReload = true;

        globalConfig.external_labels.some_label = "required by thanos";

        extraFlags = [
          # Required by thanos
          "--storage.tsdb.min-block-duration=5s"
          "--storage.tsdb.max-block-duration=5s"
        ];

        scrapeConfigs = [
          {
            job_name = "prometheus";
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString queryPort}" ];
                labels.instance = "localhost";
              }
            ];
          }
          {
            job_name = "pushgateway";
            scrape_interval = "1s";
            static_configs = [
              { targets = [ "127.0.0.1:${toString pushgwPort}" ]; }
            ];
          }
        ];

        rules = [
          ''
            groups:
              - name: test
                rules:
                  - record: testrule
                    expr: count(up{job="prometheus"})
          ''
        ];

        pushgateway = {
          enable = true;
          web.listen-address = ":${toString pushgwPort}";
          persistMetrics = true;
          persistence.interval = "1s";
          stateDir = "prometheus-pushgateway";
        };
      };

      thanos = {
        sidecar = {
          enable = true;
          grpc-address = "0.0.0.0:${toString grpcPort}";
          inherit objstore;
        };

        # TODO: Add some tests for these services:
        #rule = {
        #  enable = true;
        #  http-address = "0.0.0.0:19194";
        #  grpc-address = "0.0.0.0:19193";
        #  query.addresses = [
        #    "localhost:19191"
        #  ];
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
        #
        #receive = {
        #  http-address = "0.0.0.0:19195";
        #  enable = true;
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
      };
    };

    # A "specialisation" of the configuration above. Switching to it is meant
    # to show whether services.prometheus.enableReload really reloads the
    # prometheus service instead of restarting it.
    # NOTE(review): the test script visible in this file never switches to
    # this specialisation - confirm whether that check is still intended.
    specialisation."prometheus-config-change".configuration = {
      environment.systemPackages = [ pkgs.yq ];

      # The only change is one additional prometheus job that scrapes the
      # node_exporter metrics of the s3 machine.
      services.prometheus.scrapeConfigs = [
        {
          job_name = "s3-node_exporter";
          static_configs = [
            { targets = [ "s3:9100" ]; }
          ];
        }
      ];
    };
  };
159
# Node running Thanos query (fed by the sidecar on the prometheus node) and
# the Thanos query-frontend in front of it.
query =
  { pkgs, ... }:
  {
    # jq is used by the test script to inspect query responses.
    environment.systemPackages = [ pkgs.jq ];

    services.thanos = {
      query = {
        enable = true;
        http-address = "0.0.0.0:${toString queryPort}";
        endpoints = [ "prometheus:${toString grpcPort}" ];
      };

      query-frontend = {
        enable = true;
        http-address = "0.0.0.0:${toString frontPort}";
        # Forward to the Thanos query instance on this same machine.
        query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
      };
    };
  };
177
# Node running Thanos store and compact against the S3 bucket, plus a local
# Thanos query instance that reads from the store.
store =
  { pkgs, ... }:
  {
    virtualisation = {
      diskSize = 2 * 1024;
      memorySize = 2048;
    };

    # Tools the test script invokes on this machine.
    environment.systemPackages = [
      pkgs.grpc-health-probe
      pkgs.jq
      pkgs.thanos
    ];

    services.thanos = {
      store = {
        enable = true;
        http-address = "0.0.0.0:10902";
        grpc-address = "0.0.0.0:${toString grpcPort}";
        sync-block-duration = "1s";
        inherit objstore;
      };

      compact = {
        enable = true;
        http-address = "0.0.0.0:10903";
        consistency-delay = "5s";
        inherit objstore;
      };

      query = {
        enable = true;
        http-address = "0.0.0.0:${toString queryPort}";
        # Only the local store is queried here.
        endpoints = [ "localhost:${toString grpcPort}" ];
      };
    };
  };
209
# Node providing the S3 endpoint (MinIO) and a node_exporter.
s3 =
  { pkgs, ... }:
  {
    # Minio requires at least 1GiB of free disk space to run.
    virtualisation.diskSize = 2 * 1024;

    networking.firewall.allowedTCPPorts = [ minioPort ];

    # `mc` is used by the test script to create the bucket.
    environment.systemPackages = [ pkgs.minio-client ];

    services.minio = {
      enable = true;
      # `s3` here is the credentials set from the enclosing `let`.
      accessKey = s3.accessKey;
      secretKey = s3.secretKey;
    };

    services.prometheus.exporters.node.enable = true;
    services.prometheus.exporters.node.openFirewall = true;
  };
231 };
232
# Python test script run by the NixOS test driver. It brings up MinIO first,
# then checks that a metric pushed to the pushgateway is visible through
# Prometheus, through Thanos query (via the sidecar) and through the Thanos
# store (via the S3 bucket).
testScript =
  { nodes, ... }:
  ''
    # Before starting the other machines we first make sure that our S3 service is online
    # and has a bucket added for thanos:
    s3.start()
    s3.wait_for_unit("minio.service")
    s3.wait_for_open_port(${toString minioPort})
    s3.succeed(
        "mc alias set minio "
        + "http://localhost:${toString minioPort} "
        + "${s3.accessKey} ${s3.secretKey} --api s3v4",
        # Create exactly the bucket the Thanos object-store config points at.
        "mc mb minio/${objstore.config.config.bucket}",
    )

    # Now that s3 has started we can start the other machines:
    for machine in prometheus, query, store:
        machine.start()

    # Check if prometheus responds to requests:
    prometheus.wait_for_unit("prometheus.service")

    prometheus.wait_for_open_port(${toString queryPort})
    prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")

    prometheus.wait_until_succeeds("journalctl -o cat -u thanos-sidecar.service | grep 'listening for serving gRPC'")

    store.wait_until_succeeds("journalctl -o cat -u thanos-store.service | grep 'listening for serving gRPC'")

    for machine in prometheus, store:
        machine.wait_until_succeeds("grpc-health-probe -addr 127.0.0.1:${toString grpcPort}")

    # Let's test if pushing a metric to the pushgateway succeeds.
    # The unit being active does not guarantee the socket is already
    # listening, so wait for the port before the one-shot push.
    prometheus.wait_for_unit("pushgateway.service")
    prometheus.wait_for_open_port(${toString pushgwPort})
    prometheus.succeed(
        "echo 'some_metric 3.14' | "
        + "curl -f --data-binary @- "
        + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
    )

    # Now check whether that metric gets ingested by prometheus.
    # Since we'll check for the metric several times on different machines
    # we abstract the test using the following function:

    # Wait until the metric "some_metric" has been received by the given
    # machine and has the expected value.
    def wait_for_metric(machine):
        return machine.wait_until_succeeds(
            "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
            + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
        )


    wait_for_metric(prometheus)

    # Let's test if the pushgateway persists metrics to the configured location.
    prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")

    # Test thanos
    prometheus.wait_for_unit("thanos-sidecar.service")

    # Test if the Thanos query service can correctly retrieve the metric that was sent above.
    query.wait_for_unit("thanos-query.service")
    wait_for_metric(query)

    # Test Thanos query frontend service. `-f` makes curl fail on an HTTP
    # error status, so an unhealthy response actually fails the test.
    query.wait_for_unit("thanos-query-frontend.service")
    query.wait_for_open_port(${toString frontPort})
    query.succeed("curl -sSf http://localhost:${toString frontPort}/-/healthy")

    # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
    # Thanos storage service has correctly downloaded it from S3 and if the Thanos
    # query service running on the store machine can correctly retrieve the metric:
    store.wait_for_unit("thanos-store.service")
    wait_for_metric(store)

    store.wait_for_unit("thanos-compact.service")

    # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
    # and check if the blocks have the correct labels:
    store.succeed(
        "thanos tools bucket ls "
        + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
        + "--output=json | "
        + "jq .thanos.labels.some_label | "
        + "grep 'required by thanos'"
    )
  '';
319}