at master 9.8 kB view raw
# Integration test for Thanos (test-driver style: `nodes` + `testScript`).
#
# Four machines are defined:
#   * s3         - MinIO object store that backs the Thanos bucket.
#   * prometheus - Prometheus + Pushgateway + Thanos sidecar (uploads blocks).
#   * query      - Thanos query + query-frontend, talking to the sidecar.
#   * store      - Thanos store + compact + a local Thanos query that reads
#                  the blocks back from the bucket.
#
# The test script pushes one metric through the Pushgateway and then checks
# that it is visible through Prometheus, through the Thanos query node and,
# after the round trip through S3, through the store node.
{ ... }:

let
  # Ports shared between the machine definitions and the test script.
  grpcPort = 19090;
  queryPort = 9090;
  minioPort = 9000;
  pushgwPort = 9091;
  frontPort = 9092;

  # Throw-away credentials used only inside the test VMs.
  s3 = {
    accessKey = "BKIKJAA5BMMU2RHO6IBB";
    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
  };

  # Object storage configuration inherited by the sidecar, store and compact
  # services below; it points at the MinIO instance on the `s3` machine.
  objstore.config = {
    type = "S3";
    config = {
      bucket = "thanos-bucket";
      endpoint = "s3:${toString minioPort}";
      region = "us-east-1";
      access_key = s3.accessKey;
      secret_key = s3.secretKey;
      insecure = true;
      signature_version2 = false;
      put_user_metadata = { };
      http_config = {
        idle_conn_timeout = "0s";
        insecure_skip_verify = false;
      };
      trace = {
        enable = false;
      };
    };
  };
in
{
  name = "thanos";

  nodes = {
    prometheus =
      { pkgs, ... }:
      {
        virtualisation.diskSize = 2 * 1024;
        virtualisation.memorySize = 2048;
        environment.systemPackages = [
          pkgs.grpc-health-probe
          pkgs.jq
        ];
        # The sidecar's gRPC port must be reachable from the `query` machine.
        networking.firewall.allowedTCPPorts = [ grpcPort ];
        services.prometheus = {
          enable = true;
          enableReload = true;
          scrapeConfigs = [
            {
              job_name = "prometheus";
              static_configs = [
                {
                  targets = [ "127.0.0.1:${toString queryPort}" ];
                  labels = {
                    instance = "localhost";
                  };
                }
              ];
            }
            {
              job_name = "pushgateway";
              scrape_interval = "1s";
              static_configs = [
                {
                  targets = [ "127.0.0.1:${toString pushgwPort}" ];
                }
              ];
            }
          ];
          rules = [
            ''
              groups:
                - name: test
                  rules:
                    - record: testrule
                      expr: count(up{job="prometheus"})
            ''
          ];
          globalConfig = {
            external_labels = {
              some_label = "required by thanos";
            };
          };
          extraFlags = [
            # Required by thanos
            "--storage.tsdb.min-block-duration=5s"
            "--storage.tsdb.max-block-duration=5s"
          ];
        };
        services.prometheus.pushgateway = {
          enable = true;
          web.listen-address = ":${toString pushgwPort}";
          persistMetrics = true;
          persistence.interval = "1s";
          stateDir = "prometheus-pushgateway";
        };
        services.thanos = {
          sidecar = {
            enable = true;
            grpc-address = "0.0.0.0:${toString grpcPort}";
            inherit objstore;
          };

          # TODO: Add some tests for these services:
          #rule = {
          #  enable = true;
          #  http-address = "0.0.0.0:19194";
          #  grpc-address = "0.0.0.0:19193";
          #  query.addresses = [
          #    "localhost:19191"
          #  ];
          #  labels = {
          #    just = "some";
          #    nice = "labels";
          #  };
          #};
          #
          #receive = {
          #  http-address = "0.0.0.0:19195";
          #  enable = true;
          #  labels = {
          #    just = "some";
          #    nice = "labels";
          #  };
          #};
        };
        # Adds a "specialisation" of the above config which allows us to
        # "switch" to it and see if the services.prometheus.enableReload
        # functionality actually reloads the prometheus service instead of
        # restarting it.
        # NOTE(review): the test script in this file never switches to this
        # specialisation - confirm whether that check is still intended.
        specialisation = {
          "prometheus-config-change" = {
            configuration = {
              environment.systemPackages = [ pkgs.yq ];

              # This configuration just adds a new prometheus job
              # to scrape the node_exporter metrics of the s3 machine.
              services.prometheus = {
                scrapeConfigs = [
                  {
                    job_name = "s3-node_exporter";
                    static_configs = [
                      {
                        targets = [ "s3:9100" ];
                      }
                    ];
                  }
                ];
              };
            };
          };
        };
      };

    query =
      { pkgs, ... }:
      {
        environment.systemPackages = [ pkgs.jq ];
        # Queries the sidecar on the `prometheus` machine over gRPC.
        services.thanos.query = {
          enable = true;
          http-address = "0.0.0.0:${toString queryPort}";
          endpoints = [
            "prometheus:${toString grpcPort}"
          ];
        };
        # The frontend forwards to the local Thanos query instance.
        services.thanos.query-frontend = {
          enable = true;
          http-address = "0.0.0.0:${toString frontPort}";
          query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
        };
      };

    store =
      { pkgs, ... }:
      {
        virtualisation.diskSize = 2 * 1024;
        virtualisation.memorySize = 2048;
        environment.systemPackages = with pkgs; [
          grpc-health-probe
          jq
          thanos
        ];
        services.thanos.store = {
          enable = true;
          http-address = "0.0.0.0:10902";
          grpc-address = "0.0.0.0:${toString grpcPort}";
          inherit objstore;
          sync-block-duration = "1s";
        };
        services.thanos.compact = {
          enable = true;
          http-address = "0.0.0.0:10903";
          inherit objstore;
          consistency-delay = "5s";
        };
        # Local query instance that reads from the store gateway above.
        services.thanos.query = {
          enable = true;
          http-address = "0.0.0.0:${toString queryPort}";
          endpoints = [
            "localhost:${toString grpcPort}"
          ];
        };
      };

    s3 =
      { pkgs, ... }:
      {
        # Minio requires at least 1GiB of free disk space to run.
        virtualisation = {
          diskSize = 2 * 1024;
        };
        networking.firewall.allowedTCPPorts = [ minioPort ];

        services.minio = {
          enable = true;
          inherit (s3) accessKey secretKey;
        };

        environment.systemPackages = [ pkgs.minio-client ];

        services.prometheus.exporters.node = {
          enable = true;
          openFirewall = true;
        };
      };
  };

  testScript =
    { nodes, ... }:
    ''
      # Before starting the other machines we first make sure that our S3 service is online
      # and has a bucket added for thanos:
      s3.start()
      s3.wait_for_unit("minio.service")
      s3.wait_for_open_port(${toString minioPort})
      s3.succeed(
          "mc alias set minio "
          + "http://localhost:${toString minioPort} "
          + "${s3.accessKey} ${s3.secretKey} --api s3v4",
          "mc mb minio/thanos-bucket",
      )

      # Now that s3 has started we can start the other machines:
      for machine in prometheus, query, store:
          machine.start()

      # Check if prometheus responds to requests:
      prometheus.wait_for_unit("prometheus.service")

      prometheus.wait_for_open_port(${toString queryPort})
      prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")

      prometheus.wait_until_succeeds("journalctl -o cat -u thanos-sidecar.service | grep 'listening for serving gRPC'")

      store.wait_until_succeeds("journalctl -o cat -u thanos-store.service | grep 'listening for serving gRPC'")

      for machine in prometheus, store:
          machine.wait_until_succeeds("grpc-health-probe -addr 127.0.0.1:${toString grpcPort}")

      # Let's test if pushing a metric to the pushgateway succeeds:
      prometheus.wait_for_unit("pushgateway.service")
      # "@-" makes curl read the request body from stdin. It needs no escaping
      # here; a backslash before "@" would be an invalid Python escape sequence.
      prometheus.succeed(
          "echo 'some_metric 3.14' | "
          + "curl -f --data-binary @- "
          + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
      )

      # Now check whether that metric gets ingested by prometheus.
      # Since we'll check for the metric several times on different machines
      # we abstract the test using the following function:

      # Function to check if the metric "some_metric" has been received and returns the correct value.
      def wait_for_metric(machine):
          return machine.wait_until_succeeds(
              "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
              + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
          )


      wait_for_metric(prometheus)

      # Let's test if the pushgateway persists metrics to the configured location.
      prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")

      # Test thanos
      prometheus.wait_for_unit("thanos-sidecar.service")

      # Test if the Thanos query service can correctly retrieve the metric that was sent above.
      query.wait_for_unit("thanos-query.service")
      wait_for_metric(query)

      # Test Thanos query frontend service.
      # A started unit does not imply a listening socket, so wait for the port
      # first; -f makes curl fail on HTTP error statuses instead of passing.
      query.wait_for_unit("thanos-query-frontend.service")
      query.wait_for_open_port(${toString frontPort})
      query.succeed("curl -sSf http://localhost:${toString frontPort}/-/healthy")

      # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
      # Thanos storage service has correctly downloaded it from S3 and if the Thanos
      # query service running on $store can correctly retrieve the metric:
      store.wait_for_unit("thanos-store.service")
      wait_for_metric(store)

      store.wait_for_unit("thanos-compact.service")

      # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
      # and check if the blocks have the correct labels:
      store.succeed(
          "thanos tools bucket ls "
          + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
          + "--output=json | "
          + "jq .thanos.labels.some_label | "
          + "grep 'required by thanos'"
      )
    '';
}