# NixOS VM integration test: Prometheus + Pushgateway + Thanos, backed by a MinIO S3 bucket.
let
  # TCP ports shared between the node configurations and the test script.
  grpcPort   = 19090; # Thanos gRPC StoreAPI (sidecar on `prometheus`, store on `store`)
  queryPort  =  9090; # Prometheus web UI / Thanos query HTTP
  minioPort  =  9000; # MinIO S3 endpoint on the `s3` machine
  pushgwPort =  9091; # Prometheus Pushgateway
  frontPort  =  9092; # Thanos query-frontend

  # Static credentials for the in-test MinIO instance; they never leave the VMs.
  s3 = {
    accessKey = "BKIKJAA5BMMU2RHO6IBB";
    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
  };

  # Thanos object-storage configuration pointing at the MinIO bucket.
  # Shared (via `inherit objstore`) by the sidecar, store and compact services.
  objstore.config = {
    type = "S3";
    config = {
      bucket = "thanos-bucket";
      endpoint = "s3:${toString minioPort}";
      region = "us-east-1";
      access_key = s3.accessKey;
      secret_key = s3.secretKey;
      insecure = true; # plain HTTP inside the test network
      signature_version2 = false;
      put_user_metadata = {};
      http_config = {
        idle_conn_timeout = "0s";
        insecure_skip_verify = false;
      };
      trace = {
        enable = false;
      };
    };
  };

in import ./make-test-python.nix {
  name = "prometheus";

  nodes = {
    # Runs Prometheus itself, the Pushgateway and the Thanos sidecar that
    # uploads Prometheus' TSDB blocks to S3.
    prometheus = { pkgs, ... }: {
      virtualisation.diskSize = 2 * 1024;
      virtualisation.memorySize = 2048;
      environment.systemPackages = [ pkgs.jq ];
      networking.firewall.allowedTCPPorts = [ grpcPort ];
      services.prometheus = {
        enable = true;
        # Reload (not restart) prometheus on config changes; exercised by the
        # "config change reloads prometheus" subtest below.
        enableReload = true;
        scrapeConfigs = [
          {
            job_name = "prometheus";
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString queryPort}" ];
                labels = { instance = "localhost"; };
              }
            ];
          }
          {
            job_name = "pushgateway";
            scrape_interval = "1s"; # fast scraping so pushed metrics show up quickly
            static_configs = [
              {
                targets = [ "127.0.0.1:${toString pushgwPort}" ];
              }
            ];
          }
        ];
        rules = [
          ''
            groups:
              - name: test
                rules:
                  - record: testrule
                    expr: count(up{job="prometheus"})
          ''
        ];
        globalConfig = {
          external_labels = {
            # The sidecar refuses to upload blocks without external labels;
            # the test script later greps for this label via `thanos tools bucket ls`.
            some_label = "required by thanos";
          };
        };
        extraFlags = [
          # Required by thanos
          "--storage.tsdb.min-block-duration=5s"
          "--storage.tsdb.max-block-duration=5s"
        ];
      };
      services.prometheus.pushgateway = {
        enable = true;
        web.listen-address = ":${toString pushgwPort}";
        persistMetrics = true;
        persistence.interval = "1s";
        stateDir = "prometheus-pushgateway";
      };
      services.thanos = {
        sidecar = {
          enable = true;
          grpc-address = "0.0.0.0:${toString grpcPort}";
          inherit objstore;
        };

        # TODO: Add some tests for these services:
        #rule = {
        #  enable = true;
        #  http-address = "0.0.0.0:19194";
        #  grpc-address = "0.0.0.0:19193";
        #  query.addresses = [
        #    "localhost:19191"
        #  ];
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
        #
        #receive = {
        #  http-address = "0.0.0.0:19195";
        #  enable = true;
        #  labels = {
        #    just = "some";
        #    nice = "labels";
        #  };
        #};
      };
      # Adds a "specialisation" of the above config which allows us to
      # "switch" to it and see if the services.prometheus.enableReload
      # functionality actually reloads the prometheus service instead of
      # restarting it.
      specialisation = {
        "prometheus-config-change" = {
          configuration = {
            environment.systemPackages = [ pkgs.yq ];

            # This configuration just adds a new prometheus job
            # to scrape the node_exporter metrics of the s3 machine.
            services.prometheus = {
              scrapeConfigs = [
                {
                  job_name = "s3-node_exporter";
                  static_configs = [
                    {
                      targets = [ "s3:9100" ];
                    }
                  ];
                }
              ];
            };
          };
        };
      };
    };

    # Thanos query + query-frontend, federating the sidecar on `prometheus`.
    query = { pkgs, ... }: {
      environment.systemPackages = [ pkgs.jq ];
      services.thanos.query = {
        enable = true;
        http-address = "0.0.0.0:${toString queryPort}";
        endpoints = [
          "prometheus:${toString grpcPort}"
        ];
      };
      services.thanos.query-frontend = {
        enable = true;
        http-address = "0.0.0.0:${toString frontPort}";
        query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
      };
    };

    # Thanos store gateway + compactor reading the S3 bucket, plus a local
    # query service so the test can query metrics served from S3 alone.
    store = { pkgs, ... }: {
      virtualisation.diskSize = 2 * 1024;
      virtualisation.memorySize = 2048;
      environment.systemPackages = with pkgs; [ jq thanos ];
      services.thanos.store = {
        enable = true;
        http-address = "0.0.0.0:10902";
        grpc-address = "0.0.0.0:${toString grpcPort}";
        inherit objstore;
        sync-block-duration = "1s"; # pick up freshly-uploaded blocks quickly
      };
      services.thanos.compact = {
        enable = true;
        http-address = "0.0.0.0:10903";
        inherit objstore;
        consistency-delay = "5s";
      };
      services.thanos.query = {
        enable = true;
        http-address = "0.0.0.0:${toString queryPort}";
        endpoints = [
          "localhost:${toString grpcPort}"
        ];
      };
    };

    # MinIO provides the S3-compatible object store; node_exporter is scraped
    # by the specialisation config tested at the end of the script.
    s3 = { pkgs, ... } : {
      # Minio requires at least 1GiB of free disk space to run.
      virtualisation = {
        diskSize = 2 * 1024;
      };
      networking.firewall.allowedTCPPorts = [ minioPort ];

      services.minio = {
        enable = true;
        inherit (s3) accessKey secretKey;
      };

      environment.systemPackages = [ pkgs.minio-client ];

      services.prometheus.exporters.node = {
        enable = true;
        openFirewall = true;
      };
    };
  };

  testScript = { nodes, ... } : ''
    import json

    # Before starting the other machines we first make sure that our S3 service is online
    # and has a bucket added for thanos:
    s3.start()
    s3.wait_for_unit("minio.service")

    s3.wait_for_open_port(${toString minioPort})

    s3.succeed(
        "mc config host add minio "
        + "http://localhost:${toString minioPort} "
        + "${s3.accessKey} ${s3.secretKey} --api s3v4",
        "mc mb minio/thanos-bucket",
    )

    # Now that s3 has started we can start the other machines:
    for machine in prometheus, query, store:
        machine.start()

    # Check if prometheus responds to requests:
    prometheus.wait_for_unit("prometheus.service")

    prometheus.wait_for_open_port(${toString queryPort})
    prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")

    # Let's test if pushing a metric to the pushgateway succeeds:
    prometheus.wait_for_unit("pushgateway.service")
    prometheus.succeed(
        "echo 'some_metric 3.14' | "
        + "curl -f --data-binary \@- "
        + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
    )

    # Now check whether that metric gets ingested by prometheus.
    # Since we'll check for the metric several times on different machines
    # we abstract the test using the following function:

    # Function to check if the metric "some_metric" has been received and returns the correct value.
    def wait_for_metric(machine):
        return machine.wait_until_succeeds(
            "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
            + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
        )


    wait_for_metric(prometheus)

    # Let's test if the pushgateway persists metrics to the configured location.
    prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")

    # Test thanos
    prometheus.wait_for_unit("thanos-sidecar.service")

    # Test if the Thanos query service can correctly retrieve the metric that was send above.
    query.wait_for_unit("thanos-query.service")
    wait_for_metric(query)

    # Test Thanos query frontend service
    query.wait_for_unit("thanos-query-frontend.service")
    query.succeed("curl -sS http://localhost:${toString frontPort}/-/healthy")

    # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
    # Thanos storage service has correctly downloaded it from S3 and if the Thanos
    # query service running on $store can correctly retrieve the metric:
    store.wait_for_unit("thanos-store.service")
    wait_for_metric(store)

    store.wait_for_unit("thanos-compact.service")

    # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
    # and check if the blocks have the correct labels:
    store.succeed(
        "thanos tools bucket ls "
        + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
        + "--output=json | "
        + "jq .thanos.labels.some_label | "
        + "grep 'required by thanos'"
    )

    # Check if switching to a NixOS configuration that changes the prometheus
    # configuration reloads (instead of restarts) prometheus before the switch
    # finishes successfully:
    with subtest("config change reloads prometheus"):
        # We check if prometheus has finished reloading by looking for the message
        # "Completed loading of configuration file" in the journal between the start
        # and finish of switching to the new NixOS configuration.
        #
        # To mark the start we record the journal cursor before starting the switch:
        cursor_before_switching = json.loads(
            prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR")
        )["__CURSOR"]

        # Now we switch:
        prometheus_config_change = prometheus.succeed(
            "readlink /run/current-system/specialisation/prometheus-config-change"
        ).strip()
        prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test")

        # Next we retrieve all logs since the start of switching:
        logs_after_starting_switching = prometheus.succeed(
            """
              journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE
            """.format(
                cursor_before_switching=cursor_before_switching
            )
        )

        # Finally we check if the message "Completed loading of configuration file"
        # occurs before the "finished switching to system configuration" message:
        finished_switching_msg = (
            "finished switching to system configuration " + prometheus_config_change
        )
        reloaded_before_switching_finished = False
        finished_switching = False
        for log_line in logs_after_starting_switching.split("\n"):
            msg = json.loads(log_line)["MESSAGE"]
            if "Completed loading of configuration file" in msg:
                reloaded_before_switching_finished = True
            if msg == finished_switching_msg:
                finished_switching = True
                break

        assert reloaded_before_switching_finished
        assert finished_switching

        # Check if the reloaded config includes the new s3-node_exporter job:
        prometheus.succeed(
            """
            curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \
              | jq -r .data.yaml \
              | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \
              | grep true
            """
        )
  '';
}