# NixOS VM test: Prometheus (with config-reload specialisation and pushgateway)
# plus the Thanos sidecar/query/store/compact services, backed by a MinIO S3 bucket.
1let 2 grpcPort = 19090; 3 queryPort = 9090; 4 minioPort = 9000; 5 pushgwPort = 9091; 6 7 s3 = { 8 accessKey = "BKIKJAA5BMMU2RHO6IBB"; 9 secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12"; 10 }; 11 12 objstore.config = { 13 type = "S3"; 14 config = { 15 bucket = "thanos-bucket"; 16 endpoint = "s3:${toString minioPort}"; 17 region = "us-east-1"; 18 access_key = s3.accessKey; 19 secret_key = s3.secretKey; 20 insecure = true; 21 signature_version2 = false; 22 put_user_metadata = {}; 23 http_config = { 24 idle_conn_timeout = "0s"; 25 insecure_skip_verify = false; 26 }; 27 trace = { 28 enable = false; 29 }; 30 }; 31 }; 32 33in import ./make-test-python.nix { 34 name = "prometheus"; 35 36 nodes = { 37 prometheus = { pkgs, ... }: { 38 virtualisation.diskSize = 2 * 1024; 39 virtualisation.memorySize = 2048; 40 environment.systemPackages = [ pkgs.jq ]; 41 networking.firewall.allowedTCPPorts = [ grpcPort ]; 42 services.prometheus = { 43 enable = true; 44 enableReload = true; 45 scrapeConfigs = [ 46 { 47 job_name = "prometheus"; 48 static_configs = [ 49 { 50 targets = [ "127.0.0.1:${toString queryPort}" ]; 51 labels = { instance = "localhost"; }; 52 } 53 ]; 54 } 55 { 56 job_name = "pushgateway"; 57 scrape_interval = "1s"; 58 static_configs = [ 59 { 60 targets = [ "127.0.0.1:${toString pushgwPort}" ]; 61 } 62 ]; 63 } 64 ]; 65 rules = [ 66 '' 67 groups: 68 - name: test 69 rules: 70 - record: testrule 71 expr: count(up{job="prometheus"}) 72 '' 73 ]; 74 globalConfig = { 75 external_labels = { 76 some_label = "required by thanos"; 77 }; 78 }; 79 extraFlags = [ 80 # Required by thanos 81 "--storage.tsdb.min-block-duration=5s" 82 "--storage.tsdb.max-block-duration=5s" 83 ]; 84 }; 85 services.prometheus.pushgateway = { 86 enable = true; 87 web.listen-address = ":${toString pushgwPort}"; 88 persistMetrics = true; 89 persistence.interval = "1s"; 90 stateDir = "prometheus-pushgateway"; 91 }; 92 services.thanos = { 93 sidecar = { 94 enable = true; 95 grpc-address = 
"0.0.0.0:${toString grpcPort}"; 96 inherit objstore; 97 }; 98 99 # TODO: Add some tests for these services: 100 #rule = { 101 # enable = true; 102 # http-address = "0.0.0.0:19194"; 103 # grpc-address = "0.0.0.0:19193"; 104 # query.addresses = [ 105 # "localhost:19191" 106 # ]; 107 # labels = { 108 # just = "some"; 109 # nice = "labels"; 110 # }; 111 #}; 112 # 113 #receive = { 114 # http-address = "0.0.0.0:19195"; 115 # enable = true; 116 # labels = { 117 # just = "some"; 118 # nice = "labels"; 119 # }; 120 #}; 121 }; 122 # Adds a "specialisation" of the above config which allows us to 123 # "switch" to it and see if the services.prometheus.enableReload 124 # functionality actually reloads the prometheus service instead of 125 # restarting it. 126 specialisation = { 127 "prometheus-config-change" = { 128 configuration = { 129 environment.systemPackages = [ pkgs.yq ]; 130 131 # This configuration just adds a new prometheus job 132 # to scrape the node_exporter metrics of the s3 machine. 133 services.prometheus = { 134 scrapeConfigs = [ 135 { 136 job_name = "s3-node_exporter"; 137 static_configs = [ 138 { 139 targets = [ "s3:9100" ]; 140 } 141 ]; 142 } 143 ]; 144 }; 145 }; 146 }; 147 }; 148 }; 149 150 query = { pkgs, ... }: { 151 environment.systemPackages = [ pkgs.jq ]; 152 services.thanos.query = { 153 enable = true; 154 http-address = "0.0.0.0:${toString queryPort}"; 155 store.addresses = [ 156 "prometheus:${toString grpcPort}" 157 ]; 158 }; 159 }; 160 161 store = { pkgs, ... 
}: { 162 virtualisation.diskSize = 2 * 1024; 163 virtualisation.memorySize = 2048; 164 environment.systemPackages = with pkgs; [ jq thanos ]; 165 services.thanos.store = { 166 enable = true; 167 http-address = "0.0.0.0:10902"; 168 grpc-address = "0.0.0.0:${toString grpcPort}"; 169 inherit objstore; 170 sync-block-duration = "1s"; 171 }; 172 services.thanos.compact = { 173 enable = true; 174 http-address = "0.0.0.0:10903"; 175 inherit objstore; 176 consistency-delay = "5s"; 177 }; 178 services.thanos.query = { 179 enable = true; 180 http-address = "0.0.0.0:${toString queryPort}"; 181 store.addresses = [ 182 "localhost:${toString grpcPort}" 183 ]; 184 }; 185 }; 186 187 s3 = { pkgs, ... } : { 188 # Minio requires at least 1GiB of free disk space to run. 189 virtualisation = { 190 diskSize = 2 * 1024; 191 }; 192 networking.firewall.allowedTCPPorts = [ minioPort ]; 193 194 services.minio = { 195 enable = true; 196 inherit (s3) accessKey secretKey; 197 }; 198 199 environment.systemPackages = [ pkgs.minio-client ]; 200 201 services.prometheus.exporters.node = { 202 enable = true; 203 openFirewall = true; 204 }; 205 }; 206 }; 207 208 testScript = { nodes, ... 
} : '' 209 import json 210 211 # Before starting the other machines we first make sure that our S3 service is online 212 # and has a bucket added for thanos: 213 s3.start() 214 s3.wait_for_unit("minio.service") 215 s3.wait_for_open_port(${toString minioPort}) 216 s3.succeed( 217 "mc config host add minio " 218 + "http://localhost:${toString minioPort} " 219 + "${s3.accessKey} ${s3.secretKey} --api s3v4", 220 "mc mb minio/thanos-bucket", 221 ) 222 223 # Now that s3 has started we can start the other machines: 224 for machine in prometheus, query, store: 225 machine.start() 226 227 # Check if prometheus responds to requests: 228 prometheus.wait_for_unit("prometheus.service") 229 230 prometheus.wait_for_open_port(${toString queryPort}) 231 prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics") 232 233 # Let's test if pushing a metric to the pushgateway succeeds: 234 prometheus.wait_for_unit("pushgateway.service") 235 prometheus.succeed( 236 "echo 'some_metric 3.14' | " 237 + "curl -f --data-binary \@- " 238 + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job" 239 ) 240 241 # Now check whether that metric gets ingested by prometheus. 242 # Since we'll check for the metric several times on different machines 243 # we abstract the test using the following function: 244 245 # Function to check if the metric "some_metric" has been received and returns the correct value. 246 def wait_for_metric(machine): 247 return machine.wait_until_succeeds( 248 "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | " 249 + "jq '.data.result[0].value[1]' | grep '\"3.14\"'" 250 ) 251 252 253 wait_for_metric(prometheus) 254 255 # Let's test if the pushgateway persists metrics to the configured location. 
256 prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics") 257 258 # Test thanos 259 prometheus.wait_for_unit("thanos-sidecar.service") 260 261 # Test if the Thanos query service can correctly retrieve the metric that was send above. 262 query.wait_for_unit("thanos-query.service") 263 wait_for_metric(query) 264 265 # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the 266 # Thanos storage service has correctly downloaded it from S3 and if the Thanos 267 # query service running on $store can correctly retrieve the metric: 268 store.wait_for_unit("thanos-store.service") 269 wait_for_metric(store) 270 271 store.wait_for_unit("thanos-compact.service") 272 273 # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket 274 # and check if the blocks have the correct labels: 275 store.succeed( 276 "thanos tools bucket ls " 277 + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} " 278 + "--output=json | " 279 + "jq .thanos.labels.some_label | " 280 + "grep 'required by thanos'" 281 ) 282 283 # Check if switching to a NixOS configuration that changes the prometheus 284 # configuration reloads (instead of restarts) prometheus before the switch 285 # finishes successfully: 286 with subtest("config change reloads prometheus"): 287 # We check if prometheus has finished reloading by looking for the message 288 # "Completed loading of configuration file" in the journal between the start 289 # and finish of switching to the new NixOS configuration. 
290 # 291 # To mark the start we record the journal cursor before starting the switch: 292 cursor_before_switching = json.loads( 293 prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR") 294 )["__CURSOR"] 295 296 # Now we switch: 297 prometheus_config_change = prometheus.succeed( 298 "readlink /run/current-system/specialisation/prometheus-config-change" 299 ).strip() 300 prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test") 301 302 # Next we retrieve all logs since the start of switching: 303 logs_after_starting_switching = prometheus.succeed( 304 """ 305 journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE 306 """.format( 307 cursor_before_switching=cursor_before_switching 308 ) 309 ) 310 311 # Finally we check if the message "Completed loading of configuration file" 312 # occurs before the "finished switching to system configuration" message: 313 finished_switching_msg = ( 314 "finished switching to system configuration " + prometheus_config_change 315 ) 316 reloaded_before_switching_finished = False 317 finished_switching = False 318 for log_line in logs_after_starting_switching.split("\n"): 319 msg = json.loads(log_line)["MESSAGE"] 320 if "Completed loading of configuration file" in msg: 321 reloaded_before_switching_finished = True 322 if msg == finished_switching_msg: 323 finished_switching = True 324 break 325 326 assert reloaded_before_switching_finished 327 assert finished_switching 328 329 # Check if the reloaded config includes the new s3-node_exporter job: 330 prometheus.succeed( 331 """ 332 curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \ 333 | jq -r .data.yaml \ 334 | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \ 335 | grep true 336 """ 337 ) 338 ''; 339}