# NixOS VM test: Prometheus with a pushgateway and the Thanos
# sidecar/query/query-frontend/store/compact components, backed by a
# MinIO server that provides the S3 bucket Thanos uploads TSDB blocks to.
let
  # Ports shared between the node configurations and the test script.
  grpcPort = 19090; # Thanos gRPC (sidecar on "prometheus", store on "store")
  queryPort = 9090; # Prometheus web / Thanos query HTTP
  minioPort = 9000; # MinIO S3 endpoint
  pushgwPort = 9091; # Prometheus pushgateway
  frontPort = 9092; # Thanos query-frontend HTTP

  # Static credentials for the MinIO server; reused by every Thanos
  # component that talks to the bucket.
  s3 = {
    accessKey = "BKIKJAA5BMMU2RHO6IBB";
    secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
  };

  # Shared Thanos object-storage configuration pointing at the "s3" node.
  objstore.config = {
    type = "S3";
    config = {
      bucket = "thanos-bucket";
      endpoint = "s3:${toString minioPort}";
      region = "us-east-1";
      access_key = s3.accessKey;
      secret_key = s3.secretKey;
      insecure = true; # plain HTTP inside the test network
      signature_version2 = false;
      put_user_metadata = { };
      http_config = {
        idle_conn_timeout = "0s";
        insecure_skip_verify = false;
      };
      trace = {
        enable = false;
      };
    };
  };

in
import ./make-test-python.nix {
  name = "prometheus";

  nodes = {
    # Prometheus itself, plus a pushgateway and the Thanos sidecar that
    # uploads Prometheus' TSDB blocks to the S3 bucket.
    prometheus =
      { pkgs, ... }:
      {
        virtualisation.diskSize = 2 * 1024;
        virtualisation.memorySize = 2048;
        environment.systemPackages = [ pkgs.jq ];
        networking.firewall.allowedTCPPorts = [ grpcPort ];
        services.prometheus = {
          enable = true;
          enableReload = true;
          scrapeConfigs = [
            {
              job_name = "prometheus";
              static_configs = [
                {
                  targets = [ "127.0.0.1:${toString queryPort}" ];
                  labels = {
                    instance = "localhost";
                  };
                }
              ];
            }
            {
              job_name = "pushgateway";
              scrape_interval = "1s";
              static_configs = [
                {
                  targets = [ "127.0.0.1:${toString pushgwPort}" ];
                }
              ];
            }
          ];
          rules = [
            ''
              groups:
                - name: test
                  rules:
                    - record: testrule
                      expr: count(up{job="prometheus"})
            ''
          ];
          globalConfig = {
            external_labels = {
              some_label = "required by thanos";
            };
          };
          extraFlags = [
            # Required by thanos
            "--storage.tsdb.min-block-duration=5s"
            "--storage.tsdb.max-block-duration=5s"
          ];
        };
        services.prometheus.pushgateway = {
          enable = true;
          web.listen-address = ":${toString pushgwPort}";
          persistMetrics = true;
          persistence.interval = "1s";
          stateDir = "prometheus-pushgateway";
        };
        services.thanos = {
          sidecar = {
            enable = true;
            grpc-address = "0.0.0.0:${toString grpcPort}";
            inherit objstore;
          };

          # TODO: Add some tests for these services:
          #rule = {
          #  enable = true;
          #  http-address = "0.0.0.0:19194";
          #  grpc-address = "0.0.0.0:19193";
          #  query.addresses = [
          #    "localhost:19191"
          #  ];
          #  labels = {
          #    just = "some";
          #    nice = "labels";
          #  };
          #};
          #
          #receive = {
          #  http-address = "0.0.0.0:19195";
          #  enable = true;
          #  labels = {
          #    just = "some";
          #    nice = "labels";
          #  };
          #};
        };
        # Adds a "specialisation" of the above config which allows us to
        # "switch" to it and see if the services.prometheus.enableReload
        # functionality actually reloads the prometheus service instead of
        # restarting it.
        specialisation = {
          "prometheus-config-change" = {
            configuration = {
              environment.systemPackages = [ pkgs.yq ];

              # This configuration just adds a new prometheus job
              # to scrape the node_exporter metrics of the s3 machine.
              services.prometheus = {
                scrapeConfigs = [
                  {
                    job_name = "s3-node_exporter";
                    static_configs = [
                      {
                        targets = [ "s3:9100" ];
                      }
                    ];
                  }
                ];
              };
            };
          };
        };
      };

    # Thanos query (talking to the sidecar over gRPC) and query-frontend.
    query =
      { pkgs, ... }:
      {
        environment.systemPackages = [ pkgs.jq ];
        services.thanos.query = {
          enable = true;
          http-address = "0.0.0.0:${toString queryPort}";
          endpoints = [
            "prometheus:${toString grpcPort}"
          ];
        };
        services.thanos.query-frontend = {
          enable = true;
          http-address = "0.0.0.0:${toString frontPort}";
          query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
        };
      };

    # Thanos store + compact reading blocks back from S3, plus a local
    # query service used by the test to read the stored data.
    store =
      { pkgs, ... }:
      {
        virtualisation.diskSize = 2 * 1024;
        virtualisation.memorySize = 2048;
        environment.systemPackages = with pkgs; [
          jq
          thanos
        ];
        services.thanos.store = {
          enable = true;
          http-address = "0.0.0.0:10902";
          grpc-address = "0.0.0.0:${toString grpcPort}";
          inherit objstore;
          sync-block-duration = "1s";
        };
        services.thanos.compact = {
          enable = true;
          http-address = "0.0.0.0:10903";
          inherit objstore;
          consistency-delay = "5s";
        };
        services.thanos.query = {
          enable = true;
          http-address = "0.0.0.0:${toString queryPort}";
          endpoints = [
            "localhost:${toString grpcPort}"
          ];
        };
      };

    # MinIO providing the S3 bucket, plus a node_exporter that the
    # "prometheus-config-change" specialisation above scrapes.
    s3 =
      { pkgs, ... }:
      {
        # Minio requires at least 1GiB of free disk space to run.
        virtualisation = {
          diskSize = 2 * 1024;
        };
        networking.firewall.allowedTCPPorts = [ minioPort ];

        services.minio = {
          enable = true;
          inherit (s3) accessKey secretKey;
        };

        environment.systemPackages = [ pkgs.minio-client ];

        services.prometheus.exporters.node = {
          enable = true;
          openFirewall = true;
        };
      };
  };

  testScript =
    { nodes, ... }:
    ''
      # Before starting the other machines we first make sure that our S3 service is online
      # and has a bucket added for thanos:
      s3.start()
      s3.wait_for_unit("minio.service")
      s3.wait_for_open_port(${toString minioPort})
      s3.succeed(
          "mc config host add minio "
          + "http://localhost:${toString minioPort} "
          + "${s3.accessKey} ${s3.secretKey} --api s3v4",
          "mc mb minio/thanos-bucket",
      )

      # Now that s3 has started we can start the other machines:
      for machine in prometheus, query, store:
          machine.start()

      # Check if prometheus responds to requests:
      prometheus.wait_for_unit("prometheus.service")

      prometheus.wait_for_open_port(${toString queryPort})
      prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")

      # Let's test if pushing a metric to the pushgateway succeeds:
      prometheus.wait_for_unit("pushgateway.service")
      prometheus.succeed(
          "echo 'some_metric 3.14' | "
          + "curl -f --data-binary \@- "
          + "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
      )

      # Now check whether that metric gets ingested by prometheus.
      # Since we'll check for the metric several times on different machines
      # we abstract the test using the following function:

      # Function to check if the metric "some_metric" has been received and returns the correct value.
      def wait_for_metric(machine):
          return machine.wait_until_succeeds(
              "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
              + "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
          )


      wait_for_metric(prometheus)

      # Let's test if the pushgateway persists metrics to the configured location.
      prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")

      # Test thanos
      prometheus.wait_for_unit("thanos-sidecar.service")

      # Test if the Thanos query service can correctly retrieve the metric that was send above.
      query.wait_for_unit("thanos-query.service")
      wait_for_metric(query)

      # Test Thanos query frontend service
      query.wait_for_unit("thanos-query-frontend.service")
      query.succeed("curl -sS http://localhost:${toString frontPort}/-/healthy")

      # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
      # Thanos storage service has correctly downloaded it from S3 and if the Thanos
      # query service running on $store can correctly retrieve the metric:
      store.wait_for_unit("thanos-store.service")
      wait_for_metric(store)

      store.wait_for_unit("thanos-compact.service")

      # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
      # and check if the blocks have the correct labels:
      store.succeed(
          "thanos tools bucket ls "
          + "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
          + "--output=json | "
          + "jq .thanos.labels.some_label | "
          + "grep 'required by thanos'"
      )
    '';
}