# NixOS VM test for llama-swap (extended model tests run only when the unfree SmolLM2 model is allowed)
{ pkgs, lib, ... }:

let
  # Wrap a fetched file in a derivation that carries pname/version/meta
  # (notably meta.license) so the unfree-predicate machinery applies to it.
  wrapSrc = attrs: pkgs.runCommand "${attrs.pname}-${attrs.version}" attrs "ln -s $src $out";

  smollm2-135m = wrapSrc rec {
    pname = "smollm2-135m";
    version = "9e6855bc4be717fca1ef21360a1db4b29d5c559a";
    src = pkgs.fetchurl {
      url = "https://huggingface.co/unsloth/SmolLM2-135M-Instruct-GGUF/resolve/${version}/SmolLM2-135M-Instruct-Q4_K_M.gguf";
      hash = "sha256-7V+jDEh7KC7BVsKQYvEiLlwgh1qUSsmCidvSQulH90c=";
    };

    meta.license = with lib.licenses; [
      asl20 # actual license of the model
      unfree # to force an opt-in - do not remove
    ];
  };

  # grab allowUnfreePredicate if it exists or default deny
  allowUnfreePredicate =
    if builtins.hasAttr "allowUnfreePredicate" pkgs.config then
      pkgs.config.allowUnfreePredicate
    else
      (_: false);

  # check if we can use smollm2-135m taking either globally allowUnfree or
  # explicit allow with predicate
  useSmollm2-135m = pkgs.config.allowUnfree || allowUnfreePredicate smollm2-135m;
in
{
  name = "llama-swap";
  meta.maintainers = with lib.maintainers; [
    jk
    podium868909
  ];

  nodes = {
    machine =
      { pkgs, ... }:
      {
        # running models can be memory intensive but
        # default `virtualisation.memorySize` is fine

        services.llama-swap = {
          enable = true;
          settings =
            # config for basic tests
            if !useSmollm2-135m then
              { }
            # config for extended tests using SmolLM2
            else
              let
                llama-cpp = pkgs.llama-cpp;
                llama-server = lib.getExe' llama-cpp "llama-server";
              in
              {
                hooks.on_startup.preload = [
                  "smollm2"
                ];
                # temperature and top-k important for SmolLM2 performance/accuracy
                models = {
                  "smollm2" = {
                    ttl = 10;
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9";
                  };
                  "smollm2-group-1" = {
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                  "smollm2-group-2" = {
                    proxy = "http://127.0.0.1:5802";
                    cmd = "${llama-server} --port 5802 -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                };
                groups = {
                  "standalone" = {
                    swap = true;
                    exclusive = true;
                    members = [
                      "smollm2"
                    ];
                  };
                  "group" = {
                    swap = false;
                    exclusive = true;
                    members = [
                      "smollm2-group-1"
                      "smollm2-group-2"
                    ];
                  };
                };
              };
        };
      };
  };

  testScript =
    { nodes, ... }:
    ''
      # core tests
      import json

      def get_json(route):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"'
          ]
          return json.loads(machine.succeed("curl {args} http://localhost:8080{route}".format(args=" ".join(args), route=route)))

      def post_json(route, data):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"',
              '-H "Authorization: Bearer no-key"',
              "-d '{d}'".format(d=json.dumps(data))
          ]
          return json.loads(machine.succeed('curl {args} http://localhost:8080{route}'.format(args=" ".join(args), route=route)))

      machine.wait_for_unit('llama-swap')
      machine.wait_for_open_port(8080)

      with subtest('check is serving ui'):
          # FIX: was 'http:/localhost' (missing slash in the scheme separator)
          machine.succeed('curl --fail http://localhost:8080/ui/')

      with subtest('check is healthy'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/health | grep "OK"')

    ''
    + lib.optionalString useSmollm2-135m ''
      # extended tests using SmolLM2
      with subtest('check `/running` for preloaded smollm2'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'

      with subtest('runs smollm2'):
          response = None
          with subtest('send request to smollm2'):
              data = {
                  'model': 'smollm2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response = post_json('/v1/chat/completions', data)

          with subtest('response is from smollm2'):
              assert response['model'] == 'smollm2'

          with subtest('response contains at least one item in "choices"'):
              assert len(response['choices']) >= 1

          assistant_choices = None
          with subtest('response contains at least one "assistant" message'):
              assistant_choices = [c for c in response['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices) >= 1

          with subtest('first message (lowercase) starts with "hello"'):
              assert assistant_choices[0]['message']['content'].lower()[:5] == 'hello'

      with subtest('check `/running` for just smollm2'):
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'

      with subtest('check `/running` for smollm2 to timeout'):
          machine.succeed('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep -v "smollm2"', timeout=11)
          running_response = get_json('/running')
          assert len(running_response['running']) == 0

      with subtest('runs smollm2-group-1 and smollm2-group-2'):
          response_1 = None
          with subtest('send request to smollm2-group-1'):
              data = {
                  'model': 'smollm2-group-1',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_1 = post_json('/v1/chat/completions', data)

          with subtest('response 1 is from smollm2-group-1'):
              assert response_1['model'] == 'smollm2-group-1'

          with subtest('response 1 contains at least one item in "choices"'):
              # FIX: was asserting on `response` (the earlier smollm2 reply) instead of `response_1`
              assert len(response_1['choices']) >= 1

          assistant_choices_1 = None
          with subtest('response 1 contains at least one "assistant" message'):
              assistant_choices_1 = [c for c in response_1['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_1) >= 1

          with subtest('first message (lowercase) in response 1 starts with "hello"'):
              assert assistant_choices_1[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for just smollm2-group-1'):
              running_response = get_json('/running')
              assert len(running_response['running']) == 1
              running_model = running_response['running'][0]
              assert running_model['model'] == 'smollm2-group-1'
              assert running_model['state'] == 'ready'

          response_2 = None
          with subtest('send request to smollm2-group-2'):
              data = {
                  'model': 'smollm2-group-2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_2 = post_json('/v1/chat/completions', data)

          with subtest('response 2 is from smollm2-group-2'):
              assert response_2['model'] == 'smollm2-group-2'

          with subtest('response 2 contains at least one item in "choices"'):
              # FIX: was asserting on `response` instead of `response_2`
              assert len(response_2['choices']) >= 1

          assistant_choices_2 = None
          with subtest('response 2 contains at least one "assistant" message'):
              assistant_choices_2 = [c for c in response_2['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_2) >= 1

          # FIX: subtest label said "response 1" but the body checks response 2
          with subtest('first message (lowercase) in response 2 starts with "hello"'):
              assert assistant_choices_2[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for both smollm2-group-1 and smollm2-group-2'):
              running_response = get_json('/running')['running']
              assert len(running_response) == 2
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-1'
              ]) == 1
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-2'
              ]) == 1
    '';
}