# NixOS VM test for llama-swap (extended model tests run only when the unfree SmolLM2 model is allowed)
{ pkgs, lib, ... }:

let
  # Wrap a fetched file in a derivation that carries pname/version/meta
  # (notably meta.license) so the unfree-predicate machinery applies to it.
  wrapSrc = attrs: pkgs.runCommand "${attrs.pname}-${attrs.version}" attrs "ln -s $src $out";

  smollm2-135m = wrapSrc rec {
    pname = "smollm2-135m";
    version = "9e6855bc4be717fca1ef21360a1db4b29d5c559a";
    src = pkgs.fetchurl {
      url = "https://huggingface.co/unsloth/SmolLM2-135M-Instruct-GGUF/resolve/${version}/SmolLM2-135M-Instruct-Q4_K_M.gguf";
      hash = "sha256-7V+jDEh7KC7BVsKQYvEiLlwgh1qUSsmCidvSQulH90c=";
    };

    meta.license = with lib.licenses; [
      asl20 # actual license of the model
      unfree # to force an opt-in - do not remove
    ];
  };

  # grab allowUnfreePredicate if it exists or default deny
  allowUnfreePredicate =
    if builtins.hasAttr "allowUnfreePredicate" pkgs.config then
      pkgs.config.allowUnfreePredicate
    else
      (_: false);

  # check if we can use smollm2-135m taking either globally allowUnfree or
  # explicit allow with predicate
  useSmollm2-135m = pkgs.config.allowUnfree || allowUnfreePredicate smollm2-135m;
in
{
  name = "llama-swap";
  meta.maintainers = with lib.maintainers; [
    jk
    podium868909
  ];

  nodes = {
    machine =
      { pkgs, ... }:
      {
        # running models can be memory intensive but
        # default `virtualisation.memorySize` is fine

        services.llama-swap = {
          enable = true;
          settings =
            # config for basic tests
            if !useSmollm2-135m then
              { }
            # config for extended tests using SmolLM2
            else
              let
                llama-cpp = pkgs.llama-cpp;
                llama-server = lib.getExe' llama-cpp "llama-server";
              in
              {
                hooks.on_startup.preload = [
                  "smollm2"
                ];
                # temperature and top-k important for SmolLM2 performance/accuracy
                models = {
                  "smollm2" = {
                    ttl = 10;
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9";
                  };
                  "smollm2-group-1" = {
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                  "smollm2-group-2" = {
                    proxy = "http://127.0.0.1:5802";
                    cmd = "${llama-server} --port 5802 -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                };
                groups = {
                  "standalone" = {
                    swap = true;
                    exclusive = true;
                    members = [
                      "smollm2"
                    ];
                  };
                  "group" = {
                    swap = false;
                    exclusive = true;
                    members = [
                      "smollm2-group-1"
                      "smollm2-group-2"
                    ];
                  };
                };
              };
        };
      };
  };

  testScript =
    { nodes, ... }:
    ''
      # core tests
      import json

      def get_json(route):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"'
          ]
          return json.loads(machine.succeed("curl {args} http://localhost:8080{route}".format(args=" ".join(args), route=route)))

      def post_json(route, data):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"',
              '-H "Authorization: Bearer no-key"',
              "-d '{d}'".format(d=json.dumps(data))
          ]
          return json.loads(machine.succeed('curl {args} http://localhost:8080{route}'.format(args=" ".join(args), route=route)))

      machine.wait_for_unit('llama-swap')
      machine.wait_for_open_port(8080)

      with subtest('check is serving ui'):
          # FIX: was 'http:/localhost' (missing slash in the scheme separator)
          machine.succeed('curl --fail http://localhost:8080/ui/')

      with subtest('check is healthy'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/health | grep "OK"')

    ''
    + lib.optionalString useSmollm2-135m ''
      # extended tests using SmolLM2
      with subtest('check `/running` for preloaded smollm2'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'

      with subtest('runs smollm2'):
          response = None
          with subtest('send request to smollm2'):
              data = {
                  'model': 'smollm2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response = post_json('/v1/chat/completions', data)

          with subtest('response is from smollm2'):
              assert response['model'] == 'smollm2'

          with subtest('response contains at least one item in "choices"'):
              assert len(response['choices']) >= 1

          assistant_choices = None
          with subtest('response contains at least one "assistant" message'):
              assistant_choices = [c for c in response['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices) >= 1

          with subtest('first message (lowercase) starts with "hello"'):
              assert assistant_choices[0]['message']['content'].lower()[:5] == 'hello'

      with subtest('check `/running` for just smollm2'):
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'

      with subtest('check `/running` for smollm2 to timeout'):
          machine.succeed('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep -v "smollm2"', timeout=11)
          running_response = get_json('/running')
          assert len(running_response['running']) == 0

      with subtest('runs smollm2-group-1 and smollm2-group-2'):
          response_1 = None
          with subtest('send request to smollm2-group-1'):
              data = {
                  'model': 'smollm2-group-1',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_1 = post_json('/v1/chat/completions', data)

          with subtest('response 1 is from smollm2-group-1'):
              assert response_1['model'] == 'smollm2-group-1'

          with subtest('response 1 contains at least one item in "choices"'):
              # FIX: was asserting on `response` (the earlier smollm2 reply) instead of `response_1`
              assert len(response_1['choices']) >= 1

          assistant_choices_1 = None
          with subtest('response 1 contains at least one "assistant" message'):
              assistant_choices_1 = [c for c in response_1['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_1) >= 1

          with subtest('first message (lowercase) in response 1 starts with "hello"'):
              assert assistant_choices_1[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for just smollm2-group-1'):
              running_response = get_json('/running')
              assert len(running_response['running']) == 1
              running_model = running_response['running'][0]
              assert running_model['model'] == 'smollm2-group-1'
              assert running_model['state'] == 'ready'

          response_2 = None
          with subtest('send request to smollm2-group-2'):
              data = {
                  'model': 'smollm2-group-2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_2 = post_json('/v1/chat/completions', data)

          with subtest('response 2 is from smollm2-group-2'):
              assert response_2['model'] == 'smollm2-group-2'

          with subtest('response 2 contains at least one item in "choices"'):
              # FIX: was asserting on `response` instead of `response_2`
              assert len(response_2['choices']) >= 1

          assistant_choices_2 = None
          with subtest('response 2 contains at least one "assistant" message'):
              assistant_choices_2 = [c for c in response_2['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_2) >= 1

          # FIX: subtest label said "response 1" but the body checks response 2
          with subtest('first message (lowercase) in response 2 starts with "hello"'):
              assert assistant_choices_2[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for both smollm2-group-1 and smollm2-group-2'):
              running_response = get_json('/running')['running']
              assert len(running_response) == 2
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-1'
              ]) == 1
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-2'
              ]) == 1
    '';
}