{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonAtLeast,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  cbor2,
  pybase64,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,
  setproctitle,
  openai-harmony,

  # internal dependency - for overriding in overlays
  vllm-flash-attn ? null,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

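  # Return the given package only when it is non-null and marked as available on the
  # host platform, otherwise null. Used below to drop optional, platform-specific
  # dependencies such as nccl instead of failing evaluation.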
  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for CUTLASS_REVISION
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v4.0.0";
    hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
  };

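  # Pre-fetched FlashMLA sources. vLLM's CMake would otherwise clone them at build time
  # (see the GIT_TAG comment below); the unpacked tree is handed to it via FLASH_MLA_SRC_DIR
  # in cmakeFlags.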
  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "0e43e774597682284358ff2c54530757b654b8d1";
      hash = "sha256-wxL/jtq/lsLg1o+4392KNgfw5TYlW6lqEVbmR3Jl4/Q=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

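  # Pre-fetched vllm-flash-attn sources, overridable through the vllm-flash-attn argument
  # (re-exported in passthru). Passed to vLLM's CMake via VLLM_FLASH_ATTN_SRC_DIR in
  # cmakeFlags below.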
  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.4.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f";
      hash = "sha256-c7L7WZVVEnXMOTPBoSp7jhkl9d4TA4sj11QvOSWTDIE=";
    };

    patches = [
      # fix Hopper build failure
      # https://github.com/Dao-AILab/flash-attention/pull/1719
      # https://github.com/Dao-AILab/flash-attention/pull/1723
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/dad67c88d4b6122c69d0bed1cebded0cded71cea.patch";
        hash = "sha256-JSgXWItOp5KRpFbTQj/cZk+Tqez+4mEz5kmH5EUeQN4=";
      })
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/e26dd28e487117ee3e6bc4908682f41f31e6f83a.patch";
        hash = "sha256-NkCEowXSi+tiWu74Qt+VPKKavx0H9JeteovSJKToK9A=";
      })
    ];

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.7.1/torch/utils/cpp_extension.py#L2343-L2345
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
        "10.0a"
        "10.1"
        "10.1a"
        "12.0"
        "12.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Throw an evaluation error if no supported GPU targets remain, listing the requested
  # (unsupported) targets in the message.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

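  # CUDA libraries the build links against. They are also merged into a single tree via
  # symlinkJoin below and passed to CMake as CUDA_TOOLKIT_ROOT_DIR.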
  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    # cusparselt # cusparseLt.h
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

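  # Collect the bin, lib and dev outputs of a package so symlinkJoin can merge them into
  # one toolkit-style directory tree.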
  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.10.1.1";
  pyproject = true;

  # https://github.com/vllm-project/vllm/issues/12083
  disabled = pythonAtLeast "3.13";

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-lLNjBv5baER0AArX3IV4HWjDZ2jTGXyGIvnHupR8MGM=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0005-drop-intel-reqs.patch
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the python version check because it hard-codes minor versions and
    # lags behind `ray`'s python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    cbor2
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pybase64
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    setproctitle
    openai-harmony
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

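  # vLLM's setup.py drives CMake itself, so skip the generic CMake configure phase; the
  # flags below reach CMake through the setup.py patched above (0002-...-respect-cmakeFlags).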
  dontUseCmakeConfigure = true;
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ]
  ++ lib.optionals cpuSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
  ];

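  # VLLM_TARGET_DEVICE selects which backend vLLM builds: cuda, rocm, or cpu.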
  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # updates the cutlass fetcher instead
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}