{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonAtLeast,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  cbor2,
  pybase64,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,
  setproctitle,
  openai-harmony,

  # internal dependency - for overriding in overlays
  vllm-flash-attn ? null,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for CUTLASS_REVISION
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v4.0.0";
    hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "0e43e774597682284358ff2c54530757b654b8d1";
      hash = "sha256-wxL/jtq/lsLg1o+4392KNgfw5TYlW6lqEVbmR3Jl4/Q=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.4.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f";
      hash = "sha256-c7L7WZVVEnXMOTPBoSp7jhkl9d4TA4sj11QvOSWTDIE=";
    };

    patches = [
      # fix Hopper build failure
      # https://github.com/Dao-AILab/flash-attention/pull/1719
      # https://github.com/Dao-AILab/flash-attention/pull/1723
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/dad67c88d4b6122c69d0bed1cebded0cded71cea.patch";
        hash = "sha256-JSgXWItOp5KRpFbTQj/cZk+Tqez+4mEz5kmH5EUeQN4=";
      })
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/e26dd28e487117ee3e6bc4908682f41f31e6f83a.patch";
        hash = "sha256-NkCEowXSi+tiWu74Qt+VPKKavx0H9JeteovSJKToK9A=";
      })
    ];

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.7.1/torch/utils/cpp_extension.py#L2343-L2345
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
        "10.0a"
        "10.1"
        "10.1a"
        "12.0"
        "12.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Throw an error if no supported GPU targets remain, listing the requested
  # (unsupported) targets in the message.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    # cusparselt # cusparseLt.h
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.10.1.1";
  pyproject = true;

  # https://github.com/vllm-project/vllm/issues/12083
  disabled = pythonAtLeast "3.13";

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-lLNjBv5baER0AArX3IV4HWjDZ2jTGXyGIvnHupR8MGM=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0005-drop-intel-reqs.patch
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the python version check because it hard-codes minor versions and
    # lags behind `ray`'s python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    cbor2
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pybase64
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    setproctitle
    openai-harmony
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

  dontUseCmakeConfigure = true;
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ]
  ++ lib.optionals cpuSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
  ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # the bulk updater would bump the cutlass fetcher instead of vllm itself
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}
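# A minimal overlay sketch (kept as a comment; it is not evaluated as part of this
# expression) showing how the `vllm-flash-attn ? null` argument and the
# `passthru.vllm-flash-attn` hook above can be used to substitute a custom
# vllm-flash-attn build. `myFlashAttn` is a hypothetical derivation supplied by the
# overlay author:
#
#   final: prev: {
#     pythonPackagesExtensions = prev.pythonPackagesExtensions ++ [
#       (pyFinal: pyPrev: {
#         vllm = pyPrev.vllm.override { vllm-flash-attn = myFlashAttn; };
#       })
#     ];
#   }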