{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonAtLeast,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  cbor2,
  pybase64,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,
  setproctitle,
  openai-harmony,

  # internal dependency - for overriding in overlays
  vllm-flash-attn ? null,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

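  # Return the given package only when it is non-null and marked as available on the
  # host platform, otherwise null. Used below to drop optional, platform-specific
  # dependencies such as nccl instead of failing evaluation.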
  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for CUTLASS_REVISION
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v4.0.0";
    hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
  };

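  # Pre-fetched FlashMLA sources. vLLM's CMake would otherwise clone them at build time
  # (see the GIT_TAG comment below); the unpacked tree is handed to it via FLASH_MLA_SRC_DIR
  # in cmakeFlags.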
  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "0e43e774597682284358ff2c54530757b654b8d1";
      hash = "sha256-wxL/jtq/lsLg1o+4392KNgfw5TYlW6lqEVbmR3Jl4/Q=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

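  # Pre-fetched vllm-flash-attn sources, overridable through the vllm-flash-attn argument
  # (re-exported in passthru). Passed to vLLM's CMake via VLLM_FLASH_ATTN_SRC_DIR in
  # cmakeFlags below.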
  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.4.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f";
      hash = "sha256-c7L7WZVVEnXMOTPBoSp7jhkl9d4TA4sj11QvOSWTDIE=";
    };

    patches = [
      # fix Hopper build failure
      # https://github.com/Dao-AILab/flash-attention/pull/1719
      # https://github.com/Dao-AILab/flash-attention/pull/1723
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/dad67c88d4b6122c69d0bed1cebded0cded71cea.patch";
        hash = "sha256-JSgXWItOp5KRpFbTQj/cZk+Tqez+4mEz5kmH5EUeQN4=";
      })
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/e26dd28e487117ee3e6bc4908682f41f31e6f83a.patch";
        hash = "sha256-NkCEowXSi+tiWu74Qt+VPKKavx0H9JeteovSJKToK9A=";
      })
    ];

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.7.1/torch/utils/cpp_extension.py#L2343-L2345
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
        "10.0a"
        "10.1"
        "10.1a"
        "12.0"
        "12.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Throw an evaluation error if no supported GPU targets remain, listing the requested
  # (unsupported) targets in the message.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

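  # CUDA libraries the build links against. They are also merged into a single tree via
  # symlinkJoin below and passed to CMake as CUDA_TOOLKIT_ROOT_DIR.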
  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    # cusparselt # cusparseLt.h
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

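  # Collect the bin, lib and dev outputs of a package so symlinkJoin can merge them into
  # one toolkit-style directory tree.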
  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.10.1.1";
  pyproject = true;

  # https://github.com/vllm-project/vllm/issues/12083
  disabled = pythonAtLeast "3.13";

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-lLNjBv5baER0AArX3IV4HWjDZ2jTGXyGIvnHupR8MGM=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0005-drop-intel-reqs.patch
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the python version check because it hard-codes minor versions and
    # lags behind `ray`'s python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    cbor2
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pybase64
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    setproctitle
    openai-harmony
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

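  # vLLM's setup.py drives CMake itself, so skip the generic CMake configure phase; the
  # flags below reach CMake through the setup.py patched above (0002-...-respect-cmakeFlags).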
  dontUseCmakeConfigure = true;
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ]
  ++ lib.optionals cpuSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
  ];

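  # VLLM_TARGET_DEVICE selects which backend vLLM builds: cuda, rocm, or cpu.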
  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # updates the cutlass fetcher instead
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}