# NOTE: At runtime, FlashInfer falls back to PyTorch’s JIT compilation whenever a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (i.e. nvcc) to be available.
#
# This means that if you plan to use flashinfer, you need to point the
# `CUDA_HOME` environment variable at a CUDA toolkit (e.g. `cudatoolkit`).
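#
# A minimal sketch of how a downstream dev shell could satisfy that requirement
# (the attribute paths below are illustrative, not provided by this file):
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     shellHook = ''
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }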
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # nativeBuildInputs
  cmake,
  ninja,
  cudaPackages,

  # dependencies
  click,
  einops,
  numpy,
  pynvml,
  tabulate,
  torch,
  tqdm,
}:

buildPythonPackage rec {
  pname = "flashinfer";
  version = "0.3.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    fetchSubmodules = true;
    hash = "sha256-e9PfLfU0DdoLKlXiHylCbGd125c7Iw9y4NDIOAP0xHs=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
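    # lib.getBin selects cuda_nvcc's `bin` output so nvcc is on PATH while the
    # AOT kernels are compiled during the build.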
    (lib.getBin cudaPackages.cuda_nvcc)
  ];

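  # The Python package is built with setuptools; prevent the cmake setup hook
  # from running its own configure phase.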
  dontUseCmakeConfigure = true;

  buildInputs = with cudaPackages; [
    cuda_cccl
    cuda_cudart
    libcublas
    libcurand
  ];

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

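  # torch.cudaCapabilities is a list of compute capabilities such as
  # [ "8.6" "9.0" ]; joined with ";" this becomes e.g. "8.6;9.0", the usual
  # semicolon-separated CUDA arch-list format (same shape as TORCH_CUDA_ARCH_LIST).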
  FLASHINFER_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  pythonRemoveDeps = [
    "nvidia-cudnn-frontend"
  ];
  dependencies = [
    click
    einops
    numpy
    pynvml
    tabulate
    torch
    tqdm
  ];

  meta = {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ breakds ];
  };
}