# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit` (see the usage sketch at
# the end of this file).
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # nativeBuildInputs
  cmake,
  ninja,
  cudaPackages,

  # dependencies
  click,
  einops,
  numpy,
  pynvml,
  tabulate,
  torch,
  tqdm,
}:

buildPythonPackage rec {
  pname = "flashinfer";
  version = "0.3.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    fetchSubmodules = true;
    hash = "sha256-e9PfLfU0DdoLKlXiHylCbGd125c7Iw9y4NDIOAP0xHs=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];

  dontUseCmakeConfigure = true;

  buildInputs = with cudaPackages; [
    cuda_cccl
    cuda_cudart
    libcublas
    libcurand
  ];

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version (see the override sketch at the end of
  # this file for a JIT-only alternative).
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  FLASHINFER_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  pythonRemoveDeps = [
    "nvidia-cudnn-frontend"
  ];
  dependencies = [
    click
    einops
    numpy
    pynvml
    tabulate
    torch
    tqdm
  ];

  meta = {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ breakds ];
  };
}
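# ---------------------------------------------------------------------------
# Usage sketch (illustrative comment only, not part of the package above):
# the NOTE at the top says consumers must expose `CUDA_HOME` so FlashInfer's
# runtime JIT fallback can find nvcc. A minimal, untested example of a
# development shell doing that could look like the snippet below; `pkgs`,
# `python3.withPackages`, and the `flashinfer`/`cudatoolkit` attribute names
# are assumptions about the consuming code, not something this file defines.
#
#   pkgs.mkShell {
#     packages = [
#       (pkgs.python3.withPackages (ps: [
#         ps.flashinfer
#         ps.torch
#       ]))
#       pkgs.cudaPackages.cudatoolkit
#     ];
#     # Point CUDA_HOME at the toolkit so the runtime JIT can invoke nvcc.
#     env.CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
#   }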
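# Override sketch (illustrative comment only): the expression above selects
# AOT mode by exporting FLASHINFER_ENABLE_AOT=1 in preConfigure. If a
# JIT-only build were preferred (faster build, kernels compiled on first
# use), a rough, untested way to get it would be to drop that export via an
# override; `flashinfer` here stands for the package built from this file.
#
#   flashinfer.overridePythonAttrs (old: {
#     preConfigure = ''
#       export TORCH_NVCC_FLAGS="--maxrregcount=64"
#       export MAX_JOBS="$NIX_BUILD_CORES"
#     '';
#   })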