# NOTE: At runtime, FlashInfer falls back to PyTorch’s JIT compilation whenever a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (i.e. nvcc) to be available.
#
# This means that if you plan to use flashinfer, you need to point the
# `CUDA_HOME` environment variable at a CUDA toolkit (e.g. `cudatoolkit`).
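#
# A minimal sketch of how a downstream dev shell could satisfy that requirement
# (the attribute paths below are illustrative, not provided by this file):
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     shellHook = ''
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }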
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # nativeBuildInputs
  cmake,
  ninja,
  cudaPackages,

  # dependencies
  click,
  einops,
  numpy,
  pynvml,
  tabulate,
  torch,
  tqdm,
}:

buildPythonPackage rec {
  pname = "flashinfer";
  version = "0.3.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    fetchSubmodules = true;
    hash = "sha256-e9PfLfU0DdoLKlXiHylCbGd125c7Iw9y4NDIOAP0xHs=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
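    # lib.getBin selects cuda_nvcc's `bin` output so nvcc is on PATH while the
    # AOT kernels are compiled during the build.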
    (lib.getBin cudaPackages.cuda_nvcc)
  ];

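  # The Python package is built with setuptools; prevent the cmake setup hook
  # from running its own configure phase.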
  dontUseCmakeConfigure = true;

  buildInputs = with cudaPackages; [
    cuda_cccl
    cuda_cudart
    libcublas
    libcurand
  ];

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

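  # torch.cudaCapabilities is a list of compute capabilities such as
  # [ "8.6" "9.0" ]; joined with ";" this becomes e.g. "8.6;9.0", the usual
  # semicolon-separated CUDA arch-list format (same shape as TORCH_CUDA_ARCH_LIST).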
  FLASHINFER_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  pythonRemoveDeps = [
    "nvidia-cudnn-frontend"
  ];
  dependencies = [
    click
    einops
    numpy
    pynvml
    tabulate
    torch
    tqdm
  ];

  meta = {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ breakds ];
  };
}