pkgs/development/python-modules/triton/default.nix at master · pyrox.dev/nixpkgs

pyrox.dev / nixpkgs
lol
nixpkgs / pkgs / development / python-modules / triton / default.nix
at master 9.6 kB view raw
  1{
  2  lib,
  3  addDriverRunpath,
  4  buildPythonPackage,
  5  cmake,
  6  config,
  7  cudaPackages,
  8  fetchFromGitHub,
  9  filelock,
 10  gtest,
 11  libxml2,
 12  lit,
 13  llvm,
 14  ncurses,
 15  ninja,
 16  pybind11,
 17  python,
 18  pytestCheckHook,
 19  writableTmpDirAsHomeHook,
 20  stdenv,
 21  replaceVars,
 22  setuptools,
 23  torchWithRocm,
 24  zlib,
 25  cudaSupport ? config.cudaSupport,
 26  runCommand,
 27  rocmPackages,
 28  triton,
 29}:
 30
 31buildPythonPackage rec {
 32  pname = "triton";
 33  version = "3.4.0";
 34  pyproject = true;
 35
 36  # Remember to bump triton-llvm as well!
     # The fetched tag is derived from `version` ("v${version}"), so changing
     # `version` also requires updating `hash` below.
 37  src = fetchFromGitHub {
 38    owner = "triton-lang";
 39    repo = "triton";
 40    tag = "v${version}";
 41    hash = "sha256-78s9ke6UV7Tnx3yCr0QZcVDqQELR4XoGgJY7olNJmjk=";
 42  };
 43
 44  patches = [
       # Fills the `ccCmdExtraFlags` placeholder of the patch with a linker flag
       # that adds `${addDriverRunpath.driverLink}/lib` to the rpath.
 45    (replaceVars ./0001-_build-allow-extra-cc-flags.patch {
 46      ccCmdExtraFlags = "-Wl,-rpath,${addDriverRunpath.driverLink}/lib";
 47    })
       # `libcudaStubsDir` points at the cuda_cudart "stubs" output only when
       # `cudaSupport` is enabled; otherwise `null` is passed.
       # NOTE(review): presumably `replaceVars` leaves the placeholder untouched
       # for a null value - confirm against the replaceVars documentation.
 48    (replaceVars ./0002-nvidia-driver-short-circuit-before-ldconfig.patch {
 49      libcudaStubsDir =
 50        if cudaSupport then "${lib.getOutput "stubs" cudaPackages.cuda_cudart}/lib/stubs" else null;
 51    })
 52    # Upstream PR: https://github.com/triton-lang/triton/pull/7959
 53    ./0005-amd-search-env-paths.patch
 54  ]
     # The remaining patches reference cudaPackages and are applied only for CUDA builds.
 55  ++ lib.optionals cudaSupport [
 56    (replaceVars ./0003-nvidia-cudart-a-systempath.patch {
 57      cudaToolkitIncludeDirs = "${lib.getInclude cudaPackages.cuda_cudart}/include";
 58    })
       # The `ptxas` path from cuda_nvcc is shell-escaped before being substituted into the patch.
 59    (replaceVars ./0004-nvidia-allow-static-ptxas-path.patch {
 60      nixpkgsExtraBinaryPaths = lib.escapeShellArgs [ (lib.getExe' cudaPackages.cuda_nvcc "ptxas") ];
 61    })
 62  ];
 63
 64  postPatch =
 65    # Avoid downloading dependencies: empty the lists built from
       # `get_json_package_info()` and `get_llvm_package_info()`, replace the
       # `triton.profiler` (proton) package entry with `pass`, and replace the
       # `curr_version.group(1) != version` comparison with `False`.
       # `--replace-fail` makes the build fail if any of these strings is no
       # longer present in setup.py.
 66    ''
 67      substituteInPlace setup.py \
 68        --replace-fail "[get_json_package_info()]" "[]" \
 69        --replace-fail "[get_llvm_package_info()]" "[]" \
 70        --replace-fail 'yield ("triton.profiler", "third_party/proton/proton")' 'pass' \
 71        --replace-fail "curr_version.group(1) != version" "False"
 72    ''
 73    # Use our `cmakeFlags` instead and avoid downloading dependencies:
       # setup.py is made to append the whitespace-split contents of the
       # `cmakeFlags` environment variable to its CMake arguments.
 74    + ''
 75      substituteInPlace setup.py \
 76        --replace-fail \
 77          "cmake_args.extend(thirdparty_cmake_args)" \
 78          "cmake_args.extend(thirdparty_cmake_args + os.environ.get('cmakeFlags', \"\").split())"
 79    ''
 80    # Don't fetch googletest: drop the include of unittest/googletest.cmake and
       # look GTest up with `find_package(GTest REQUIRED)` instead.
 81    + ''
 82      substituteInPlace cmake/AddTritonUnitTest.cmake \
 83        --replace-fail "include(\''${PROJECT_SOURCE_DIR}/unittest/googletest.cmake)" ""\
 84        --replace-fail "include(GoogleTest)" "find_package(GTest REQUIRED)"
 85    ''
 86    # Don't use FHS path for ROCm LLD: the hard-coded /opt/rocm path is replaced
       # by a lookup of the `HIP_PATH` environment variable, with `/opt/rocm/` kept
       # as the fallback value.
 87    # Remove this after `[AMD] Use lld library API #7548` makes it into a release
 88    + ''
 89      substituteInPlace third_party/amd/backend/compiler.py \
 90        --replace-fail 'lld = Path("/opt/rocm/llvm/bin/ld.lld")' \
 91        "import os;lld = Path(os.getenv('HIP_PATH', '/opt/rocm/')"' + "/llvm/bin/ld.lld")'
 92    '';
 93
 94  build-system = [ setuptools ];
 95
 96  nativeBuildInputs = [
       # CMake and Ninja are needed even though `dontUseCmakeConfigure` is set:
       # setup.py runs CMake itself.
 97    cmake
 98    ninja
 99
100    # Note for future:
101    # These *probably* should go in depsTargetTarget
102    # ...but we cannot test cross right now anyway
103    # because we only support cudaPackages on x86_64-linux atm
104    lit
105    llvm
106
107    # Upstream's setup.py tries to write cache somewhere in ~/
108    writableTmpDirAsHomeHook
109  ];
110
111  cmakeFlags = [
       # Reaches CMake through the `cmakeFlags` environment variable, which the
       # postPatch edit makes setup.py append to its CMake arguments.
112    (lib.cmakeFeature "LLVM_SYSPATH" "${llvm}")
113  ];
114
115  buildInputs = [
       # gtest is looked up with `find_package(GTest REQUIRED)` after the
       # postPatch edit of cmake/AddTritonUnitTest.cmake.
116    gtest
117    libxml2.dev
118    ncurses
119    pybind11
120    zlib
121  ];
122
123  dependencies = [
124    filelock
125    # triton uses setuptools at runtime:
126    # https://github.com/NixOS/nixpkgs/pull/286763/#discussion_r1480392652
127    setuptools
128  ];
129
130  NIX_CFLAGS_COMPILE = lib.optionals cudaSupport [
       # The list is empty unless `cudaSupport` is enabled.
131    # Pybind11 started generating strange errors since python 3.12. Observed only in the CUDA branch.
132    # https://gist.github.com/SomeoneSerge/7d390b2b1313957c378e99ed57168219#file-gistfile0-txt-L1042
133    "-Wno-stringop-overread"
134  ];
135
136  preConfigure =
137    # Ensure that the build process uses the requested number of cores:
       # `MAX_JOBS` is exported with the value of `NIX_BUILD_CORES`.
138    ''
139      export MAX_JOBS="$NIX_BUILD_CORES"
140    '';
141
142  env = {
       # Proton is switched off here; postPatch also removes the
       # `triton.profiler` package entry from setup.py.
143    TRITON_BUILD_PROTON = "OFF";
144    TRITON_OFFLINE_BUILD = true;
145  }
     # CUDA-only variables: the compilers come from the CUDA backend stdenv and
     # the TRITON_*_PATH entries point at the corresponding cudaPackages outputs.
146  // lib.optionalAttrs cudaSupport {
147    CC = lib.getExe' cudaPackages.backendStdenv.cc "cc";
148    CXX = lib.getExe' cudaPackages.backendStdenv.cc "c++";
149
150    # TODO: Unused because of how TRITON_OFFLINE_BUILD currently works (subject to change)
151    TRITON_PTXAS_PATH = lib.getExe' cudaPackages.cuda_nvcc "ptxas"; # Make sure cudaPackages is the right version each update (See python/setup.py)
152    TRITON_CUOBJDUMP_PATH = lib.getExe' cudaPackages.cuda_cuobjdump "cuobjdump";
153    TRITON_NVDISASM_PATH = lib.getExe' cudaPackages.cuda_nvdisasm "nvdisasm";
154    TRITON_CUDACRT_PATH = lib.getInclude cudaPackages.cuda_nvcc;
155    TRITON_CUDART_PATH = lib.getInclude cudaPackages.cuda_cudart;
156    TRITON_CUPTI_PATH = cudaPackages.cuda_cupti;
157  };
158
159  pythonRemoveDeps = [
160    # Circular dependency, cf. https://github.com/triton-lang/triton/issues/1374
161    "torch"
162
163    # CLI tools without dist-info
164    "cmake"
165    "lit"
166  ];
167
168  # CMake is run by setup.py instead, so the CMake configure step is disabled;
     # extra CMake arguments are passed through `cmakeFlags` (see postPatch).
169  dontUseCmakeConfigure = true;
170
171  nativeCheckInputs = [ cmake ];
     # Runs `ctest` inside the build/temp* directory. The parentheses start a
     # subshell, so the `cd` does not change the working directory for the rest
     # of the check phase.
172  preCheck = ''
173    # build/temp* refers to build_ext.build_temp (looked up in the build logs)
174    (cd ./build/temp* ; ctest)
175  '';
176
     # Modules that must be importable from the built package.
177  pythonImportsCheck = [
178    "triton"
179    "triton.language"
180  ];
181
182  passthru.gpuCheck = stdenv.mkDerivation {
       # Separate derivation that runs pytest in python/test/unit of the triton
       # sources. It reuses `version` and `src` from the `triton` argument and
       # produces an empty `$out` as a success marker.
183    pname = "triton-pytest";
184    inherit (triton) version src;
185
       # Only schedulable on builders that advertise the "cuda" system feature.
186    requiredSystemFeatures = [ "cuda" ];
187
188    nativeBuildInputs = [
189      (python.withPackages (ps: [
190        ps.scipy
191        ps.torchWithCuda
192        ps.triton-cuda
193      ]))
194    ];
195
       # Nothing is compiled here; the tests run against `ps.triton-cuda` from
       # the Python environment above.
196    dontBuild = true;
197    nativeCheckInputs = [
198      pytestCheckHook
199      writableTmpDirAsHomeHook
200    ];
201
202    doCheck = true;
203
204    preCheck = ''
205      cd python/test/unit
206    '';
       # NOTE(review): pytestCheckHook may additionally register
       # `pytestCheckPhase` on its own; confirm the tests are not run twice
       # (a second `cd python/test/unit` from preCheck would then fail).
207    checkPhase = "pytestCheckPhase";
208
209    installPhase = "touch $out";
210  };
211
212  passthru.tests = {
213    # Ultimately, torch is our test suite:
214    inherit torchWithRocm;
215
216    # Test that _get_path_to_hip_runtime_dylib works when ROCm is available at runtime.
       # The Python snippet asserts that the returned path exists; `$out` is
       # only created when the snippet exits successfully.
217    rocm-libamdhip64-path =
218      runCommand "triton-rocm-libamdhip64-path-test"
219        {
220          buildInputs = [
221            triton
222            python
223            rocmPackages.clr
224          ];
225        }
226        ''
227          python -c "
228          import os
229          import triton
230          path = triton.backends.amd.driver._get_path_to_hip_runtime_dylib()
231          print(f'libamdhip64 path: {path}')
232          assert os.path.exists(path)
233          " && touch $out
234        '';
235
236    # Test that path_to_rocm_lld works when ROCm is available at runtime.
       # The Python snippet asserts that the returned path exists; `$out` is
       # only created when the snippet exits successfully.
237    # Remove this after `[AMD] Use lld library API #7548` makes it into a release
238    rocm-lld-path =
239      runCommand "triton-rocm-lld-test"
240        {
241          buildInputs = [
242            triton
243            python
244            rocmPackages.clr
245          ];
246        }
247        ''
248          python -c "
249          import os
250          import triton
251          path = triton.backends.backends['amd'].compiler.path_to_rocm_lld()
252          print(f'ROCm LLD path: {path}')
253          assert os.path.exists(path)
254          " && touch $out
255        '';
256
257    # Test as `nix run -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda`
258    # or, using `programs.nix-required-mounts`, as `nix build -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda.gpuCheck`
       #
       # The script compares a Triton `a * x + y` kernel with the same expression
       # computed by torch on CUDA tensors. Before running, it redirects HOME away
       # from /homeless-shelter and, when CC is unset, sets it to the CUDA backend
       # stdenv compiler (interpolated by Nix into the script text).
259    axpy-cuda =
260      cudaPackages.writeGpuTestPython
261        {
             # NOTE(review): `torch-no-triton` is presumably used because of the
             # torch <-> triton cycle mentioned at `pythonRemoveDeps` - confirm.
262          libraries = ps: [
263            ps.triton
264            ps.torch-no-triton
265          ];
266        }
267        ''
268          # Adopted from Philippe Tillet https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html
269
270          import triton
271          import triton.language as tl
272          import torch
273          import os
274
275          @triton.jit
276          def axpy_kernel(n, a: tl.constexpr, x_ptr, y_ptr, out, BLOCK_SIZE: tl.constexpr):
277            pid = tl.program_id(axis=0)
278            block_start = pid * BLOCK_SIZE
279            offsets = block_start + tl.arange(0, BLOCK_SIZE)
280            mask = offsets < n
281            x = tl.load(x_ptr + offsets, mask=mask)
282            y = tl.load(y_ptr + offsets, mask=mask)
283            output = a * x + y
284            tl.store(out + offsets, output, mask=mask)
285
286          def axpy(a, x, y):
287            output = torch.empty_like(x)
288            assert x.is_cuda and y.is_cuda and output.is_cuda
289            n_elements = output.numel()
290
291            def grid(meta):
292              return (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
293
294            axpy_kernel[grid](n_elements, a, x, y, output, BLOCK_SIZE=1024)
295            return output
296
297          if __name__ == "__main__":
298            if os.environ.get("HOME", None) == "/homeless-shelter":
299              os.environ["HOME"] = os.environ.get("TMPDIR", "/tmp")
300            if "CC" not in os.environ:
301              os.environ["CC"] = "${lib.getExe' cudaPackages.backendStdenv.cc "cc"}"
302            torch.manual_seed(0)
303            size = 12345
304            x = torch.rand(size, device='cuda')
305            y = torch.rand(size, device='cuda')
306            output_torch = 3.14 * x + y
307            output_triton = axpy(3.14, x, y)
308            assert output_torch.sub(output_triton).abs().max().item() < 1e-6
309            print("Triton axpy: OK")
310        '';
311  };
312
313  meta = {
       # Package metadata; `platforms` restricts the package to Linux.
314    description = "Language and compiler for writing highly efficient custom Deep-Learning primitives";
315    homepage = "https://github.com/triton-lang/triton";
316    platforms = lib.platforms.linux;
317    license = lib.licenses.mit;
318    maintainers = with lib.maintainers; [
319      SomeoneSerge
320      derdennisop
321    ];
322  };
323}