pkgs/development/python-modules/torch/source/gcc-14-arm-compat.path at master · pyrox.dev/nixpkgs

pyrox.dev / nixpkgs
lol
nixpkgs / pkgs / development / python-modules / torch / source / gcc-14-arm-compat.path
at master 2.4 kB view raw
 1diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
 2index 7f05c2ad166..1632b595c4c 100644
 3--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
 4+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
 5@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
 6   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
 7 };
 8
 9-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
10-    const Vectorized<c10::BFloat16>& a) {
11+#if defined(__GNUC__) && __GNUC__ == 14
12+// Workaround for a gcc-14.2.0 internal compiler error (ICE, "during RTL pass: vregs") when compiling for SVE
13+__attribute__((optimize("no-tree-vectorize")))
14+#endif
15+inline std::tuple<Vectorized<float>, Vectorized<float>>
16+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
17   static_assert(
18       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
19   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
20diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
21index 52d5383e60f..00c9f4eb253 100644
22--- a/aten/src/ATen/native/cpu/Activation.cpp
23+++ b/aten/src/ATen/native/cpu/Activation.cpp
24@@ -26,6 +26,10 @@ namespace at::native {
25
26 namespace {
27
28+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
29+// Workaround for a gcc-14.2.0 internal compiler error (ICE, "during RTL pass: expand") when compiling for NEON
30+__attribute__((optimize("no-tree-vectorize")))
31+#endif
32 static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
33   if (at::isReducedFloatingType(input.scalar_type())) {
34     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
35diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
36index 8ef0741e77a..8c94decfff0 100644
37--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
38+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
39@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
40
41 /* note: due to write issues, this one cannot be parallelized as well as
42  * unfolded2d_copy */
43+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
44+// Workaround for a gcc-14.2.0 internal compiler error (ICE, "during RTL pass: vregs") when compiling for SVE without BF16
45+__attribute__((optimize("no-tree-vectorize")))
46+#endif
47 void unfolded2d_acc_kernel(
48     ScalarType dtype,
49     void *finput_data,