backport fix for AMX register breakage
vmexits can cause the AMX registers to "misbehave", which can break ZFS even though ZFS doesn't use AMX at all. This causes crashes and processes hanging forever in uninterruptible sleep (the infamous D state) on Intel Xeon 4th gen HW; possibly other HW too, but we only got reports on Sapphire Rapids models. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com> Tested-by: Aaron Lauterer <a.lauterer@proxmox.com>
This commit is contained in:
parent
8c6520d1fc
commit
9e8946d4b9
87
debian/patches/0011-Avoid-save-restoring-AMX-registers-to-avoid-a-SPR-er.patch
vendored
Normal file
87
debian/patches/0011-Avoid-save-restoring-AMX-registers-to-avoid-a-SPR-er.patch
vendored
Normal file
@@ -0,0 +1,87 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
|
||||
Date: Sat, 26 Aug 2023 14:25:46 -0400
|
||||
Subject: [PATCH] Avoid save/restoring AMX registers to avoid a SPR erratum
|
||||
|
||||
Intel SPR erratum SPR4 says that if you trip into a vmexit while
|
||||
doing FPU save/restore, your AMX register state might misbehave...
|
||||
and by misbehave, I mean save all zeroes incorrectly, leading to
|
||||
explosions if you restore it.
|
||||
|
||||
Since we're not using AMX for anything, the simple way to avoid
|
||||
this is to just not save/restore those when we do anything, since
|
||||
we're killing preemption of any sort across our save/restores.
|
||||
|
||||
If we ever decide to use AMX, it's not clear that we have any
|
||||
way to mitigate this, on Linux...but I am not an expert.
|
||||
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
|
||||
Closes #14989
|
||||
Closes #15168
|
||||
(cherry picked from commit 277f2e587b085d1eb8aa48b4ac0768a9ef5745ab)
|
||||
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
||||
---
|
||||
include/os/linux/kernel/linux/simd_x86.h | 19 ++++++++++++++-----
|
||||
1 file changed, 14 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h
|
||||
index 660f0d42d..455167ac8 100644
|
||||
--- a/include/os/linux/kernel/linux/simd_x86.h
|
||||
+++ b/include/os/linux/kernel/linux/simd_x86.h
|
||||
@@ -157,6 +157,15 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
+#ifndef XFEATURE_MASK_XTILE
|
||||
+/*
|
||||
+ * For kernels where this doesn't exist yet, we still don't want to break
|
||||
+ * by save/restoring this broken nonsense.
|
||||
+ * See issue #14989 or Intel errata SPR4 for why
|
||||
+ */
|
||||
+#define XFEATURE_MASK_XTILE 0x60000
|
||||
+#endif
|
||||
+
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
@@ -319,18 +328,18 @@ kfpu_begin(void)
|
||||
union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()];
|
||||
#if defined(HAVE_XSAVES)
|
||||
if (static_cpu_has(X86_FEATURE_XSAVES)) {
|
||||
- kfpu_do_xsave("xsaves", &state->xsave, ~0);
|
||||
+ kfpu_do_xsave("xsaves", &state->xsave, ~XFEATURE_MASK_XTILE);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_XSAVEOPT)
|
||||
if (static_cpu_has(X86_FEATURE_XSAVEOPT)) {
|
||||
- kfpu_do_xsave("xsaveopt", &state->xsave, ~0);
|
||||
+ kfpu_do_xsave("xsaveopt", &state->xsave, ~XFEATURE_MASK_XTILE);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (static_cpu_has(X86_FEATURE_XSAVE)) {
|
||||
- kfpu_do_xsave("xsave", &state->xsave, ~0);
|
||||
+ kfpu_do_xsave("xsave", &state->xsave, ~XFEATURE_MASK_XTILE);
|
||||
} else if (static_cpu_has(X86_FEATURE_FXSR)) {
|
||||
kfpu_save_fxsr(&state->fxsave);
|
||||
} else {
|
||||
@@ -415,12 +424,12 @@ kfpu_end(void)
|
||||
union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()];
|
||||
#if defined(HAVE_XSAVES)
|
||||
if (static_cpu_has(X86_FEATURE_XSAVES)) {
|
||||
- kfpu_do_xrstor("xrstors", &state->xsave, ~0);
|
||||
+ kfpu_do_xrstor("xrstors", &state->xsave, ~XFEATURE_MASK_XTILE);
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
if (static_cpu_has(X86_FEATURE_XSAVE)) {
|
||||
- kfpu_do_xrstor("xrstor", &state->xsave, ~0);
|
||||
+ kfpu_do_xrstor("xrstor", &state->xsave, ~XFEATURE_MASK_XTILE);
|
||||
} else if (static_cpu_has(X86_FEATURE_FXSR)) {
|
||||
kfpu_restore_fxsr(&state->fxsave);
|
||||
} else {
|
||||
--
|
||||
2.39.2
|
||||
|
1
debian/patches/series
vendored
1
debian/patches/series
vendored
@@ -8,3 +8,4 @@
|
||||
0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
|
||||
0009-arcstat-Fix-integer-division-with-python3.patch
|
||||
0010-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
|
||||
0011-Avoid-save-restoring-AMX-registers-to-avoid-a-SPR-er.patch
|
||||
|
Loading…
Reference in New Issue
Block a user