add KPTI and related patches

picked from Ubuntu-4.13.0-23.26
This commit is contained in:
Fabian Grünbichler 2018-01-06 15:13:39 +01:00
parent 19894df472
commit 321d628a98
231 changed files with 35084 additions and 14 deletions

View File

@ -1,7 +1,7 @@
From 8e8e48c6f1ec020ff47f50aa49acab6c850cc70e Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Tue, 12 May 2015 19:29:22 +0100
Subject: [PATCH 01/14] Make mkcompile_h accept an alternate timestamp string
Subject: [PATCH 001/231] Make mkcompile_h accept an alternate timestamp string
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From d9166325bf8b4d5a4c7aeb6a15c30c90ffc28347 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
Date: Thu, 14 Sep 2017 11:02:18 +0200
Subject: [PATCH 02/14] bridge: keep MAC of first assigned port
Subject: [PATCH 002/231] bridge: keep MAC of first assigned port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From 469fd3d2d05583a20c8210354cf0ad6cbd2360f7 Mon Sep 17 00:00:00 2001
From: Mark Weiman <mark.weiman@markzz.com>
Date: Sat, 29 Jul 2017 09:15:32 -0400
Subject: [PATCH 03/14] pci: Enable overrides for missing ACS capabilities
Subject: [PATCH 003/231] pci: Enable overrides for missing ACS capabilities
(4.12+)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8

View File

@ -1,7 +1,7 @@
From 6003e55f5d4762a819d6691de92d75d29b6c0d58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
Date: Thu, 14 Sep 2017 11:09:58 +0200
Subject: [PATCH 04/14] kvm: disable default dynamic halt polling growth
Subject: [PATCH 004/231] kvm: disable default dynamic halt polling growth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From f33ce8d7dcb3053b513003fb775d6457d30d4921 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 17 Aug 2017 15:33:09 -0400
Subject: [PATCH 05/14] cgroup: Add mount flag to enable cpuset to use v2
Subject: [PATCH 005/231] cgroup: Add mount flag to enable cpuset to use v2
behavior in v1 cgroup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8

View File

@ -1,7 +1,7 @@
From 98df2e6815f8bfb7fb07458a067ddc96e7fe917d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 17 Aug 2017 15:33:10 -0400
Subject: [PATCH 06/14] cpuset: Allow v2 behavior in v1 cgroup
Subject: [PATCH 006/231] cpuset: Allow v2 behavior in v1 cgroup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From b6f813dded8f92cf6df31e1bcad4600b11dd4ae3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Thu, 30 Nov 2017 19:05:45 +0100
Subject: [PATCH 07/14] KVM: x86: fix APIC page invalidation
Subject: [PATCH 007/231] KVM: x86: fix APIC page invalidation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From 8ddb7f99e8c2ad80dbe3f9de01e8af5c310ae52d Mon Sep 17 00:00:00 2001
From: Wei Xu <wexu@redhat.com>
Date: Fri, 1 Dec 2017 05:10:36 -0500
Subject: [PATCH 08/14] vhost: fix skb leak in handle_rx()
Subject: [PATCH 008/231] vhost: fix skb leak in handle_rx()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From 442f5963a52060fcf86a73377c31a863738632dd Mon Sep 17 00:00:00 2001
From: Wei Xu <wexu@redhat.com>
Date: Fri, 1 Dec 2017 05:10:37 -0500
Subject: [PATCH 09/14] tun: free skb in early errors
Subject: [PATCH 009/231] tun: free skb in early errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From 3fe5d7c8bcba7d240e74c119c2c4ad1c696f205c Mon Sep 17 00:00:00 2001
From: Wei Xu <wexu@redhat.com>
Date: Fri, 1 Dec 2017 05:10:38 -0500
Subject: [PATCH 10/14] tap: free skb if flags error
Subject: [PATCH 010/231] tap: free skb if flags error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From 406a5590ca8c58f0f92927230285a3388e4527e4 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 5 Jan 2018 23:51:12 +0100
Subject: [PATCH 11/14] IB/core: Avoid crash on pkey enforcement failed in
Subject: [PATCH 011/231] IB/core: Avoid crash on pkey enforcement failed in
received MADs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8

View File

@ -1,7 +1,7 @@
From 72083c18eb8824dd1d0580c1382d23f4fbc4ed33 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@mellanox.com>
Date: Mon, 20 Nov 2017 16:47:45 -0600
Subject: [PATCH 12/14] IB/core: Don't enforce PKey security on SMI MADs
Subject: [PATCH 012/231] IB/core: Don't enforce PKey security on SMI MADs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -1,7 +1,7 @@
From 0140f5df6cd9e326f3009a16c1b66139b9bb3b45 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 6 Nov 2017 13:31:12 +0100
Subject: [PATCH 13/14] kvm: vmx: Reinstate support for CPUs without virtual
Subject: [PATCH 013/231] kvm: vmx: Reinstate support for CPUs without virtual
NMI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8

View File

@ -1,7 +1,7 @@
From a0212ec7cc4bc2f88c4435cca881d21f2b079a80 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 26 Oct 2017 09:13:27 +0200
Subject: [PATCH 14/14] KVM: SVM: obey guest PAT
Subject: [PATCH 014/231] KVM: SVM: obey guest PAT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

View File

@ -0,0 +1,83 @@
From 95e4ae0f4ad738ff6ec8e44ab9fa5529d4369655 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:20 -0700
Subject: [PATCH 015/231] x86/mm: Add the 'nopcid' boot option to turn off PCID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The parameter is only present on x86_64 systems to save a few bytes,
as PCID is always disabled on x86_32.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 0790c9aad84901ca1bdc14746175549c8b5da215)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 62d3a63645c17611fe8ccc0c5adc5e840d9cff7b)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
Documentation/admin-guide/kernel-parameters.txt | 2 ++
arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 73fd6abac39b..3510e255ef4c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2700,6 +2700,8 @@
nopat [X86] Disable PAT (page attribute table extension of
pagetables) support.
+ nopcid [X86-64] Disable the PCID cpu feature.
+
norandmaps Don't use address space randomization. Equivalent to
echo 0 > /proc/sys/kernel/randomize_va_space
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b39870f33e..904485e7b230 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
}
__setup("nompx", x86_mpx_setup);
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
static int __init x86_noinvpcid_setup(char *s)
{
/* noinvpcid doesn't accept parameters */
--
2.14.2

View File

@ -0,0 +1,120 @@
From bbdde34293757490c18c57d8bd9f92e567bbdbcd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:21 -0700
Subject: [PATCH 016/231] x86/mm: Enable CR4.PCIDE on supported systems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
We can use PCID if the CPU has PCID and PGE and we're not on Xen.
By itself, this has no effect. A followup patch will start using PCID.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 7d6bbe5528395f18de50bd2532843546c849883d)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/tlbflush.h | 8 ++++++++
arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++
arch/x86/xen/enlighten_pv.c | 6 ++++++
3 files changed, 36 insertions(+)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 50ea3482e1d1..2b3d68093235 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -207,6 +207,14 @@ static inline void __flush_tlb_all(void)
__flush_tlb_global();
else
__flush_tlb();
+
+ /*
+ * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+ * we'd end up flushing kernel translations for the current ASID but
+ * we might fail to flush kernel translations for other cached ASIDs.
+ *
+ * To avoid this issue, we force PCID off if PGE is off.
+ */
}
static inline void __flush_tlb_one(unsigned long addr)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 904485e7b230..b95cd94ca97b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -329,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ if (cpu_has(c, X86_FEATURE_PGE)) {
+ cr4_set_bits(X86_CR4_PCIDE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+ }
+ }
+}
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1143,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
+ /* Set up PCID */
+ setup_pcid(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 811e4ddb3f37..290bc5ac9852 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -264,6 +264,12 @@ static void __init xen_init_capabilities(void)
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ /*
+ * Xen PV would need some work to support PCID: CR3 handling as well
+ * as xen_flush_tlb_others() would need updating.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+
if (!xen_initial_domain())
setup_clear_cpu_cap(X86_FEATURE_ACPI);
--
2.14.2

View File

@ -0,0 +1,54 @@
From 20e07f035810f1b2bb3d816e49f48f6b6a37bf64 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 6 Sep 2017 19:54:54 -0700
Subject: [PATCH 017/231] x86/mm: Document how CR4.PCIDE restore works
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
While debugging a problem, I thought that using
cr4_set_bits_and_update_boot() to restore CR4.PCIDE would be
helpful. It turns out to be counterproductive.
Add a comment documenting how this works.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
(cherry picked from commit 1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 0d69e4c4a2db42a9bac6609a3df15bd91163f8b9)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/cpu/common.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b95cd94ca97b..0b80ed14ff52 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -333,6 +333,19 @@ static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) {
+ /*
+ * We'd like to use cr4_set_bits_and_update_boot(),
+ * but we can't. CR4.PCIDE is special and can only
+ * be set in long mode, and the early CPU init code
+ * doesn't know this and would try to restore CR4.PCIDE
+ * prior to entering long mode.
+ *
+ * Instead, we rely on the fact that hotplug, resume,
+ * etc all fully restore CR4 before they write anything
+ * that could have nonzero PCID bits to CR3. CR4.PCIDE
+ * has no effect on the page tables themselves, so we
+ * don't need it to be restored early.
+ */
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
--
2.14.2

View File

@ -0,0 +1,202 @@
From 2a767692d6140051e569ab59a1440b3760839e03 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 11 Jul 2017 10:33:38 -0500
Subject: [PATCH 018/231] x86/entry/64: Refactor IRQ stacks and make them
NMI-safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This will allow IRQ stacks to nest inside NMIs or similar entries
that can happen during IRQ stack setup or teardown.
The new macros won't work correctly if they're invoked with IRQs on.
Add a check under CONFIG_DEBUG_ENTRY to detect that.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
[ Use %r10 instead of %r11 in xen_do_hypervisor_callback to make objtool
and ORC unwinder's lives a little easier. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/b0b2ff5fb97d2da2e1d7e1f380190c92545c8bb5.1499786555.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 1d3e53e8624a3ec85f4041ca6d973da7c1575938)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit be58b042e135d0ee777a54798f33015857d7e2e0)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/process_64.c | 3 ++
arch/x86/Kconfig.debug | 2 --
arch/x86/entry/entry_64.S | 85 +++++++++++++++++++++++++++++++-------------
3 files changed, 64 insertions(+), 26 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index fe56e6f93cbb..1e7701c4cd80 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -404,6 +404,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
+
switch_fpu_prepare(prev_fpu, cpu);
/* We must save %fs and %gs before load_TLS() because
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index cd20ca0b4043..1fc519f3c49e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -305,8 +305,6 @@ config DEBUG_ENTRY
Some of these sanity checks may slow down kernel entries and
exits or otherwise impact performance.
- This is currently used to help test NMI code.
-
If unsure, say N.
config DEBUG_NMI_SELFTEST
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6d078b89a5e8..07b4056af8a8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -447,6 +447,59 @@ ENTRY(irq_entries_start)
.endr
END(irq_entries_start)
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+ pushfq
+ testl $X86_EFLAGS_IF, (%rsp)
+ jz .Lokay_\@
+ ud2
+.Lokay_\@:
+ addq $8, %rsp
+#endif
+.endm
+
+/*
+ * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+ */
+.macro ENTER_IRQ_STACK old_rsp
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ movq %rsp, \old_rsp
+ incl PER_CPU_VAR(irq_count)
+
+ /*
+ * Right now, if we just incremented irq_count to zero, we've
+ * claimed the IRQ stack but we haven't switched to it yet.
+ *
+ * If anything is added that can interrupt us here without using IST,
+ * it must be *extremely* careful to limit its stack usage. This
+ * could include kprobes and a hypothetical future IST-less #DB
+ * handler.
+ */
+
+ cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
+ pushq \old_rsp
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK.
+ */
+.macro LEAVE_IRQ_STACK
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ /* We need to be off the IRQ stack before decrementing irq_count. */
+ popq %rsp
+
+ /*
+ * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
+ * the irq stack but we're not on it.
+ */
+
+ decl PER_CPU_VAR(irq_count)
+.endm
+
/*
* Interrupt entry/exit.
*
@@ -485,17 +538,7 @@ END(irq_entries_start)
CALL_enter_from_user_mode
1:
- /*
- * Save previous stack pointer, optionally switch to interrupt stack.
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
- movq %rsp, %rdi
- incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rdi
+ ENTER_IRQ_STACK old_rsp=%rdi
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -515,10 +558,8 @@ common_interrupt:
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- decl PER_CPU_VAR(irq_count)
- /* Restore saved previous stack */
- popq %rsp
+ LEAVE_IRQ_STACK
testb $3, CS(%rsp)
jz retint_kernel
@@ -892,12 +933,10 @@ bad_gs:
ENTRY(do_softirq_own_stack)
pushq %rbp
mov %rsp, %rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr), %rsp
- push %rbp /* frame pointer backlink */
+ ENTER_IRQ_STACK old_rsp=%r11
call __do_softirq
+ LEAVE_IRQ_STACK
leaveq
- decl PER_CPU_VAR(irq_count)
ret
END(do_softirq_own_stack)
@@ -924,13 +963,11 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
* see the correct pointer to the pt_regs
*/
movq %rdi, %rsp /* we don't return, adjust the stack frame */
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp, %rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rbp /* frame pointer backlink */
+
+ ENTER_IRQ_STACK old_rsp=%r10
call xen_evtchn_do_upcall
- popq %rsp
- decl PER_CPU_VAR(irq_count)
+ LEAVE_IRQ_STACK
+
#ifndef CONFIG_PREEMPT
call xen_maybe_preempt_hcall
#endif
--
2.14.2

View File

@ -0,0 +1,94 @@
From 63463bcffe420067411ad3d4d01b79c872fffc3a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 11 Jul 2017 10:33:39 -0500
Subject: [PATCH 019/231] x86/entry/64: Initialize the top of the IRQ stack
before switching stacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The OOPS unwinder wants the word at the top of the IRQ stack to
point back to the previous stack at all times when the IRQ stack
is in use. There's currently a one-instruction window in ENTER_IRQ_STACK
during which this isn't the case. Fix it by writing the old RSP to the
top of the IRQ stack before jumping.
This currently writes the pointer to the stack twice, which is a bit
ugly. We could get rid of this by replacing irq_stack_ptr with
irq_stack_ptr_minus_eight (better name welcome). OTOH, there may be
all kinds of odd microarchitectural considerations in play that
affect performance by a few cycles here.
Reported-by: Mike Galbraith <efault@gmx.de>
Reported-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/aae7e79e49914808440ad5310ace138ced2179ca.1499786555.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 2995590964da93e1fd9a91550f9c9d9fab28f160)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit a753ff654dfd07a7f8d6f39a27126589eac7e55f)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 07b4056af8a8..184b70712545 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -469,6 +469,7 @@ END(irq_entries_start)
DEBUG_ENTRY_ASSERT_IRQS_OFF
movq %rsp, \old_rsp
incl PER_CPU_VAR(irq_count)
+ jnz .Lirq_stack_push_old_rsp_\@
/*
* Right now, if we just incremented irq_count to zero, we've
@@ -478,9 +479,30 @@ END(irq_entries_start)
* it must be *extremely* careful to limit its stack usage. This
* could include kprobes and a hypothetical future IST-less #DB
* handler.
+ *
+ * The OOPS unwinder relies on the word at the top of the IRQ
+ * stack linking back to the previous RSP for the entire time we're
+ * on the IRQ stack. For this to work reliably, we need to write
+ * it before we actually move ourselves to the IRQ stack.
+ */
+
+ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
+ movq PER_CPU_VAR(irq_stack_ptr), %rsp
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * If the first movq above becomes wrong due to IRQ stack layout
+ * changes, the only way we'll notice is if we try to unwind right
+ * here. Assert that we set up the stack right to catch this type
+ * of bug quickly.
*/
+ cmpq -8(%rsp), \old_rsp
+ je .Lirq_stack_okay\@
+ ud2
+ .Lirq_stack_okay\@:
+#endif
- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
+.Lirq_stack_push_old_rsp_\@:
pushq \old_rsp
.endm
--
2.14.2

View File

@ -0,0 +1,463 @@
From 884fcb9e8befe21a962d95664b1e60377284636a Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 11 Jul 2017 10:33:44 -0500
Subject: [PATCH 020/231] x86/entry/64: Add unwind hint annotations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Add unwind hint annotations to entry_64.S. This will enable the ORC
unwinder to unwind through any location in the entry code including
syscalls, interrupts, and exceptions.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/b9f6d478aadf68ba57c739dcfac34ec0dc021c4c.1499786555.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 8c1f75587a18ca032da8f6376d1ed882d7095289)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit a8448e6971c1e71b22c651131d14f8be76e6d399)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/Makefile | 1 -
arch/x86/entry/calling.h | 5 ++++
arch/x86/entry/entry_64.S | 71 ++++++++++++++++++++++++++++++++++++++++-------
3 files changed, 66 insertions(+), 11 deletions(-)
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 9976fcecd17e..af28a8a24366 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,7 +2,6 @@
# Makefile for the x86 low level entry code
#
-OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..640aafebdc00 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,4 +1,5 @@
#include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
/*
@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
movq %rdx, 12*8+\offset(%rsp)
movq %rsi, 13*8+\offset(%rsp)
movq %rdi, 14*8+\offset(%rsp)
+ UNWIND_HINT_REGS offset=\offset extra=0
.endm
.macro SAVE_C_REGS offset=0
SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with
movq %r12, 3*8+\offset(%rsp)
movq %rbp, 4*8+\offset(%rsp)
movq %rbx, 5*8+\offset(%rsp)
+ UNWIND_HINT_REGS offset=\offset
.endm
.macro RESTORE_EXTRA_REGS offset=0
@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with
movq 3*8+\offset(%rsp), %r12
movq 4*8+\offset(%rsp), %rbp
movq 5*8+\offset(%rsp), %rbx
+ UNWIND_HINT_REGS offset=\offset extra=0
.endm
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
.endif
movq 13*8(%rsp), %rsi
movq 14*8(%rsp), %rdi
+ UNWIND_HINT_IRET_REGS offset=16*8
.endm
.macro RESTORE_C_REGS
RESTORE_C_REGS_HELPER 1,1,1,1,1
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 184b70712545..64b233ab7cad 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
+#include <asm/frame.h>
#include <linux/err.h>
.code64
@@ -43,9 +44,10 @@
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
+ UNWIND_HINT_EMPTY
swapgs
sysretq
-ENDPROC(native_usergs_sysret64)
+END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
.macro TRACE_IRQS_IRETQ
@@ -134,6 +136,7 @@ ENDPROC(native_usergs_sysret64)
*/
ENTRY(entry_SYSCALL_64)
+ UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -169,6 +172,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+ UNWIND_HINT_REGS extra=0
/*
* If we need to do entry work or if we guess we'll need to do
@@ -223,6 +227,7 @@ entry_SYSCALL_64_fastpath:
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
USERGS_SYSRET64
1:
@@ -316,6 +321,7 @@ syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
+ UNWIND_HINT_EMPTY
USERGS_SYSRET64
opportunistic_sysret_failed:
@@ -343,6 +349,7 @@ ENTRY(stub_ptregs_64)
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
popq %rax
+ UNWIND_HINT_REGS extra=0
jmp entry_SYSCALL64_slow_path
1:
@@ -351,6 +358,7 @@ END(stub_ptregs_64)
.macro ptregs_stub func
ENTRY(ptregs_\func)
+ UNWIND_HINT_FUNC
leaq \func(%rip), %rax
jmp stub_ptregs_64
END(ptregs_\func)
@@ -367,6 +375,7 @@ END(ptregs_\func)
* %rsi: next task
*/
ENTRY(__switch_to_asm)
+ UNWIND_HINT_FUNC
/*
* Save callee-saved registers
* This must match the order in inactive_task_frame
@@ -406,6 +415,7 @@ END(__switch_to_asm)
* r12: kernel thread arg
*/
ENTRY(ret_from_fork)
+ UNWIND_HINT_EMPTY
movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
@@ -413,6 +423,7 @@ ENTRY(ret_from_fork)
jnz 1f /* kernel threads are uncommon */
2:
+ UNWIND_HINT_REGS
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
@@ -440,10 +451,11 @@ END(ret_from_fork)
ENTRY(irq_entries_start)
vector=FIRST_EXTERNAL_VECTOR
.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ UNWIND_HINT_IRET_REGS
pushq $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
jmp common_interrupt
.align 8
+ vector=vector+1
.endr
END(irq_entries_start)
@@ -465,9 +477,14 @@ END(irq_entries_start)
*
* The invariant is that, if irq_count != -1, then the IRQ stack is in use.
*/
-.macro ENTER_IRQ_STACK old_rsp
+.macro ENTER_IRQ_STACK regs=1 old_rsp
DEBUG_ENTRY_ASSERT_IRQS_OFF
movq %rsp, \old_rsp
+
+ .if \regs
+ UNWIND_HINT_REGS base=\old_rsp
+ .endif
+
incl PER_CPU_VAR(irq_count)
jnz .Lirq_stack_push_old_rsp_\@
@@ -504,16 +521,24 @@ END(irq_entries_start)
.Lirq_stack_push_old_rsp_\@:
pushq \old_rsp
+
+ .if \regs
+ UNWIND_HINT_REGS indirect=1
+ .endif
.endm
/*
* Undoes ENTER_IRQ_STACK.
*/
-.macro LEAVE_IRQ_STACK
+.macro LEAVE_IRQ_STACK regs=1
DEBUG_ENTRY_ASSERT_IRQS_OFF
/* We need to be off the IRQ stack before decrementing irq_count. */
popq %rsp
+ .if \regs
+ UNWIND_HINT_REGS
+ .endif
+
/*
* As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
* the irq stack but we're not on it.
@@ -624,6 +649,7 @@ restore_c_regs_and_iret:
INTERRUPT_RETURN
ENTRY(native_iret)
+ UNWIND_HINT_IRET_REGS
/*
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
@@ -696,6 +722,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
movq %rax, %rsp
+ UNWIND_HINT_IRET_REGS offset=8
/*
* At this point, we cannot write to the stack any more, but we can
@@ -717,6 +744,7 @@ END(common_interrupt)
*/
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS
ASM_CLAC
pushq $~(\num)
.Lcommon_\sym:
@@ -803,6 +831,8 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS offset=8
+
/* Sanity check */
.if \shift_ist != -1 && \paranoid == 0
.error "using shift_ist requires paranoid=1"
@@ -826,6 +856,7 @@ ENTRY(\sym)
.else
call error_entry
.endif
+ UNWIND_HINT_REGS
/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
.if \paranoid
@@ -923,6 +954,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
* edi: new selector
*/
ENTRY(native_load_gs_index)
+ FRAME_BEGIN
pushfq
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
@@ -931,8 +963,9 @@ ENTRY(native_load_gs_index)
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
SWAPGS
popfq
+ FRAME_END
ret
-END(native_load_gs_index)
+ENDPROC(native_load_gs_index)
EXPORT_SYMBOL(native_load_gs_index)
_ASM_EXTABLE(.Lgs_change, bad_gs)
@@ -955,12 +988,12 @@ bad_gs:
ENTRY(do_softirq_own_stack)
pushq %rbp
mov %rsp, %rbp
- ENTER_IRQ_STACK old_rsp=%r11
+ ENTER_IRQ_STACK regs=0 old_rsp=%r11
call __do_softirq
- LEAVE_IRQ_STACK
+ LEAVE_IRQ_STACK regs=0
leaveq
ret
-END(do_softirq_own_stack)
+ENDPROC(do_softirq_own_stack)
#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
@@ -984,7 +1017,9 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
* see the correct pointer to the pt_regs
*/
+ UNWIND_HINT_FUNC
movq %rdi, %rsp /* we don't return, adjust the stack frame */
+ UNWIND_HINT_REGS
ENTER_IRQ_STACK old_rsp=%r10
call xen_evtchn_do_upcall
@@ -1010,6 +1045,7 @@ END(xen_do_hypervisor_callback)
* with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
+ UNWIND_HINT_EMPTY
movl %ds, %ecx
cmpw %cx, 0x10(%rsp)
jne 1f
@@ -1029,11 +1065,13 @@ ENTRY(xen_failsafe_callback)
pushq $0 /* RIP */
pushq %r11
pushq %rcx
+ UNWIND_HINT_IRET_REGS offset=8
jmp general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp), %rcx
movq 8(%rsp), %r11
addq $0x30, %rsp
+ UNWIND_HINT_IRET_REGS
pushq $-1 /* orig_ax = -1 => not a system call */
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
@@ -1079,6 +1117,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
+ UNWIND_HINT_FUNC
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1106,6 +1145,7 @@ END(paranoid_entry)
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
ENTRY(paranoid_exit)
+ UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
@@ -1127,6 +1167,7 @@ END(paranoid_exit)
* Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
+ UNWIND_HINT_FUNC
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1211,6 +1252,7 @@ END(error_entry)
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
*/
ENTRY(error_exit)
+ UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
testl %ebx, %ebx
@@ -1220,6 +1262,7 @@ END(error_exit)
/* Runs on exception stack */
ENTRY(nmi)
+ UNWIND_HINT_IRET_REGS
/*
* Fix up the exception frame if we're on Xen.
* PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
@@ -1293,11 +1336,13 @@ ENTRY(nmi)
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
pushq 3*8(%rdx) /* pt_regs->flags */
pushq 2*8(%rdx) /* pt_regs->cs */
pushq 1*8(%rdx) /* pt_regs->rip */
+ UNWIND_HINT_IRET_REGS
pushq $-1 /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -1314,6 +1359,7 @@ ENTRY(nmi)
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
ENCODE_FRAME_POINTER
/*
@@ -1468,6 +1514,7 @@ first_nmi:
.rept 5
pushq 11*8(%rsp)
.endr
+ UNWIND_HINT_IRET_REGS
/* Everything up to here is safe from nested NMIs */
@@ -1483,6 +1530,7 @@ first_nmi:
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
INTERRUPT_RETURN /* continues at repeat_nmi below */
+ UNWIND_HINT_IRET_REGS
1:
#endif
@@ -1532,6 +1580,7 @@ end_repeat_nmi:
* exceptions might do.
*/
call paranoid_entry
+ UNWIND_HINT_REGS
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
@@ -1569,17 +1618,19 @@ nmi_restore:
END(nmi)
ENTRY(ignore_sysret)
+ UNWIND_HINT_EMPTY
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
ENTRY(rewind_stack_do_exit)
+ UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
- leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+ leaq -PTREGS_SIZE(%rax), %rsp
+ UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
call do_exit
-1: jmp 1b
END(rewind_stack_do_exit)
--
2.14.2

View File

@ -0,0 +1,70 @@
From aa2a95a84f2cbd92b10887f3c99c7858fae9e7e4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:29 -0500
Subject: [PATCH 021/231] xen/x86: Remove SME feature in PV guests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Xen does not currently support SME for PV guests. Clear the SME CPU
capability in order to avoid any ambiguity.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: <xen-devel@lists.xen.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/3b605622a9fae5e588e5a13967120a18ec18071b.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit f2f931c6819467af5260a21c59fb787ce2863f92)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 8370907399392a637a2e51b4db3368fb594db3a6)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/xen/enlighten_pv.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 290bc5ac9852..df1921751aa5 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -263,6 +263,7 @@ static void __init xen_init_capabilities(void)
setup_clear_cpu_cap(X86_FEATURE_MTRR);
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ setup_clear_cpu_cap(X86_FEATURE_SME);
/*
* Xen PV would need some work to support PCID: CR3 handling as well
--
2.14.2

View File

@ -0,0 +1,152 @@
From c63a9850ba744d9871b4ca2dad11588db5d670a2 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 7 Aug 2017 20:59:21 -0700
Subject: [PATCH 022/231] x86/xen/64: Rearrange the SYSCALL entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Xen's raw SYSCALL entries are much less weird than native. Rather
than fudging them to look like native entries, use the Xen-provided
stack frame directly.
This lets us eliminate entry_SYSCALL_64_after_swapgs and two uses of
the SWAPGS_UNSAFE_STACK paravirt hook. The SYSENTER code would
benefit from similar treatment.
This makes one change to the native code path: the compat
instruction that clears the high 32 bits of %rax is moved slightly
later. I'd be surprised if this affects performance at all.
Tested-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/7c88ed36805d36841ab03ec3b48b4122c4418d71.1502164668.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 8a9949bc71a71b3dd633255ebe8f8869b1f73474)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit b8cec41ee5f30df5032cfe8c86103f7d92a89590)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 9 ++-------
arch/x86/entry/entry_64_compat.S | 7 +++----
arch/x86/xen/xen-asm_64.S | 23 +++++++++--------------
3 files changed, 14 insertions(+), 25 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 64b233ab7cad..4dbb336a1fdd 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,14 +142,8 @@ ENTRY(entry_SYSCALL_64)
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-GLOBAL(entry_SYSCALL_64_after_swapgs)
+ swapgs
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -161,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..5314d7b8e5ad 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
*/
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ swapgs
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- /* Zero-extending 32-bit regs, do not remove */
- movl %eax, %eax
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+ movl %eax, %eax /* discard orig_ax high bits */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..a8a4f4c460a6 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -82,34 +82,29 @@ RELOC(xen_sysret64, 1b+1)
* rip
* r11
* rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
-.macro undo_xen_syscall
- mov 0*8(%rsp), %rcx
- mov 1*8(%rsp), %r11
- mov 5*8(%rsp), %rsp
-.endm
-
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
- undo_xen_syscall
- jmp entry_SYSCALL_64_after_swapgs
+ popq %rcx
+ popq %r11
+ jmp entry_SYSCALL_64_after_hwframe
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
- undo_xen_syscall
- jmp entry_SYSCALL_compat
+ popq %rcx
+ popq %r11
+ jmp entry_SYSCALL_compat_after_hwframe
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
- undo_xen_syscall
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
jmp entry_SYSENTER_compat
ENDPROC(xen_sysenter_target)
--
2.14.2

View File

@ -0,0 +1,223 @@
From 050fcd1a748bd2f17b540d0147c8a4f3067653ee Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 3 Aug 2017 11:38:21 +0900
Subject: [PATCH 023/231] irq: Make the irqentry text section unconditional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Generate irqentry and softirqentry text sections without
any Kconfig dependencies. This will add extra sections, but
there should be no performace impact.
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David S . Miller <davem@davemloft.net>
Cc: Francis Deslauriers <francis.deslauriers@efficios.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: linux-arch@vger.kernel.org
Cc: linux-cris-kernel@axis.com
Cc: mathieu.desnoyers@efficios.com
Link: http://lkml.kernel.org/r/150172789110.27216.3955739126693102122.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 229a71860547ec856b156179a9c6bef2de426f66)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 8fd2f68cc93ae772cfddf4151d13448ff17d0229)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/arm/include/asm/traps.h | 7 -------
arch/arm64/include/asm/traps.h | 7 -------
include/asm-generic/sections.h | 4 ++++
include/asm-generic/vmlinux.lds.h | 8 --------
include/linux/interrupt.h | 14 +-------------
arch/x86/kernel/unwind_frame.c | 2 --
arch/x86/entry/entry_64.S | 9 ++-------
7 files changed, 7 insertions(+), 44 deletions(-)
diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h
index f555bb3664dc..683d9230984a 100644
--- a/arch/arm/include/asm/traps.h
+++ b/arch/arm/include/asm/traps.h
@@ -18,7 +18,6 @@ struct undef_hook {
void register_undef_hook(struct undef_hook *hook);
void unregister_undef_hook(struct undef_hook *hook);
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static inline int __in_irqentry_text(unsigned long ptr)
{
extern char __irqentry_text_start[];
@@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr)
return ptr >= (unsigned long)&__irqentry_text_start &&
ptr < (unsigned long)&__irqentry_text_end;
}
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
- return 0;
-}
-#endif
static inline int in_exception_text(unsigned long ptr)
{
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 02e9035b0685..47a9066f7c86 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook);
void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr);
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static inline int __in_irqentry_text(unsigned long ptr)
{
return ptr >= (unsigned long)&__irqentry_text_start &&
ptr < (unsigned long)&__irqentry_text_end;
}
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
- return 0;
-}
-#endif
static inline int in_exception_text(unsigned long ptr)
{
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 532372c6cf15..e5da44eddd2f 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -27,6 +27,8 @@
* __kprobes_text_start, __kprobes_text_end
* __entry_text_start, __entry_text_end
* __ctors_start, __ctors_end
+ * __irqentry_text_start, __irqentry_text_end
+ * __softirqentry_text_start, __softirqentry_text_end
*/
extern char _text[], _stext[], _etext[];
extern char _data[], _sdata[], _edata[];
@@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
extern char __kprobes_text_start[], __kprobes_text_end[];
extern char __entry_text_start[], __entry_text_end[];
extern char __start_rodata[], __end_rodata[];
+extern char __irqentry_text_start[], __irqentry_text_end[];
+extern char __softirqentry_text_start[], __softirqentry_text_end[];
/* Start and end of .ctors section - used for constructor calls. */
extern char __ctors_start[], __ctors_end[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 9623d78f8494..e7e955d4ab9e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -497,25 +497,17 @@
*(.entry.text) \
VMLINUX_SYMBOL(__entry_text_end) = .;
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
#define IRQENTRY_TEXT \
ALIGN_FUNCTION(); \
VMLINUX_SYMBOL(__irqentry_text_start) = .; \
*(.irqentry.text) \
VMLINUX_SYMBOL(__irqentry_text_end) = .;
-#else
-#define IRQENTRY_TEXT
-#endif
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
#define SOFTIRQENTRY_TEXT \
ALIGN_FUNCTION(); \
VMLINUX_SYMBOL(__softirqentry_text_start) = .; \
*(.softirqentry.text) \
VMLINUX_SYMBOL(__softirqentry_text_end) = .;
-#else
-#define SOFTIRQENTRY_TEXT
-#endif
/* Section used for early init (in .S files) */
#define HEAD_TEXT *(.head.text)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a2fddddb0d60..59ba11661b6e 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
#include <linux/atomic.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
+#include <asm/sections.h>
/*
* These correspond to the IORESOURCE_IRQ_* defines in
@@ -726,7 +727,6 @@ extern int early_irq_init(void);
extern int arch_probe_nr_irqs(void);
extern int arch_early_irq_init(void);
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
/*
* We want to know which function is an entrypoint of a hardirq or a softirq.
*/
@@ -734,16 +734,4 @@ extern int arch_early_irq_init(void);
#define __softirq_entry \
__attribute__((__section__(".softirqentry.text")))
-/* Limits of hardirq entrypoints */
-extern char __irqentry_text_start[];
-extern char __irqentry_text_end[];
-/* Limits of softirq entrypoints */
-extern char __softirqentry_text_start[];
-extern char __softirqentry_text_end[];
-
-#else
-#define __irq_entry
-#define __softirq_entry
-#endif
-
#endif
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index b9389d72b2f7..c29e5bc7e9c9 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -91,10 +91,8 @@ static bool in_entry_code(unsigned long ip)
if (addr >= __entry_text_start && addr < __entry_text_end)
return true;
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
return true;
-#endif
return false;
}
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4dbb336a1fdd..ca0b250eefc4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -761,13 +761,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
#endif
/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
-# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-# define POP_SECTION_IRQENTRY .popsection
-#else
-# define PUSH_SECTION_IRQENTRY
-# define POP_SECTION_IRQENTRY
-#endif
+#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY .popsection
.macro apicinterrupt num sym do_sym
PUSH_SECTION_IRQENTRY
--
2.14.2

View File

@ -0,0 +1,84 @@
From 2b0794bbebac81a539dfd405273d61a8a16531d2 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 14 Aug 2017 22:36:19 -0700
Subject: [PATCH 024/231] x86/xen/64: Fix the reported SS and CS in SYSCALL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
When I cleaned up the Xen SYSCALL entries, I inadvertently changed
the reported segment registers. Before my patch, regs->ss was
__USER(32)_DS and regs->cs was __USER(32)_CS. After the patch, they
are FLAT_USER_CS/DS(32).
This had a couple unfortunate effects. It confused the
opportunistic fast return logic. It also significantly increased
the risk of triggering a nasty glibc bug:
https://sourceware.org/bugzilla/show_bug.cgi?id=21269
Update the Xen entry code to change it back.
Reported-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: xen-devel@lists.xenproject.org
Fixes: 8a9949bc71a7 ("x86/xen/64: Rearrange the SYSCALL entries")
Link: http://lkml.kernel.org/r/daba8351ea2764bb30272296ab9ce08a81bd8264.1502775273.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit fa2016a8e7d846b306e431646d250500e1da0c33)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 69a6ef3aeb274efe86fd74771830354f303ccc2f)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/xen/xen-asm_64.S | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index a8a4f4c460a6..c5fee2680abc 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -88,6 +88,15 @@ RELOC(xen_sysret64, 1b+1)
ENTRY(xen_syscall_target)
popq %rcx
popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER_DS and __USER_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER_DS, 4*8(%rsp)
+ movq $__USER_CS, 1*8(%rsp)
+
jmp entry_SYSCALL_64_after_hwframe
ENDPROC(xen_syscall_target)
@@ -97,6 +106,15 @@ ENDPROC(xen_syscall_target)
ENTRY(xen_syscall32_target)
popq %rcx
popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
jmp entry_SYSCALL_compat_after_hwframe
ENDPROC(xen_syscall32_target)
--
2.14.2

View File

@ -0,0 +1,360 @@
From e61e24c7ee0d773230646650659c34ffc5316520 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 16 Aug 2017 19:31:56 +0200
Subject: [PATCH 025/231] x86/paravirt/xen: Remove xen_patch()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Xen's paravirt patch function xen_patch() does some special casing for
irq_ops functions to apply relocations when those functions can be
patched inline instead of calls.
Unfortunately none of the special case function replacements is small
enough to be patched inline, so the special case never applies.
As xen_patch() will call paravirt_patch_default() in all cases it can
be just dropped. xen-asm.h doesn't seem necessary without xen_patch()
as the only thing left in it would be the definition of XEN_EFLAGS_NMI
used only once. So move that definition and remove xen-asm.h.
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: boris.ostrovsky@oracle.com
Cc: lguest@lists.ozlabs.org
Cc: rusty@rustcorp.com.au
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/20170816173157.8633-2-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit edcb5cf84f05e5d2e2af25422a72ccde359fcca9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit c96c9c712136a9e24a7aaf0aac4c149eee01bd8e)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/xen/xen-asm.h | 12 ---------
arch/x86/xen/xen-ops.h | 15 +++---------
arch/x86/xen/enlighten_pv.c | 59 +--------------------------------------------
arch/x86/xen/xen-asm.S | 26 +++++---------------
arch/x86/xen/xen-asm_32.S | 27 ++++-----------------
arch/x86/xen/xen-asm_64.S | 20 ++++-----------
6 files changed, 21 insertions(+), 138 deletions(-)
delete mode 100644 arch/x86/xen/xen-asm.h
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
deleted file mode 100644
index 465276467a47..000000000000
--- a/arch/x86/xen/xen-asm.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _XEN_XEN_ASM_H
-#define _XEN_XEN_ASM_H
-
-#include <linux/linkage.h>
-
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-#endif
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 0d5004477db6..70301ac0d414 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -129,17 +129,10 @@ static inline void __init xen_efi_init(void)
}
#endif
-/* Declare an asm function, along with symbols needed to make it
- inlineable */
-#define DECL_ASM(ret, name, ...) \
- __visible ret name(__VA_ARGS__); \
- extern char name##_end[] __visible; \
- extern char name##_reloc[] __visible
-
-DECL_ASM(void, xen_irq_enable_direct, void);
-DECL_ASM(void, xen_irq_disable_direct, void);
-DECL_ASM(unsigned long, xen_save_fl_direct, void);
-DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+__visible void xen_irq_enable_direct(void);
+__visible void xen_irq_disable_direct(void);
+__visible unsigned long xen_save_fl_direct(void);
+__visible void xen_restore_fl_direct(unsigned long);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index df1921751aa5..6c279c8f0a0e 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -988,59 +988,6 @@ void __ref xen_setup_vcpu_info_placement(void)
}
}
-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
- unsigned long addr, unsigned len)
-{
- char *start, *end, *reloc;
- unsigned ret;
-
- start = end = reloc = NULL;
-
-#define SITE(op, x) \
- case PARAVIRT_PATCH(op.x): \
- if (xen_have_vcpu_info_placement) { \
- start = (char *)xen_##x##_direct; \
- end = xen_##x##_direct_end; \
- reloc = xen_##x##_direct_reloc; \
- } \
- goto patch_site
-
- switch (type) {
- SITE(pv_irq_ops, irq_enable);
- SITE(pv_irq_ops, irq_disable);
- SITE(pv_irq_ops, save_fl);
- SITE(pv_irq_ops, restore_fl);
-#undef SITE
-
- patch_site:
- if (start == NULL || (end-start) > len)
- goto default_patch;
-
- ret = paravirt_patch_insns(insnbuf, len, start, end);
-
- /* Note: because reloc is assigned from something that
- appears to be an array, gcc assumes it's non-null,
- but doesn't know its relationship with start and
- end. */
- if (reloc > start && reloc < end) {
- int reloc_off = reloc - start;
- long *relocp = (long *)(insnbuf + reloc_off);
- long delta = start - (char *)addr;
-
- *relocp += delta;
- }
- break;
-
- default_patch:
- default:
- ret = paravirt_patch_default(type, clobbers, insnbuf,
- addr, len);
- break;
- }
-
- return ret;
-}
-
static const struct pv_info xen_info __initconst = {
.shared_kernel_pmd = 0,
@@ -1050,10 +997,6 @@ static const struct pv_info xen_info __initconst = {
.name = "Xen",
};
-static const struct pv_init_ops xen_init_ops __initconst = {
- .patch = xen_patch,
-};
-
static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
@@ -1251,7 +1194,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_init_ops = xen_init_ops;
+ pv_init_ops.patch = paravirt_patch_default;
pv_cpu_ops = xen_cpu_ops;
x86_platform.get_nmi_reason = xen_get_nmi_reason;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index eff224df813f..dcd31fa39b5d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in percpu data) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/asm-offsets.h>
@@ -16,7 +10,7 @@
#include <asm/processor-flags.h>
#include <asm/frame.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
/*
* Enable events. This clears the event mask and tests the pending
@@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
-2: call check_events
+ call check_events
1:
-ENDPATCH(xen_irq_enable_direct)
FRAME_END
ret
ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
/*
@@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct)
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
+ENDPROC(xen_irq_disable_direct)
/*
* (xen_)save_fl is used to get the current interrupt enable status.
@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
-ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
/*
@@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct)
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jnz 1f
-2: call check_events
+ call check_events
1:
-ENDPATCH(xen_restore_fl_direct)
FRAME_END
ret
ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
/*
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index feb6d40a0860..1200e262a116 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/thread_info.h>
@@ -18,21 +12,10 @@
#include <xen/interface/xen.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
-/*
- * Force an event check by making a hypercall, but preserve regs
- * before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call xen_force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
/*
* This is run where a normal iret would be run, with the same stack setup:
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c5fee2680abc..3a3b6a211584 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,14 +1,8 @@
/*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining. The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
*
* We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
*/
#include <asm/errno.h>
@@ -20,7 +14,7 @@
#include <xen/interface/xen.h>
-#include "xen-asm.h"
+#include <linux/linkage.h>
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp), %rcx
@@ -46,9 +40,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
*/
ENTRY(xen_iret)
pushq $0
-1: jmp hypercall_iret
-ENDPATCH(xen_iret)
-RELOC(xen_iret, 1b+1)
+ jmp hypercall_iret
ENTRY(xen_sysret64)
/*
@@ -65,9 +57,7 @@ ENTRY(xen_sysret64)
pushq %rcx
pushq $VGCF_in_syscall
-1: jmp hypercall_iret
-ENDPATCH(xen_sysret64)
-RELOC(xen_sysret64, 1b+1)
+ jmp hypercall_iret
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
--
2.14.2

View File

@ -0,0 +1,218 @@
From e61177a6feca143d431be190d4758bda23f6174d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 Aug 2017 08:47:22 +0200
Subject: [PATCH 026/231] x86/traps: Simplify pagefault tracing logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Make use of the new irqvector tracing static key and remove the duplicated
trace_do_pagefault() implementation.
If irq vector tracing is disabled, then the overhead of this is a single
NOP5, which is a reasonable tradeoff to avoid duplicated code and the
unholy macro mess.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170828064956.672965407@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 11a7ffb01703c3bbb1e9b968893f4487a1b0b5a8)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 8478bb5608747fd64c9fd4a2f5422fb4af756a50)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/traps.h | 10 +--------
arch/x86/kernel/kvm.c | 2 +-
arch/x86/mm/fault.c | 49 ++++++++++++--------------------------------
arch/x86/entry/entry_32.S | 8 --------
arch/x86/entry/entry_64.S | 13 +-----------
5 files changed, 16 insertions(+), 66 deletions(-)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 01fd0a7f48cd..b4f322d6c95f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -39,7 +39,6 @@ asmlinkage void machine_check(void);
asmlinkage void simd_coprocessor_error(void);
#ifdef CONFIG_TRACING
-asmlinkage void trace_page_fault(void);
#define trace_stack_segment stack_segment
#define trace_divide_error divide_error
#define trace_bounds bounds
@@ -54,6 +53,7 @@ asmlinkage void trace_page_fault(void);
#define trace_alignment_check alignment_check
#define trace_simd_coprocessor_error simd_coprocessor_error
#define trace_async_page_fault async_page_fault
+#define trace_page_fault page_fault
#endif
dotraplinkage void do_divide_error(struct pt_regs *, long);
@@ -74,14 +74,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
-#ifdef CONFIG_TRACING
-dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
-#else
-static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
-{
- do_page_fault(regs, error);
-}
-#endif
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
dotraplinkage void do_alignment_check(struct pt_regs *, long);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e5e4306e4546..9e3798b00e40 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -270,7 +270,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
switch (kvm_read_and_reset_pf_reason()) {
default:
- trace_do_page_fault(regs, error_code);
+ do_page_fault(regs, error_code);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 955be01dd9cc..4ee9eb916826 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1253,10 +1253,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
- *
- * This function must have noinline because both callers
- * {,trace_}do_page_fault() have notrace on. Having this an actual function
- * guarantees there's a function trace entry.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
@@ -1491,27 +1487,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
}
NOKPROBE_SYMBOL(__do_page_fault);
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
-{
- unsigned long address = read_cr2(); /* Get the faulting address */
- enum ctx_state prev_state;
-
- /*
- * We must have this function tagged with __kprobes, notrace and call
- * read_cr2() before calling anything else. To avoid calling any kind
- * of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contain all sorts of tracepoints.
- */
-
- prev_state = exception_enter();
- __do_page_fault(regs, error_code, address);
- exception_exit(prev_state);
-}
-NOKPROBE_SYMBOL(do_page_fault);
-
-#ifdef CONFIG_TRACING
static nokprobe_inline void
trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
@@ -1522,22 +1497,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
trace_page_fault_kernel(address, regs, error_code);
}
+/*
+ * We must have this function blacklisted from kprobes, tagged with notrace
+ * and call read_cr2() before calling anything else. To avoid calling any
+ * kind of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contains all sorts of tracepoints.
+ */
dotraplinkage void notrace
-trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
- /*
- * The exception_enter and tracepoint processing could
- * trigger another page faults (user space callchain
- * reading) and destroy the original cr2 value, so read
- * the faulting address now.
- */
- unsigned long address = read_cr2();
+ unsigned long address = read_cr2(); /* Get the faulting address */
enum ctx_state prev_state;
prev_state = exception_enter();
- trace_page_fault_entries(address, regs, error_code);
+ if (trace_irqvectors_enabled())
+ trace_page_fault_entries(address, regs, error_code);
+
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
-NOKPROBE_SYMBOL(trace_do_page_fault);
-#endif /* CONFIG_TRACING */
+NOKPROBE_SYMBOL(do_page_fault);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 48ef7bb32c42..0092da1c056f 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -891,14 +891,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
#endif /* CONFIG_HYPERV */
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
- ASM_CLAC
- pushl $trace_do_page_fault
- jmp common_exception
-END(trace_page_fault)
-#endif
-
ENTRY(page_fault)
ASM_CLAC
pushl $do_page_fault
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ca0b250eefc4..dfabcbf8e813 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -913,17 +913,6 @@ ENTRY(\sym)
END(\sym)
.endm
-#ifdef CONFIG_TRACING
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#else
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#endif
-
idtentry divide_error do_divide_error has_error_code=0
idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
@@ -1091,7 +1080,7 @@ idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif
idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
+idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault do_async_page_fault has_error_code=1
--
2.14.2

View File

@ -0,0 +1,263 @@
From 12f71c3ef98c53a158abec93ef40cd15c9120284 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 Aug 2017 08:47:37 +0200
Subject: [PATCH 027/231] x86/idt: Unify gate_struct handling for 32/64-bit
kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The first 32 bits of gate struct are the same for 32 and 64 bit kernels.
The 32-bit version uses desc_struct and no designated data structure,
so we need different accessors for 32 and 64 bit kernels.
Aside of that the macros which are necessary to build the 32-bit
gate descriptor are horrible to read.
Unify the gate structs and switch all code fiddling with it over.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170828064957.861974317@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 64b163fab684e3de47aa8db6cc08ae7d2e194373)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 587719b1926757eb7531e0631d63fb93cd60d0d3)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/desc.h | 45 ++++++++++++++-----------------
arch/x86/include/asm/desc_defs.h | 57 ++++++++++++++++++++++++++--------------
arch/x86/kvm/vmx.c | 2 +-
arch/x86/xen/enlighten_pv.c | 12 ++++-----
4 files changed, 63 insertions(+), 53 deletions(-)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d0a21b12dd58..57e502a4e92f 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -83,33 +83,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
}
-#ifdef CONFIG_X86_64
-
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
{
- gate->offset_low = PTR_LOW(func);
+ gate->offset_low = (u16) func;
+ gate->bits.p = 1;
+ gate->bits.dpl = dpl;
+ gate->bits.zero = 0;
+ gate->bits.type = type;
+ gate->offset_middle = (u16) (func >> 16);
+#ifdef CONFIG_X86_64
gate->segment = __KERNEL_CS;
- gate->ist = ist;
- gate->p = 1;
- gate->dpl = dpl;
- gate->zero0 = 0;
- gate->zero1 = 0;
- gate->type = type;
- gate->offset_middle = PTR_MIDDLE(func);
- gate->offset_high = PTR_HIGH(func);
-}
-
+ gate->bits.ist = ist;
+ gate->reserved = 0;
+ gate->offset_high = (u32) (func >> 32);
#else
-static inline void pack_gate(gate_desc *gate, unsigned char type,
- unsigned long base, unsigned dpl, unsigned flags,
- unsigned short seg)
-{
- gate->a = (seg << 16) | (base & 0xffff);
- gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
-}
-
+ gate->segment = seg;
+ gate->bits.ist = 0;
#endif
+}
static inline int desc_empty(const void *ptr)
{
@@ -185,7 +177,8 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
}
-static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+ unsigned type, unsigned size)
{
#ifdef CONFIG_X86_64
struct ldttss_desc64 *desc = d;
@@ -193,13 +186,13 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t
memset(desc, 0, sizeof(*desc));
desc->limit0 = size & 0xFFFF;
- desc->base0 = PTR_LOW(addr);
- desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->base0 = (u16) addr;
+ desc->base1 = (addr >> 16) & 0xFF;
desc->type = type;
desc->p = 1;
desc->limit1 = (size >> 16) & 0xF;
- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
- desc->base3 = PTR_HIGH(addr);
+ desc->base2 = (addr >> 24) & 0xFF;
+ desc->base3 = (u32) (addr >> 32);
#else
pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
#endif
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 49265345d4d2..d684bee8a59a 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -47,20 +47,6 @@ enum {
GATE_TASK = 0x5,
};
-/* 16byte gate */
-struct gate_struct64 {
- u16 offset_low;
- u16 segment;
- unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
- u16 offset_middle;
- u32 offset_high;
- u32 zero1;
-} __attribute__((packed));
-
-#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
-#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
-
enum {
DESC_TSS = 0x9,
DESC_LDT = 0x2,
@@ -77,20 +63,51 @@ struct ldttss_desc64 {
u32 zero1;
} __attribute__((packed));
+
#ifdef CONFIG_X86_64
-typedef struct gate_struct64 gate_desc;
typedef struct ldttss_desc64 ldt_desc;
typedef struct ldttss_desc64 tss_desc;
-#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
-#define gate_segment(g) ((g).segment)
#else
-typedef struct desc_struct gate_desc;
typedef struct desc_struct ldt_desc;
typedef struct desc_struct tss_desc;
-#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
-#define gate_segment(g) ((g).a >> 16)
#endif
+struct idt_bits {
+ u16 ist : 3,
+ zero : 5,
+ type : 5,
+ dpl : 2,
+ p : 1;
+} __attribute__((packed));
+
+struct gate_struct {
+ u16 offset_low;
+ u16 segment;
+ struct idt_bits bits;
+ u16 offset_middle;
+#ifdef CONFIG_X86_64
+ u32 offset_high;
+ u32 reserved;
+#endif
+} __attribute__((packed));
+
+typedef struct gate_struct gate_desc;
+
+static inline unsigned long gate_offset(const gate_desc *g)
+{
+#ifdef CONFIG_X86_64
+ return g->offset_low | ((unsigned long)g->offset_middle << 16) |
+ ((unsigned long) g->offset_high << 32);
+#else
+ return g->offset_low | ((unsigned long)g->offset_middle << 16);
+#endif
+}
+
+static inline unsigned long gate_segment(const gate_desc *g)
+{
+ return g->segment;
+}
+
struct desc_ptr {
unsigned short size;
unsigned long address;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a2c95522ac99..7b447d126d17 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8838,7 +8838,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
desc = (gate_desc *)vmx->host_idt_base + vector;
- entry = gate_offset(*desc);
+ entry = gate_offset(desc);
asm volatile(
#ifdef CONFIG_X86_64
"mov %%" _ASM_SP ", %[sp]\n\t"
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 6c279c8f0a0e..49ee3315b9f7 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -591,12 +591,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
{
unsigned long addr;
- if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
+ if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
return 0;
info->vector = vector;
- addr = gate_offset(*val);
+ addr = gate_offset(val);
#ifdef CONFIG_X86_64
/*
* Look for known traps using IST, and substitute them
@@ -629,16 +629,16 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
;
else {
/* Some other trap using IST? */
- if (WARN_ON(val->ist != 0))
+ if (WARN_ON(val->bits.ist != 0))
return 0;
}
#endif /* CONFIG_X86_64 */
info->address = addr;
- info->cs = gate_segment(*val);
- info->flags = val->dpl;
+ info->cs = gate_segment(val);
+ info->flags = val->bits.dpl;
/* interrupt gates clear IF */
- if (val->type == GATE_INTERRUPT)
+ if (val->bits.type == GATE_INTERRUPT)
info->flags |= 1 << 2;
return 1;
--
2.14.2

View File

@ -0,0 +1,93 @@
From a0b37d5a5f250199b6df4e9404d2071802591de6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 Aug 2017 08:47:40 +0200
Subject: [PATCH 028/231] x86/asm: Replace access to desc_struct:a/b fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The union inside of desc_struct which allows access to the raw u32 parts of
the descriptors. This raw access part is about to go away.
Replace the few code parts which access those fields.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170828064958.120214366@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 9a98e7780022aa7cd201eb8a88a4f1d607b73cde)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 8469c76c61ea9c3b86b596352d1148bace5ea706)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/xen/hypercall.h | 6 ++++--
arch/x86/kernel/tls.c | 2 +-
arch/x86/xen/enlighten_pv.c | 2 +-
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 11071fcd630e..9606688caa4b 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -552,6 +552,8 @@ static inline void
MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
struct desc_struct desc)
{
+ u32 *p = (u32 *) &desc;
+
mcl->op = __HYPERVISOR_update_descriptor;
if (sizeof(maddr) == sizeof(long)) {
mcl->args[0] = maddr;
@@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
} else {
mcl->args[0] = maddr;
mcl->args[1] = maddr >> 32;
- mcl->args[2] = desc.a;
- mcl->args[3] = desc.b;
+ mcl->args[2] = *p++;
+ mcl->args[3] = *p;
}
trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index dcd699baea1b..a106b9719c58 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
while (n-- > 0) {
if (LDT_empty(info) || LDT_zero(info)) {
- desc->a = desc->b = 0;
+ memset(desc, 0, sizeof(*desc));
} else {
fill_ldt(desc, info);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 49ee3315b9f7..c76f5ff4d0d7 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -501,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
static inline bool desc_equal(const struct desc_struct *d1,
const struct desc_struct *d2)
{
- return d1->a == d2->a && d1->b == d2->b;
+ return !memcmp(d1, d2, sizeof(*d1));
}
static void load_TLS_descriptor(struct thread_struct *t,
--
2.14.2

View File

@ -0,0 +1,437 @@
From e5688fb8c2c243658f3fe754d33c7250c8aed146 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 31 Aug 2017 19:42:49 +0200
Subject: [PATCH 029/231] x86/xen: Get rid of paravirt op
adjust_exception_frame
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
When running as Xen pv-guest the exception frame on the stack contains
%r11 and %rcx additional to the other data pushed by the processor.
Instead of having a paravirt op being called for each exception type
prepend the Xen specific code to each exception entry. When running as
Xen pv-guest just use the exception entry with prepended instructions,
otherwise use the entry without the Xen specific code.
[ tglx: Merged through tip to avoid ugly merge conflict ]
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: xen-devel@lists.xenproject.org
Cc: boris.ostrovsky@oracle.com
Cc: luto@amacapital.net
Link: http://lkml.kernel.org/r/20170831174249.26853-1-jg@pfupf.net
(backported from commit 5878d5d6fdef6447d73b0acc121ba445bef37f53)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 9a6fb927deb3ebbe831741ca82081714637181a7)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/paravirt.h | 5 --
arch/x86/include/asm/paravirt_types.h | 3 --
arch/x86/include/asm/proto.h | 3 ++
arch/x86/include/asm/traps.h | 28 ++++++++--
arch/x86/xen/xen-ops.h | 1 -
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/paravirt.c | 3 --
arch/x86/xen/enlighten_pv.c | 98 +++++++++++++++++++++++------------
arch/x86/xen/irq.c | 3 --
arch/x86/entry/entry_64.S | 23 ++------
arch/x86/entry/entry_64_compat.S | 1 -
arch/x86/xen/xen-asm_64.S | 41 +++++++++++++--
12 files changed, 133 insertions(+), 77 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9ccac1926587..c25dd22f7c70 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -960,11 +960,6 @@ extern void default_banner(void);
#define GET_CR2_INTO_RAX \
call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2)
-#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
- CLBR_NONE, \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
-
#define USERGS_SYSRET64 \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9ffc36bfe4cd..6b64fc6367f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -196,9 +196,6 @@ struct pv_irq_ops {
void (*safe_halt)(void);
void (*halt)(void);
-#ifdef CONFIG_X86_64
- void (*adjust_exception_frame)(void);
-#endif
} __no_randomize_layout;
struct pv_mmu_ops {
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 8d3964fc5f91..b408b1886195 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_INT80_compat(void);
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+void xen_entry_INT80_compat(void);
+#endif
#endif
void x86_configure_nx(void);
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b4f322d6c95f..feb89dbe359d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,9 +13,6 @@ asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
-asmlinkage void xen_debug(void);
-asmlinkage void xen_int3(void);
-asmlinkage void xen_stack_segment(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
@@ -56,6 +53,31 @@ asmlinkage void simd_coprocessor_error(void);
#define trace_page_fault page_fault
#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xendebug(void);
+asmlinkage void xen_xenint3(void);
+asmlinkage void xen_nmi(void);
+asmlinkage void xen_overflow(void);
+asmlinkage void xen_bounds(void);
+asmlinkage void xen_invalid_op(void);
+asmlinkage void xen_device_not_available(void);
+asmlinkage void xen_double_fault(void);
+asmlinkage void xen_coprocessor_segment_overrun(void);
+asmlinkage void xen_invalid_TSS(void);
+asmlinkage void xen_segment_not_present(void);
+asmlinkage void xen_stack_segment(void);
+asmlinkage void xen_general_protection(void);
+asmlinkage void xen_page_fault(void);
+asmlinkage void xen_spurious_interrupt_bug(void);
+asmlinkage void xen_coprocessor_error(void);
+asmlinkage void xen_alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void xen_machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void xen_simd_coprocessor_error(void);
+#endif
+
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
dotraplinkage void do_nmi(struct pt_regs *, long);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 70301ac0d414..c8a6d224f7ed 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -138,7 +138,6 @@ __visible void xen_restore_fl_direct(unsigned long);
__visible void xen_iret(void);
__visible void xen_sysret32(void);
__visible void xen_sysret64(void);
-__visible void xen_adjust_exception_frame(void);
extern int xen_panic_handler_init(void);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 99332f550c48..cf42206926af 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,7 +20,6 @@ static char syscalls_ia32[] = {
int main(void)
{
#ifdef CONFIG_PARAVIRT
- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bc0a849589bb..a14df9eecfed 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = {
.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
.halt = native_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
-#endif
};
__visible struct pv_cpu_ops pv_cpu_ops = {
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index c76f5ff4d0d7..ae2a2e2d6362 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -586,6 +586,70 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
preempt_enable();
}
+#ifdef CONFIG_X86_64
+struct trap_array_entry {
+ void (*orig)(void);
+ void (*xen)(void);
+ bool ist_okay;
+};
+
+static struct trap_array_entry trap_array[] = {
+ { debug, xen_xendebug, true },
+ { int3, xen_xenint3, true },
+ { double_fault, xen_double_fault, true },
+#ifdef CONFIG_X86_MCE
+ { machine_check, xen_machine_check, true },
+#endif
+ { nmi, xen_nmi, true },
+ { overflow, xen_overflow, false },
+#ifdef CONFIG_IA32_EMULATION
+ { entry_INT80_compat, xen_entry_INT80_compat, false },
+#endif
+ { page_fault, xen_page_fault, false },
+ { divide_error, xen_divide_error, false },
+ { bounds, xen_bounds, false },
+ { invalid_op, xen_invalid_op, false },
+ { device_not_available, xen_device_not_available, false },
+ { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
+ { invalid_TSS, xen_invalid_TSS, false },
+ { segment_not_present, xen_segment_not_present, false },
+ { stack_segment, xen_stack_segment, false },
+ { general_protection, xen_general_protection, false },
+ { spurious_interrupt_bug, xen_spurious_interrupt_bug, false },
+ { coprocessor_error, xen_coprocessor_error, false },
+ { alignment_check, xen_alignment_check, false },
+ { simd_coprocessor_error, xen_simd_coprocessor_error, false },
+};
+
+static bool get_trap_addr(void **addr, unsigned int ist)
+{
+ unsigned int nr;
+ bool ist_okay = false;
+
+ /*
+ * Replace trap handler addresses by Xen specific ones.
+ * Check for known traps using IST and whitelist them.
+ * The debugger ones are the only ones we care about.
+ * Xen will handle faults like double_fault, * so we should never see
+ * them. Warn if there's an unexpected IST-using fault handler.
+ */
+ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
+ struct trap_array_entry *entry = trap_array + nr;
+
+ if (*addr == entry->orig) {
+ *addr = entry->xen;
+ ist_okay = entry->ist_okay;
+ break;
+ }
+ }
+
+ if (WARN_ON(ist != 0 && !ist_okay))
+ return false;
+
+ return true;
+}
+#endif
+
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
@@ -598,40 +662,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
addr = gate_offset(val);
#ifdef CONFIG_X86_64
- /*
- * Look for known traps using IST, and substitute them
- * appropriately. The debugger ones are the only ones we care
- * about. Xen will handle faults like double_fault,
- * so we should never see them. Warn if
- * there's an unexpected IST-using fault handler.
- */
- if (addr == (unsigned long)debug)
- addr = (unsigned long)xen_debug;
- else if (addr == (unsigned long)int3)
- addr = (unsigned long)xen_int3;
- else if (addr == (unsigned long)stack_segment)
- addr = (unsigned long)xen_stack_segment;
- else if (addr == (unsigned long)double_fault) {
- /* Don't need to handle these */
+ if (!get_trap_addr((void **)&addr, val->bits.ist))
return 0;
-#ifdef CONFIG_X86_MCE
- } else if (addr == (unsigned long)machine_check) {
- /*
- * when xen hypervisor inject vMCE to guest,
- * use native mce handler to handle it
- */
- ;
-#endif
- } else if (addr == (unsigned long)nmi)
- /*
- * Use the native version as well.
- */
- ;
- else {
- /* Some other trap using IST? */
- if (WARN_ON(val->bits.ist != 0))
- return 0;
- }
#endif /* CONFIG_X86_64 */
info->address = addr;
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 33e92955e09d..d4eff5676cfa 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
.safe_halt = xen_safe_halt,
.halt = xen_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = xen_adjust_exception_frame,
-#endif
};
void __init xen_init_irq_ops(void)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index dfabcbf8e813..c12260ef3e4b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -829,7 +829,6 @@ ENTRY(\sym)
.endif
ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
.ifeq \has_error_code
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -975,7 +974,7 @@ ENTRY(do_softirq_own_stack)
ENDPROC(do_softirq_own_stack)
#ifdef CONFIG_XEN
-idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
* A note on the "critical region" in our callback handler.
@@ -1042,8 +1041,6 @@ ENTRY(xen_failsafe_callback)
movq 8(%rsp), %r11
addq $0x30, %rsp
pushq $0 /* RIP */
- pushq %r11
- pushq %rcx
UNWIND_HINT_IRET_REGS offset=8
jmp general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1074,9 +1071,8 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
-idtentry xen_debug do_debug has_error_code=0
-idtentry xen_int3 do_int3 has_error_code=0
-idtentry xen_stack_segment do_stack_segment has_error_code=1
+idtentry xendebug do_debug has_error_code=0
+idtentry xenint3 do_int3 has_error_code=0
#endif
idtentry general_protection do_general_protection has_error_code=1
@@ -1240,20 +1236,9 @@ ENTRY(error_exit)
END(error_exit)
/* Runs on exception stack */
+/* XXX: broken on Xen PV */
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
- /*
- * Fix up the exception frame if we're on Xen.
- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
- * one value to the stack on native, so it may clobber the rdx
- * scratch slot, but it won't clobber any of the important
- * slots past it.
- *
- * Xen is a different story, because the Xen frame itself overlaps
- * the "NMI executing" variable.
- */
- PARAVIRT_ADJUST_EXCEPTION_FRAME
-
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 5314d7b8e5ad..d8468ba24be0 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -293,7 +293,6 @@ ENTRY(entry_INT80_compat)
/*
* Interrupts are off on entry.
*/
- PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 3a3b6a211584..dae2cc33afb5 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -16,11 +16,42 @@
#include <linux/linkage.h>
-ENTRY(xen_adjust_exception_frame)
- mov 8+0(%rsp), %rcx
- mov 8+8(%rsp), %r11
- ret $16
-ENDPROC(xen_adjust_exception_frame)
+.macro xen_pv_trap name
+ENTRY(xen_\name)
+ pop %rcx
+ pop %r11
+ jmp \name
+END(xen_\name)
+.endm
+
+xen_pv_trap divide_error
+xen_pv_trap debug
+xen_pv_trap xendebug
+xen_pv_trap int3
+xen_pv_trap xenint3
+xen_pv_trap nmi
+xen_pv_trap overflow
+xen_pv_trap bounds
+xen_pv_trap invalid_op
+xen_pv_trap device_not_available
+xen_pv_trap double_fault
+xen_pv_trap coprocessor_segment_overrun
+xen_pv_trap invalid_TSS
+xen_pv_trap segment_not_present
+xen_pv_trap stack_segment
+xen_pv_trap general_protection
+xen_pv_trap page_fault
+xen_pv_trap spurious_interrupt_bug
+xen_pv_trap coprocessor_error
+xen_pv_trap alignment_check
+#ifdef CONFIG_X86_MCE
+xen_pv_trap machine_check
+#endif /* CONFIG_X86_MCE */
+xen_pv_trap simd_coprocessor_error
+#ifdef CONFIG_IA32_EMULATION
+xen_pv_trap entry_INT80_compat
+#endif
+xen_pv_trap hypervisor_callback
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
--
2.14.2

View File

@ -0,0 +1,391 @@
From bbb647f65a627420f8c3351b34d14490a9878509 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 4 Sep 2017 12:25:27 +0200
Subject: [PATCH 030/231] x86/paravirt: Remove no longer used paravirt
functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
With removal of lguest some of the paravirt functions are no longer
needed:
->read_cr4()
->store_idt()
->set_pmd_at()
->set_pud_at()
->pte_update()
Remove them.
Signed-off-by: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: akataria@vmware.com
Cc: boris.ostrovsky@oracle.com
Cc: chrisw@sous-sol.org
Cc: jeremy@goop.org
Cc: rusty@rustcorp.com.au
Cc: virtualization@lists.linux-foundation.org
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/20170904102527.25409-1-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 87930019c713873a1c3b9bd55dde46e81f70c8f1)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit edf3ab0080a6e79a300753e66929b0b7499eaec5)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/desc.h | 3 +--
arch/x86/include/asm/paravirt.h | 37 -----------------------------------
arch/x86/include/asm/paravirt_types.h | 9 ---------
arch/x86/include/asm/pgtable.h | 27 ++++---------------------
arch/x86/include/asm/special_insns.h | 10 +++++-----
arch/x86/kernel/paravirt.c | 5 -----
arch/x86/kvm/vmx.c | 2 +-
arch/x86/mm/pgtable.c | 7 +------
arch/x86/xen/enlighten_pv.c | 2 --
arch/x86/xen/mmu_pv.c | 2 --
10 files changed, 12 insertions(+), 92 deletions(-)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 57e502a4e92f..f995e5a09136 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -120,7 +120,6 @@ static inline int desc_empty(const void *ptr)
#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
#define load_TLS(t, cpu) native_load_tls(t, cpu)
@@ -241,7 +240,7 @@ static inline void native_store_gdt(struct desc_ptr *dtr)
asm volatile("sgdt %0":"=m" (*dtr));
}
-static inline void native_store_idt(struct desc_ptr *dtr)
+static inline void store_idt(struct desc_ptr *dtr)
{
asm volatile("sidt %0":"=m" (*dtr));
}
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c25dd22f7c70..12deec722cf0 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -71,11 +71,6 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}
-static inline unsigned long __read_cr4(void)
-{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
-}
-
static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
@@ -228,10 +223,6 @@ static inline void set_ldt(const void *addr, unsigned entries)
{
PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
-static inline void store_idt(struct desc_ptr *dtr)
-{
- PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
-}
static inline unsigned long paravirt_store_tr(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
@@ -365,12 +356,6 @@ static inline void paravirt_release_p4d(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
}
-static inline void pte_update(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
-}
-
static inline pte_t __pte(pteval_t val)
{
pteval_t ret;
@@ -472,28 +457,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
}
-static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
-{
- if (sizeof(pmdval_t) > sizeof(long))
- /* 5 arg words */
- pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
- else
- PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
- native_pmd_val(pmd));
-}
-
-static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud)
-{
- if (sizeof(pudval_t) > sizeof(long))
- /* 5 arg words */
- pv_mmu_ops.set_pud_at(mm, addr, pudp, pud);
- else
- PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp,
- native_pud_val(pud));
-}
-
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 6b64fc6367f2..42873edd9f9d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -107,7 +107,6 @@ struct pv_cpu_ops {
unsigned long (*read_cr0)(void);
void (*write_cr0)(unsigned long);
- unsigned long (*read_cr4)(void);
void (*write_cr4)(unsigned long);
#ifdef CONFIG_X86_64
@@ -119,8 +118,6 @@ struct pv_cpu_ops {
void (*load_tr_desc)(void);
void (*load_gdt)(const struct desc_ptr *);
void (*load_idt)(const struct desc_ptr *);
- /* store_gdt has been removed. */
- void (*store_idt)(struct desc_ptr *);
void (*set_ldt)(const void *desc, unsigned entries);
unsigned long (*store_tr)(void);
void (*load_tls)(struct thread_struct *t, unsigned int cpu);
@@ -245,12 +242,6 @@ struct pv_mmu_ops {
void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
- void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmdval);
- void (*set_pud_at)(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pudval);
- void (*pte_update)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 77037b6f1caa..bb8e9ea7deb4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -43,8 +43,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#else /* !CONFIG_PARAVIRT */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
-#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
-#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
@@ -75,8 +73,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd) native_pmd_clear(pmd)
-#define pte_update(mm, addr, ptep) do { } while (0)
-
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
@@ -965,31 +961,18 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
native_set_pte(ptep, pte);
}
-static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp , pmd_t pmd)
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
{
native_set_pmd(pmdp, pmd);
}
-static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud)
+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
{
native_set_pud(pudp, pud);
}
-#ifndef CONFIG_PARAVIRT
-/*
- * Rules for using pte_update - it must be called after any PTE update which
- * has not been done using the set_pte / clear_pte interfaces. It is used by
- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
- * updates should either be sets, clears, or set_pte_atomic for P->P
- * transitions, which means this hook should only be called for user PTEs.
- * This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush.
- */
-#define pte_update(mm, addr, ptep) do { } while (0)
-#endif
-
/*
* We only update the dirty/accessed state if we set
* the dirty bit by hand in the kernel, since the hardware
@@ -1017,7 +1000,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
- pte_update(mm, addr, ptep);
return pte;
}
@@ -1044,7 +1026,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
- pte_update(mm, addr, ptep);
}
#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 9efaabf5b54b..a24dfcf79f4a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -135,6 +135,11 @@ static inline void native_wbinvd(void)
extern asmlinkage void native_load_gs_index(unsigned);
+static inline unsigned long __read_cr4(void)
+{
+ return native_read_cr4();
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -173,11 +178,6 @@ static inline void write_cr3(unsigned long x)
native_write_cr3(x);
}
-static inline unsigned long __read_cr4(void)
-{
- return native_read_cr4();
-}
-
static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index a14df9eecfed..19a3e8f961c7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -327,7 +327,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.set_debugreg = native_set_debugreg,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
- .read_cr4 = native_read_cr4,
.write_cr4 = native_write_cr4,
#ifdef CONFIG_X86_64
.read_cr8 = native_read_cr8,
@@ -343,7 +342,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
.load_idt = native_load_idt,
- .store_idt = native_store_idt,
.store_tr = native_store_tr,
.load_tls = native_load_tls,
#ifdef CONFIG_X86_64
@@ -411,8 +409,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
- .set_pmd_at = native_set_pmd_at,
- .pte_update = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
@@ -424,7 +420,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.pmd_clear = native_pmd_clear,
#endif
.set_pud = native_set_pud,
- .set_pud_at = native_set_pud_at,
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7b447d126d17..dd4996a96c71 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5174,7 +5174,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
- native_store_idt(&dt);
+ store_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
vmx->host_idt_base = dt.address;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 508a708eb9a6..942391b5b639 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -426,10 +426,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
{
int changed = !pte_same(*ptep, entry);
- if (changed && dirty) {
+ if (changed && dirty)
*ptep = entry;
- pte_update(vma->vm_mm, address, ptep);
- }
return changed;
}
@@ -486,9 +484,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
- if (ret)
- pte_update(vma->vm_mm, addr, ptep);
-
return ret;
}
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ae2a2e2d6362..69b9deff7e5c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1038,7 +1038,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
- .read_cr4 = native_read_cr4,
.write_cr4 = xen_write_cr4,
#ifdef CONFIG_X86_64
@@ -1073,7 +1072,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.alloc_ldt = xen_alloc_ldt,
.free_ldt = xen_free_ldt,
- .store_idt = native_store_idt,
.store_tr = xen_store_tr,
.write_ldt_entry = xen_write_ldt_entry,
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index cab28cf2cffb..5f61b7e2e6b2 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2430,8 +2430,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_single = xen_flush_tlb_single,
.flush_tlb_others = xen_flush_tlb_others,
- .pte_update = paravirt_nop,
-
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
--
2.14.2

View File

@ -0,0 +1,53 @@
From b368fed558634ffc92dba0d7d9e4e631d26cd92f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 20 Oct 2017 11:21:33 -0500
Subject: [PATCH 031/231] x86/entry: Fix idtentry unwind hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This fixes the following ORC warning in the 'int3' entry code:
WARNING: can't dereference iret registers at ffff8801c5f17fe0 for ip ffffffff95f0d94b
The ORC metadata had the wrong stack offset for the iret registers.
Their location on the stack is dependent on whether the exception has an
error code.
Reported-and-tested-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 8c1f75587a18 ("x86/entry/64: Add unwind hint annotations")
Link: http://lkml.kernel.org/r/931d57f0551ed7979d5e7e05370d445c8e5137f8.1508516398.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 98990a33b77dda9babf91cb235654f6729e5702e)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 266be2a5053230f6d0b6f27d3e8e9f28df40dd7e)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index c12260ef3e4b..2e4fc6425f47 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -821,7 +821,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
- UNWIND_HINT_IRET_REGS offset=8
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
/* Sanity check */
.if \shift_ist != -1 && \paranoid == 0
--
2.14.2

View File

@ -0,0 +1,237 @@
From d9fd6653e5dd9d80c7c75065329250529281e02d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 10 Sep 2017 17:48:27 -0700
Subject: [PATCH 032/231] x86/mm/64: Initialize CR4.PCIDE early
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
cpu_init() is weird: it's called rather late (after early
identification and after most MMU state is initialized) on the boot
CPU but is called extremely early (before identification) on secondary
CPUs. It's called just late enough on the boot CPU that its CR4 value
isn't propagated to mmu_cr4_features.
Even if we put CR4.PCIDE into mmu_cr4_features, we'd hit two
problems. First, we'd crash in the trampoline code. That's
fixable, and I tried that. It turns out that mmu_cr4_features is
totally ignored by secondary_start_64(), though, so even with the
trampoline code fixed, it wouldn't help.
This means that we don't currently have CR4.PCIDE reliably initialized
before we start playing with cpu_tlbstate. This is very fragile and
tends to cause boot failures if I make even small changes to the TLB
handling code.
Make it more robust: initialize CR4.PCIDE earlier on the boot CPU
and propagate it to secondary CPUs in start_secondary().
( Yes, this is ugly. I think we should have improved mmu_cr4_features
to actually control CR4 during secondary bootup, but that would be
fairly intrusive at this stage. )
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Tested-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit c7ad5ad297e644601747d6dbee978bf85e14f7bc)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 0e6a37a43aa876327e7d21881c09977da2d5c270)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/cpu/common.c | 49 +++++++-------------------------------------
arch/x86/kernel/setup.c | 5 ++++-
arch/x86/kernel/smpboot.c | 8 +++++---
arch/x86/mm/init.c | 34 ++++++++++++++++++++++++++++++
4 files changed, 50 insertions(+), 46 deletions(-)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0b80ed14ff52..4be7b209a3d6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -169,21 +169,21 @@ static int __init x86_mpx_setup(char *s)
__setup("nompx", x86_mpx_setup);
#ifdef CONFIG_X86_64
-static int __init x86_pcid_setup(char *s)
+static int __init x86_nopcid_setup(char *s)
{
- /* require an exact match without trailing characters */
- if (strlen(s))
- return 0;
+ /* nopcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
/* do not emit a message if the feature is not present */
if (!boot_cpu_has(X86_FEATURE_PCID))
- return 1;
+ return 0;
setup_clear_cpu_cap(X86_FEATURE_PCID);
pr_info("nopcid: PCID feature disabled\n");
- return 1;
+ return 0;
}
-__setup("nopcid", x86_pcid_setup);
+early_param("nopcid", x86_nopcid_setup);
#endif
static int __init x86_noinvpcid_setup(char *s)
@@ -329,38 +329,6 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
-static void setup_pcid(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_PCID)) {
- if (cpu_has(c, X86_FEATURE_PGE)) {
- /*
- * We'd like to use cr4_set_bits_and_update_boot(),
- * but we can't. CR4.PCIDE is special and can only
- * be set in long mode, and the early CPU init code
- * doesn't know this and would try to restore CR4.PCIDE
- * prior to entering long mode.
- *
- * Instead, we rely on the fact that hotplug, resume,
- * etc all fully restore CR4 before they write anything
- * that could have nonzero PCID bits to CR3. CR4.PCIDE
- * has no effect on the page tables themselves, so we
- * don't need it to be restored early.
- */
- cr4_set_bits(X86_CR4_PCIDE);
- } else {
- /*
- * flush_tlb_all(), as currently implemented, won't
- * work if PCID is on but PGE is not. Since that
- * combination doesn't exist on real hardware, there's
- * no reason to try to fully support it, but it's
- * polite to avoid corrupting data if we're on
- * an improperly configured VM.
- */
- clear_cpu_cap(c, X86_FEATURE_PCID);
- }
- }
-}
-
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1175,9 +1143,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
- /* Set up PCID */
- setup_pcid(c);
-
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d7e8b983aa72..f964bfddfefd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1174,8 +1174,11 @@ void __init setup_arch(char **cmdline_p)
* with the current CR4 value. This may not be necessary, but
* auditing all the early-boot CR4 manipulation would be needed to
* rule it out.
+ *
+ * Mask off features that don't work outside long mode (just
+ * PCIDE for now).
*/
- mmu_cr4_features = __read_cr4();
+ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
memblock_set_current_limit(get_max_mapped());
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 893fd8c849e2..d05006f6c31c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -227,10 +227,12 @@ static int enable_start_cpu0;
static void notrace start_secondary(void *unused)
{
/*
- * Don't put *anything* before cpu_init(), SMP booting is too
- * fragile that we want to limit the things done here to the
- * most necessary things.
+ * Don't put *anything* except direct CPU state initialization
+ * before cpu_init(), SMP booting is too fragile that we want to
+ * limit the things done here to the most necessary things.
*/
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ __write_cr4(__read_cr4() | X86_CR4_PCIDE);
cpu_init();
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bf3f1065d6ad..df2624b091a7 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -19,6 +19,7 @@
#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
+#include <asm/cpufeature.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -193,6 +194,38 @@ static void __init probe_page_size_mask(void)
}
}
+static void setup_pcid(void)
+{
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_PCID)) {
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * This can't be cr4_set_bits_and_update_boot() --
+ * the trampoline code can't handle CR4.PCIDE and
+ * it wouldn't do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually
+ * cause the bits in question to remain set all the
+ * way through the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE
+ * manually in start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ }
+ }
+#endif
+}
+
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
@@ -592,6 +625,7 @@ void __init init_mem_mapping(void)
unsigned long end;
probe_page_size_mask();
+ setup_pcid();
#ifdef CONFIG_X86_64
end = max_pfn << PAGE_SHIFT;
--
2.14.2

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,641 @@
From 338c7d8678b82c46668ce3b73f7339f71ab69cc8 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 11 Jul 2017 10:33:43 -0500
Subject: [PATCH 034/231] objtool, x86: Add facility for asm code to provide
unwind hints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Some asm (and inline asm) code does special things to the stack which
objtool can't understand. (Nor can GCC or GNU assembler, for that
matter.) In such cases we need a facility for the code to provide
annotations, so the unwinder can unwind through it.
This provides such a facility, in the form of unwind hints. They're
similar to the GNU assembler .cfi* directives, but they give more
information, and are needed in far fewer places, because objtool can
fill in the blanks by following branches and adjusting the stack pointer
for pushes and pops.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/0f5f3c9104fca559ff4088bece1d14ae3bca52d5.1499786555.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 39358a033b2e4432052265c1fa0f36f572d8cfb5)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit a1fed2e10e84d48643a09861c2d127968621813e)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
tools/objtool/Makefile | 3 +
arch/x86/include/asm/orc_types.h | 107 ++++++++++++++++++++
arch/x86/include/asm/unwind_hints.h | 103 +++++++++++++++++++
tools/objtool/check.h | 4 +-
tools/objtool/orc_types.h | 22 +++++
tools/objtool/check.c | 191 +++++++++++++++++++++++++++++++++---
6 files changed, 417 insertions(+), 13 deletions(-)
create mode 100644 arch/x86/include/asm/orc_types.h
create mode 100644 arch/x86/include/asm/unwind_hints.h
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 0e2765e243c0..3a6425fefc43 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -52,6 +52,9 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \
diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \
|| echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true
+ @(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \
+ diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) \
+ || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true
$(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000000..7dc777a6cb40
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED 0
+#define ORC_REG_PREV_SP 1
+#define ORC_REG_DX 2
+#define ORC_REG_DI 3
+#define ORC_REG_BP 4
+#define ORC_REG_SP 5
+#define ORC_REG_R10 6
+#define ORC_REG_R13 7
+#define ORC_REG_BP_INDIRECT 8
+#define ORC_REG_SP_INDIRECT 9
+#define ORC_REG_MAX 15
+
+/*
+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
+ * caller's SP right before it made the call). Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
+ * to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
+ * points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct. They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
+ */
+#define ORC_TYPE_CALL 0
+#define ORC_TYPE_REGS 1
+#define ORC_TYPE_REGS_IRET 2
+#define UNWIND_HINT_TYPE_SAVE 3
+#define UNWIND_HINT_TYPE_RESTORE 4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard. It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address. Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+ s16 sp_offset;
+ s16 bp_offset;
+ unsigned sp_reg:4;
+ unsigned bp_reg:4;
+ unsigned type:2;
+};
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+ u32 ip;
+ s16 sp_offset;
+ u8 sp_reg;
+ u8 type;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000000..5e02b11c9b86
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,103 @@
+#ifndef _ASM_X86_UNWIND_HINTS_H
+#define _ASM_X86_UNWIND_HINTS_H
+
+#include "orc_types.h"
+
+#ifdef __ASSEMBLY__
+
+/*
+ * In asm, there are two kinds of code: normal C-type callable functions and
+ * the rest. The normal callable functions can be called by other code, and
+ * don't do anything unusual with the stack. Such normal callable functions
+ * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this
+ * category. In this case, no special debugging annotations are needed because
+ * objtool can automatically generate the ORC data for the ORC unwinder to read
+ * at runtime.
+ *
+ * Anything which doesn't fall into the above category, such as syscall and
+ * interrupt handlers, tends to not be called directly by other functions, and
+ * often does unusual non-C-function-type things with the stack pointer. Such
+ * code needs to be annotated such that objtool can understand it. The
+ * following CFI hint macros are for this type of code.
+ *
+ * These macros provide hints to objtool about the state of the stack at each
+ * instruction. Objtool starts from the hints and follows the code flow,
+ * making automatic CFI adjustments when it sees pushes and pops, filling out
+ * the debuginfo as necessary. It will also warn if it sees any
+ * inconsistencies.
+ */
+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
+#ifdef CONFIG_STACK_VALIDATION
+.Lunwind_hint_ip_\@:
+ .pushsection .discard.unwind_hints
+ /* struct unwind_hint */
+ .long .Lunwind_hint_ip_\@ - .
+ .short \sp_offset
+ .byte \sp_reg
+ .byte \type
+ .popsection
+#endif
+.endm
+
+.macro UNWIND_HINT_EMPTY
+ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
+.endm
+
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
+ .if \base == %rsp && \indirect
+ .set sp_reg, ORC_REG_SP_INDIRECT
+ .elseif \base == %rsp
+ .set sp_reg, ORC_REG_SP
+ .elseif \base == %rbp
+ .set sp_reg, ORC_REG_BP
+ .elseif \base == %rdi
+ .set sp_reg, ORC_REG_DI
+ .elseif \base == %rdx
+ .set sp_reg, ORC_REG_DX
+ .elseif \base == %r10
+ .set sp_reg, ORC_REG_R10
+ .else
+ .error "UNWIND_HINT_REGS: bad base register"
+ .endif
+
+ .set sp_offset, \offset
+
+ .if \iret
+ .set type, ORC_TYPE_REGS_IRET
+ .elseif \extra == 0
+ .set type, ORC_TYPE_REGS_IRET
+ .set sp_offset, \offset + (16*8)
+ .else
+ .set type, ORC_TYPE_REGS
+ .endif
+
+ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
+ UNWIND_HINT_REGS base=\base offset=\offset iret=1
+.endm
+
+.macro UNWIND_HINT_FUNC sp_offset=8
+ UNWIND_HINT sp_offset=\sp_offset
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define UNWIND_HINT(sp_reg, sp_offset, type) \
+ "987: \n\t" \
+ ".pushsection .discard.unwind_hints\n\t" \
+ /* struct unwind_hint */ \
+ ".long 987b - .\n\t" \
+ ".short " __stringify(sp_offset) "\n\t" \
+ ".byte " __stringify(sp_reg) "\n\t" \
+ ".byte " __stringify(type) "\n\t" \
+ ".popsection\n\t"
+
+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
+
+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index 046874bbe226..ac3d4b13f17b 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -43,7 +43,7 @@ struct instruction {
unsigned int len;
unsigned char type;
unsigned long immediate;
- bool alt_group, visited, dead_end, ignore;
+ bool alt_group, visited, dead_end, ignore, hint, save, restore;
struct symbol *call_dest;
struct instruction *jump_dest;
struct list_head alts;
@@ -58,7 +58,7 @@ struct objtool_file {
struct list_head insn_list;
DECLARE_HASHTABLE(insn_hash, 16);
struct section *rodata, *whitelist;
- bool ignore_unreachables, c_file;
+ bool ignore_unreachables, c_file, hints;
};
int check(const char *objname, bool nofp, bool orc);
diff --git a/tools/objtool/orc_types.h b/tools/objtool/orc_types.h
index fc5cf6cffd9a..9c9dc579bd7d 100644
--- a/tools/objtool/orc_types.h
+++ b/tools/objtool/orc_types.h
@@ -61,11 +61,19 @@
*
* ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
* points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct. They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
*/
#define ORC_TYPE_CALL 0
#define ORC_TYPE_REGS 1
#define ORC_TYPE_REGS_IRET 2
+#define UNWIND_HINT_TYPE_SAVE 3
+#define UNWIND_HINT_TYPE_RESTORE 4
+#ifndef __ASSEMBLY__
/*
* This struct is more or less a vastly simplified version of the DWARF Call
* Frame Information standard. It contains only the necessary parts of DWARF
@@ -82,4 +90,18 @@ struct orc_entry {
unsigned type:2;
} __packed;
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+ u32 ip;
+ s16 sp_offset;
+ u8 sp_reg;
+ u8 type;
+};
+#endif /* __ASSEMBLY__ */
+
#endif /* _ORC_TYPES_H */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index cb57c526ba17..368275de5f23 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -100,7 +100,6 @@ static bool gcov_enabled(struct objtool_file *file)
static bool ignore_func(struct objtool_file *file, struct symbol *func)
{
struct rela *rela;
- struct instruction *insn;
/* check for STACK_FRAME_NON_STANDARD */
if (file->whitelist && file->whitelist->rela)
@@ -113,11 +112,6 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
return true;
}
- /* check if it has a context switching instruction */
- func_for_each_insn(file, func, insn)
- if (insn->type == INSN_CONTEXT_SWITCH)
- return true;
-
return false;
}
@@ -879,6 +873,99 @@ static int add_switch_table_alts(struct objtool_file *file)
return 0;
}
+static int read_unwind_hints(struct objtool_file *file)
+{
+ struct section *sec, *relasec;
+ struct rela *rela;
+ struct unwind_hint *hint;
+ struct instruction *insn;
+ struct cfi_reg *cfa;
+ int i;
+
+ sec = find_section_by_name(file->elf, ".discard.unwind_hints");
+ if (!sec)
+ return 0;
+
+ relasec = sec->rela;
+ if (!relasec) {
+ WARN("missing .rela.discard.unwind_hints section");
+ return -1;
+ }
+
+ if (sec->len % sizeof(struct unwind_hint)) {
+ WARN("struct unwind_hint size mismatch");
+ return -1;
+ }
+
+ file->hints = true;
+
+ for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) {
+ hint = (struct unwind_hint *)sec->data->d_buf + i;
+
+ rela = find_rela_by_dest(sec, i * sizeof(*hint));
+ if (!rela) {
+ WARN("can't find rela for unwind_hints[%d]", i);
+ return -1;
+ }
+
+ insn = find_insn(file, rela->sym->sec, rela->addend);
+ if (!insn) {
+ WARN("can't find insn for unwind_hints[%d]", i);
+ return -1;
+ }
+
+ cfa = &insn->state.cfa;
+
+ if (hint->type == UNWIND_HINT_TYPE_SAVE) {
+ insn->save = true;
+ continue;
+
+ } else if (hint->type == UNWIND_HINT_TYPE_RESTORE) {
+ insn->restore = true;
+ insn->hint = true;
+ continue;
+ }
+
+ insn->hint = true;
+
+ switch (hint->sp_reg) {
+ case ORC_REG_UNDEFINED:
+ cfa->base = CFI_UNDEFINED;
+ break;
+ case ORC_REG_SP:
+ cfa->base = CFI_SP;
+ break;
+ case ORC_REG_BP:
+ cfa->base = CFI_BP;
+ break;
+ case ORC_REG_SP_INDIRECT:
+ cfa->base = CFI_SP_INDIRECT;
+ break;
+ case ORC_REG_R10:
+ cfa->base = CFI_R10;
+ break;
+ case ORC_REG_R13:
+ cfa->base = CFI_R13;
+ break;
+ case ORC_REG_DI:
+ cfa->base = CFI_DI;
+ break;
+ case ORC_REG_DX:
+ cfa->base = CFI_DX;
+ break;
+ default:
+ WARN_FUNC("unsupported unwind_hint sp base reg %d",
+ insn->sec, insn->offset, hint->sp_reg);
+ return -1;
+ }
+
+ cfa->offset = hint->sp_offset;
+ insn->state.type = hint->type;
+ }
+
+ return 0;
+}
+
static int decode_sections(struct objtool_file *file)
{
int ret;
@@ -909,6 +996,10 @@ static int decode_sections(struct objtool_file *file)
if (ret)
return ret;
+ ret = read_unwind_hints(file);
+ if (ret)
+ return ret;
+
return 0;
}
@@ -1382,7 +1473,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
struct insn_state state)
{
struct alternative *alt;
- struct instruction *insn;
+ struct instruction *insn, *next_insn;
struct section *sec;
struct symbol *func = NULL;
int ret;
@@ -1397,6 +1488,8 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
}
while (1) {
+ next_insn = next_insn_same_sec(file, insn);
+
if (file->c_file && insn->func) {
if (func && func != insn->func) {
WARN("%s() falls through to next function %s()",
@@ -1414,13 +1507,54 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
}
if (insn->visited) {
- if (!!insn_state_match(insn, &state))
+ if (!insn->hint && !insn_state_match(insn, &state))
return 1;
return 0;
}
- insn->state = state;
+ if (insn->hint) {
+ if (insn->restore) {
+ struct instruction *save_insn, *i;
+
+ i = insn;
+ save_insn = NULL;
+ func_for_each_insn_continue_reverse(file, func, i) {
+ if (i->save) {
+ save_insn = i;
+ break;
+ }
+ }
+
+ if (!save_insn) {
+ WARN_FUNC("no corresponding CFI save for CFI restore",
+ sec, insn->offset);
+ return 1;
+ }
+
+ if (!save_insn->visited) {
+ /*
+ * Oops, no state to copy yet.
+ * Hopefully we can reach this
+ * instruction from another branch
+ * after the save insn has been
+ * visited.
+ */
+ if (insn == first)
+ return 0;
+
+ WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo",
+ sec, insn->offset);
+ return 1;
+ }
+
+ insn->state = save_insn->state;
+ }
+
+ state = insn->state;
+
+ } else
+ insn->state = state;
insn->visited = true;
@@ -1497,6 +1631,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
return 0;
+ case INSN_CONTEXT_SWITCH:
+ if (func && (!next_insn || !next_insn->hint)) {
+ WARN_FUNC("unsupported instruction in callable function",
+ sec, insn->offset);
+ return 1;
+ }
+ return 0;
+
case INSN_STACK:
if (update_insn_state(insn, &state))
return -1;
@@ -1510,7 +1652,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
if (insn->dead_end)
return 0;
- insn = next_insn_same_sec(file, insn);
+ insn = next_insn;
if (!insn) {
WARN("%s: unexpected end of section", sec->name);
return 1;
@@ -1520,6 +1662,27 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
return 0;
}
+static int validate_unwind_hints(struct objtool_file *file)
+{
+ struct instruction *insn;
+ int ret, warnings = 0;
+ struct insn_state state;
+
+ if (!file->hints)
+ return 0;
+
+ clear_insn_state(&state);
+
+ for_each_insn(file, insn) {
+ if (insn->hint && !insn->visited) {
+ ret = validate_branch(file, insn, state);
+ warnings += ret;
+ }
+ }
+
+ return warnings;
+}
+
static bool is_kasan_insn(struct instruction *insn)
{
return (insn->type == INSN_CALL &&
@@ -1665,8 +1828,9 @@ int check(const char *_objname, bool _nofp, bool orc)
hash_init(file.insn_hash);
file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
file.rodata = find_section_by_name(file.elf, ".rodata");
- file.ignore_unreachables = false;
file.c_file = find_section_by_name(file.elf, ".comment");
+ file.ignore_unreachables = false;
+ file.hints = false;
arch_initial_func_cfi_state(&initial_func_cfi);
@@ -1683,6 +1847,11 @@ int check(const char *_objname, bool _nofp, bool orc)
goto out;
warnings += ret;
+ ret = validate_unwind_hints(&file);
+ if (ret < 0)
+ goto out;
+ warnings += ret;
+
if (!warnings) {
ret = validate_reachable_instructions(&file);
if (ret < 0)
--
2.14.2

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,171 @@
From 73cf1dd35105d9cf270caf4a72b400b0a3ab4bb2 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 25 Jul 2017 08:54:24 -0500
Subject: [PATCH 036/231] x86/kconfig: Consolidate unwinders into multiple
choice selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
There are three mutually exclusive unwinders. Make that more obvious by
combining them into a multiple-choice selection:
CONFIG_FRAME_POINTER_UNWINDER
CONFIG_ORC_UNWINDER
CONFIG_GUESS_UNWINDER (if CONFIG_EXPERT=y)
Frame pointers are still the default (for now).
The old CONFIG_FRAME_POINTER option is still used in some
arch-independent places, so keep it around, but make it
invisible to the user on x86 - it's now selected by
CONFIG_FRAME_POINTER_UNWINDER=y.
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/20170725135424.zukjmgpz3plf5pmt@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 81d387190039c14edac8de2b3ec789beb899afd9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 26ddacc1e6333555e4a6bd63c4c935b323509f92)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/unwind.h | 4 ++--
arch/x86/Kconfig | 3 +--
arch/x86/Kconfig.debug | 45 +++++++++++++++++++++++++++++++++++++------
arch/x86/configs/tiny.config | 2 ++
4 files changed, 44 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 25b8d31a007d..e9f793e2df7a 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -16,7 +16,7 @@ struct unwind_state {
bool signal, full_regs;
unsigned long sp, bp, ip;
struct pt_regs *regs;
-#elif defined(CONFIG_FRAME_POINTER)
+#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
struct pt_regs *regs;
@@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
__unwind_start(state, task, regs, first_frame);
}
-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER)
+#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d6f45f6d1054..3a0b8cb57caf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -73,7 +73,6 @@ config X86
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
- select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_THP_SWAP if X86_64
select BUILDTIME_EXTABLE_SORT
@@ -169,7 +168,7 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
select HAVE_STACK_VALIDATION if X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d5bca2ec8a74..c441b5d65ec8 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -356,6 +356,29 @@ config PUNIT_ATOM_DEBUG
The current power state can be read from
/sys/kernel/debug/punit_atom/dev_power_state
+choice
+ prompt "Choose kernel unwinder"
+ default FRAME_POINTER_UNWINDER
+ ---help---
+ This determines which method will be used for unwinding kernel stack
+ traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+ livepatch, lockdep, and more.
+
+config FRAME_POINTER_UNWINDER
+ bool "Frame pointer unwinder"
+ select FRAME_POINTER
+ ---help---
+ This option enables the frame pointer unwinder for unwinding kernel
+ stack traces.
+
+ The unwinder itself is fast and it uses less RAM than the ORC
+ unwinder, but the kernel text size will grow by ~3% and the kernel's
+ overall performance will degrade by roughly 5-10%.
+
+ This option is recommended if you want to use the livepatch
+ consistency model, as this is currently the only way to get a
+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
config ORC_UNWINDER
bool "ORC unwinder"
depends on X86_64
@@ -373,12 +396,22 @@ config ORC_UNWINDER
Enabling this option will increase the kernel's runtime memory usage
by roughly 2-4MB, depending on your kernel config.
-config FRAME_POINTER_UNWINDER
- def_bool y
- depends on !ORC_UNWINDER && FRAME_POINTER
-
config GUESS_UNWINDER
- def_bool y
- depends on !ORC_UNWINDER && !FRAME_POINTER
+ bool "Guess unwinder"
+ depends on EXPERT
+ ---help---
+ This option enables the "guess" unwinder for unwinding kernel stack
+ traces. It scans the stack and reports every kernel text address it
+ finds. Some of the addresses it reports may be incorrect.
+
+ While this option often produces false positives, it can still be
+ useful in many cases. Unlike the other unwinders, it has no runtime
+ overhead.
+
+endchoice
+
+config FRAME_POINTER
+ depends on !ORC_UNWINDER && !GUESS_UNWINDER
+ bool
endmenu
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 4b429df40d7a..550cd5012b73 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,3 +1,5 @@
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
+CONFIG_GUESS_UNWINDER=y
+# CONFIG_FRAME_POINTER_UNWINDER is not set
--
2.14.2

View File

@ -0,0 +1,51 @@
From 2c9eb7028c0714c3379b58a59c60f7b5b7a5adb0 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 3 Oct 2017 20:10:36 -0500
Subject: [PATCH 037/231] objtool: Upgrade libelf-devel warning to error for
CONFIG_ORC_UNWINDER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
With CONFIG_ORC_UNWINDER, if the user doesn't have libelf-devel
installed, and they don't see the make warning, their ORC unwinder will
be silently broken. Upgrade the warning to an error.
Reported-and-tested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d9dfc39fb8240998820f9efb233d283a1ee96084.1507079417.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 3dd40cb320fee7c23b574ab821ce140ccd1281c9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit c413466a72ca533ec126ebc0c5bb579ae0c96b1d)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
Makefile | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index 8e14a926fc94..490ce18685ea 100644
--- a/Makefile
+++ b/Makefile
@@ -965,7 +965,11 @@ ifdef CONFIG_STACK_VALIDATION
ifeq ($(has_libelf),1)
objtool_target := tools/objtool FORCE
else
- $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+ ifdef CONFIG_ORC_UNWINDER
+ $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+ else
+ $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+ endif
SKIP_STACK_VALIDATION := 1
export SKIP_STACK_VALIDATION
endif
--
2.14.2

View File

@ -0,0 +1,82 @@
From 34aa933a9bce5fb9c88e6ed98b268cbf058e51eb Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 26 Jul 2017 07:16:30 -0700
Subject: [PATCH 038/231] x86/ldt/64: Refresh DS and ES when modify_ldt changes
an entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
On x86_32, modify_ldt() implicitly refreshes the cached DS and ES
segments because they are refreshed on return to usermode.
On x86_64, they're not refreshed on return to usermode. To improve
determinism and match x86_32's behavior, refresh them when we update
the LDT.
This avoids a situation in which the DS points to a descriptor that is
changed but the old cached segment persists until the next reschedule.
If this happens, then the user-visible state will change
nondeterministically some time after modify_ldt() returns, which is
unfortunate.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Chang Seok <chang.seok.bae@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit a632375764aa25c97b78beb56c71b0ba59d1cf83)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 295cb0b06150958ec84ee4b8844ef7e389e22c4e)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/ldt.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a870910c8565..f0e64db18ac8 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -21,6 +21,25 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
+#endif
+}
+
/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *__mm)
{
@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm)
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+
+ refresh_ldt_segments();
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
--
2.14.2

View File

@ -0,0 +1,182 @@
From 09fedd9befc7affbfa9490ef3993d60c7d582a6f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:15 -0700
Subject: [PATCH 039/231] x86/mm: Give each mm TLB flush generation a unique ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This adds two new variables to mmu_context_t: ctx_id and tlb_gen.
ctx_id uniquely identifies the mm_struct and will never be reused.
For a given mm_struct (and hence ctx_id), tlb_gen is a monotonic
count of the number of times that a TLB flush has been requested.
The pair (ctx_id, tlb_gen) can be used as an identifier for TLB
flush actions and will be used in subsequent patches to reliably
determine whether all needed TLB flushes have occurred on a given
CPU.
This patch is split out for ease of review. By itself, it has no
real effect other than creating and updating the new variables.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/413a91c24dab3ed0caa5f4e4d017d87b0857f920.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit f39681ed0f48498b80455095376f11535feea332)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit e566a0dfbb2a5f7ea90dd66ce384740372739e14)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/mmu.h | 25 +++++++++++++++++++++++--
arch/x86/include/asm/mmu_context.h | 6 ++++++
arch/x86/include/asm/tlbflush.h | 18 ++++++++++++++++++
arch/x86/mm/tlb.c | 6 ++++--
4 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 79b647a7ebd0..bb8c597c2248 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,12 +3,28 @@
#include <linux/spinlock.h>
#include <linux/mutex.h>
+#include <linux/atomic.h>
/*
- * The x86 doesn't have a mmu context, but
- * we put the segment information here.
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
*/
typedef struct {
+ /*
+ * ctx_id uniquely identifies this mm_struct. A ctx_id will never
+ * be reused, and zero is not a valid ctx_id.
+ */
+ u64 ctx_id;
+
+ /*
+ * Any code that needs to do any sort of TLB flushing for this
+ * mm will first make its changes to the page tables, then
+ * increment tlb_gen, then flush. This lets the low-level
+ * flushing code keep track of what needs flushing.
+ *
+ * This is not used on Xen PV.
+ */
+ atomic64_t tlb_gen;
+
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
#endif
@@ -37,6 +53,11 @@ typedef struct {
#endif
} mm_context_t;
+#define INIT_MM_CONTEXT(mm) \
+ .context = { \
+ .ctx_id = 1, \
+ }
+
void leave_mm(int cpu);
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 7a234be7e298..6c05679c715b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,6 +12,9 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
+
+extern atomic64_t last_mm_ctx_id;
+
#ifndef CONFIG_PARAVIRT
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
@@ -132,6 +135,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+ atomic64_set(&mm->context.tlb_gen, 0);
+
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 2b3d68093235..f1f2e73b7b77 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void)
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
}
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+ u64 new_tlb_gen;
+
+ /*
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
+ */
+ smp_mb__before_atomic();
+ new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
+ smp_mb__after_atomic();
+
+ return new_tlb_gen;
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -270,6 +287,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
+ inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 014d07a80053..14f4f8f66aa8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,8 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -250,8 +252,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
cpu = get_cpu();
- /* Synchronize with switch_mm. */
- smp_mb();
+ /* This is also a barrier that synchronizes with switch_mm(). */
+ inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
--
2.14.2

View File

@ -0,0 +1,279 @@
From c1f19d153ad69363ac1bc62bbd9be05ca48c526c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:16 -0700
Subject: [PATCH 040/231] x86/mm: Track the TLB's tlb_gen and update the
flushing algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
There are two kernel features that would benefit from tracking
how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
it up to date in real time:
- Lazy mm switching currently works by switching to init_mm when
it would otherwise flush. This is wasteful: there isn't fundamentally
any need to update CR3 at all when going lazy or when returning from
lazy mode, nor is there any need to receive flush IPIs at all. Instead,
we should just stop trying to keep the TLB coherent when we go lazy and,
when unlazying, check whether we missed any flushes.
- PCID will let us keep recent user contexts alive in the TLB. If we
start doing this, we need a way to decide whether those contexts are
up to date.
On some paravirt systems, remote TLBs can be flushed without IPIs.
This won't update the target CPUs' tlb_gens, which may cause
unnecessary local flushes later on. We can address this if it becomes
a problem by carefully updating the target CPU's tlb_gen directly.
By itself, this patch is a very minor optimization that avoids
unnecessary flushes when multiple TLB flushes targetting the same CPU
race. The complexity in this patch would not be worth it on its own,
but it will enable improved lazy TLB tracking and PCID.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit b0579ade7cd82391360e959cc844e50a160e8a96)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d34881c25f3c70228ed792fd62881185a25c4422)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/tlbflush.h | 43 +++++++++++++++--
arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++++++++++---
2 files changed, 135 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index f1f2e73b7b77..3a167c214560 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+struct tlb_context {
+ u64 ctx_id;
+ u64 tlb_gen;
+};
+
struct tlb_state {
/*
* cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -97,6 +102,21 @@ struct tlb_state {
* disabling interrupts when modifying either one.
*/
unsigned long cr4;
+
+ /*
+ * This is a list of all contexts that might exist in the TLB.
+ * Since we don't yet use PCID, there is only one context.
+ *
+ * For each context, ctx_id indicates which mm the TLB's user
+ * entries came from. As an invariant, the TLB will never
+ * contain entries that are out-of-date as when that mm reached
+ * the tlb_gen in the list.
+ *
+ * To be clear, this means that it's legal for the TLB code to
+ * flush the TLB without updating tlb_gen. This can happen
+ * (for now, at least) due to paravirt remote flushes.
+ */
+ struct tlb_context ctxs[1];
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
@@ -256,9 +276,26 @@ static inline void __flush_tlb_one(unsigned long addr)
* and page-granular flushes are available only on i486 and up.
*/
struct flush_tlb_info {
- struct mm_struct *mm;
- unsigned long start;
- unsigned long end;
+ /*
+ * We support several kinds of flushes.
+ *
+ * - Fully flush a single mm. .mm will be set, .end will be
+ * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+ * which the IPI sender is trying to catch us up.
+ *
+ * - Partially flush a single mm. .mm will be set, .start and
+ * .end will indicate the range, and .new_tlb_gen will be set
+ * such that the changes between generation .new_tlb_gen-1 and
+ * .new_tlb_gen are entirely contained in the indicated range.
+ *
+ * - Fully flush all mms whose tlb_gens have been updated. .mm
+ * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+ * will be zero.
+ */
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+ u64 new_tlb_gen;
};
#define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 14f4f8f66aa8..4e5a5ddb9e4d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
switch_ldt(real_prev, next);
}
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen. We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
bool local, enum tlb_flush_reason reason)
{
+ /*
+ * We have three different tlb_gen values in here. They are:
+ *
+ * - mm_tlb_gen: the latest generation.
+ * - local_tlb_gen: the generation that this CPU has already caught
+ * up to.
+ * - f->new_tlb_gen: the generation that the requester of the flush
+ * wants us to catch up to.
+ */
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+ loaded_mm->context.ctx_id);
+
if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+ /*
+ * leave_mm() is adequate to handle any type of flush, and
+ * we would prefer not to receive further IPIs. leave_mm()
+ * clears this CPU's bit in mm_cpumask().
+ */
leave_mm(smp_processor_id());
return;
}
- if (f->end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- if (local)
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- trace_tlb_flush(reason, TLB_FLUSH_ALL);
- } else {
+ if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+ /*
+ * There's nothing to do: we're already up to date. This can
+ * happen if two concurrent flushes happen -- the first flush to
+ * be handled can catch us all the way up, leaving no work for
+ * the second flush.
+ */
+ return;
+ }
+
+ WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+ WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+ /*
+ * If we get to this point, we know that our TLB is out of date.
+ * This does not strictly imply that we need to flush (it's
+ * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+ * going to need to flush in the very near future, so we might
+ * as well get it over with.
+ *
+ * The only question is whether to do a full or partial flush.
+ *
+ * We do a partial flush if requested and two extra conditions
+ * are met:
+ *
+ * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
+ * we've always done all needed flushes to catch up to
+ * local_tlb_gen. If, for example, local_tlb_gen == 2 and
+ * f->new_tlb_gen == 3, then we know that the flush needed to bring
+ * us up to date for tlb_gen 3 is the partial flush we're
+ * processing.
+ *
+ * As an example of why this check is needed, suppose that there
+ * are two concurrent flushes. The first is a full flush that
+ * changes context.tlb_gen from 1 to 2. The second is a partial
+ * flush that changes context.tlb_gen from 2 to 3. If they get
+ * processed on this CPU in reverse order, we'll see
+ * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+ * If we were to use __flush_tlb_single() and set local_tlb_gen to
+ * 3, we'd be break the invariant: we'd update local_tlb_gen above
+ * 1 without the full flush that's needed for tlb_gen 2.
+ *
+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
+ * Partial TLB flushes are not all that much cheaper than full TLB
+ * flushes, so it seems unlikely that it would be a performance win
+ * to do a partial flush if that won't bring our TLB fully up to
+ * date. By doing a full flush instead, we can increase
+ * local_tlb_gen all the way to mm_tlb_gen and we can probably
+ * avoid another flush in the very near future.
+ */
+ if (f->end != TLB_FLUSH_ALL &&
+ f->new_tlb_gen == local_tlb_gen + 1 &&
+ f->new_tlb_gen == mm_tlb_gen) {
+ /* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
addr = f->start;
while (addr < f->end) {
__flush_tlb_single(addr);
@@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
+ } else {
+ /* Full flush. */
+ local_flush_tlb();
+ if (local)
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
+
+ /* Both paths above update our state to mm_tlb_gen. */
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
}
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
cpu = get_cpu();
/* This is also a barrier that synchronizes with switch_mm(). */
- inc_mm_tlb_gen(mm);
+ info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
--
2.14.2

View File

@ -0,0 +1,454 @@
From caa3549fe709971498eaf080c1710ef627a0df5a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:17 -0700
Subject: [PATCH 041/231] x86/mm: Rework lazy TLB mode and TLB freshness
tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 94b1b03b519b81c494900cb112aa00ed205cc2d9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit b381b7ae452f2bc6384507a897247be7c93a71cc)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/mmu_context.h | 6 +-
arch/x86/include/asm/tlbflush.h | 4 -
arch/x86/mm/init.c | 1 -
arch/x86/mm/tlb.c | 197 ++++++++++++++++++++++---------------
arch/x86/xen/mmu_pv.c | 5 +-
5 files changed, 124 insertions(+), 89 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 6c05679c715b..d6b055b328f2 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+ cpumask_clear_cpu(cpu, mm_cpumask(mm));
}
static inline int init_new_context(struct task_struct *tsk,
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 3a167c214560..6397275008db 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -95,7 +95,6 @@ struct tlb_state {
* mode even if we've already switched back to swapper_pg_dir.
*/
struct mm_struct *loaded_mm;
- int state;
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -318,9 +317,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
-#define TLBSTATE_OK 1
-#define TLBSTATE_LAZY 2
-
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index df2624b091a7..c86dc071bb10 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -849,7 +849,6 @@ void __init zone_sizes_init(void)
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
- .state = 0,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4e5a5ddb9e4d..0982c997d36f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -45,8 +45,8 @@ void leave_mm(int cpu)
if (loaded_mm == &init_mm)
return;
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- BUG();
+ /* Warn if we're not lazy. */
+ WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
switch_mm(NULL, &init_mm, NULL);
}
@@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- unsigned cpu = smp_processor_id();
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ unsigned cpu = smp_processor_id();
+ u64 next_tlb_gen;
/*
- * NB: The scheduler will call us with prev == next when
- * switching from lazy TLB mode to normal mode if active_mm
- * isn't changing. When this happens, there is no guarantee
- * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+ * NB: The scheduler will call us with prev == next when switching
+ * from lazy TLB mode to normal mode if active_mm isn't changing.
+ * When this happens, we don't assume that CR3 (and hence
+ * cpu_tlbstate.loaded_mm) matches next.
*
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
*/
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ /* We don't want flush_tlb_func_* to run concurrently with us. */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(!irqs_disabled());
+
+ /*
+ * Verify that CR3 is what we think it is. This will catch
+ * hypothetical buggy code that directly switches to swapper_pg_dir
+ * without going through leave_mm() / switch_mm_irqs_off().
+ */
+ VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
if (real_prev == next) {
- /*
- * There's nothing to do: we always keep the per-mm control
- * regs in sync with cpu_tlbstate.loaded_mm. Just
- * sanity-check mm_cpumask.
- */
- if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
- cpumask_set_cpu(cpu, mm_cpumask(next));
- return;
- }
+ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+ next->context.ctx_id);
+
+ if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * There's nothing to do: we weren't lazy, and we
+ * aren't changing our mm. We don't need to flush
+ * anything, nor do we need to update CR3, CR4, or
+ * LDTR.
+ */
+ return;
+ }
+
+ /* Resume remote flushes and then read tlb_gen. */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
+ /*
+ * Ideally, we'd have a flush_tlb() variant that
+ * takes the known CR3 value as input. This would
+ * be faster on Xen PV and on hypothetical CPUs
+ * on which INVPCID is fast.
+ */
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+ next_tlb_gen);
+ write_cr3(__pa(next->pgd));
+
+ /*
+ * This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we have to call the tracepoint
+ * specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+ TLB_FLUSH_ALL);
+ }
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
+ * We just exited lazy mode, which means that CR4 and/or LDTR
+ * may be stale. (Changes to the required CR4 and LDTR states
+ * are not reflected in tlb_gen.)
*/
- unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
- pgd_t *pgd = next->pgd + stack_pgd_index;
-
- if (unlikely(pgd_none(*pgd)))
- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
- }
+ } else {
+ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
+ next->context.ctx_id);
+
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ /*
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
+ */
+ unsigned int index = pgd_index(current_stack_pointer());
+ pgd_t *pgd = next->pgd + index;
+
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[index]);
+ }
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
+ /* Stop remote flushes for the previous mm */
+ if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
- WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
- /*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- */
- load_cr3(next->pgd);
+ /*
+ * Start remote flushes and then read tlb_gen.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- /*
- * This gets called via leave_mm() in the idle path where RCU
- * functions differently. Tracing normally uses RCU, so we have to
- * call the tracepoint specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ write_cr3(__pa(next->pgd));
- /* Stop flush ipis for the previous mm */
- WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- real_prev != &init_mm);
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ /*
+ * This gets called via leave_mm() in the idle path where RCU
+ * functions differently. Tracing normally uses RCU, so we
+ * have to call the tracepoint specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+ TLB_FLUSH_ALL);
+ }
- /* Load per-mm CR4 and LDTR state */
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
@@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
loaded_mm->context.ctx_id);
- if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
/*
- * leave_mm() is adequate to handle any type of flush, and
- * we would prefer not to receive further IPIs. leave_mm()
- * clears this CPU's bit in mm_cpumask().
+ * We're in lazy mode -- don't flush. We can get here on
+ * remote flushes due to races and on local flushes if a
+ * kernel thread coincidentally flushes the mm it's lazily
+ * still using.
*/
- leave_mm(smp_processor_id());
return;
}
@@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
+ trace_tlb_flush(reason, 0);
return;
}
@@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(info->end - info->start) >> PAGE_SHIFT);
if (is_uv_system()) {
+ /*
+ * This whole special case is confused. UV has a "Broadcast
+ * Assist Unit", which seems to be a fancy way to send IPIs.
+ * Back when x86 used an explicit TLB flush IPI, UV was
+ * optimized to use its own mechanism. These days, x86 uses
+ * smp_call_function_many(), but UV still uses a manual IPI,
+ * and that IPI's action is out of date -- it does a manual
+ * flush instead of calling flush_tlb_func_remote(). This
+ * means that the percpu tlb_gen variables won't be updated
+ * and we'll do pointless flushes on future context switches.
+ *
+ * Rather than hooking native_flush_tlb_others() here, I think
+ * that UV should be updated so that smp_call_function_many(),
+ * etc, are optimal on UV.
+ */
unsigned int cpu;
cpu = smp_processor_id();
@@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), &info);
+
put_cpu();
}
@@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
- leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
@@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&batch->cpumask, &info);
+
cpumask_clear(&batch->cpumask);
put_cpu();
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 5f61b7e2e6b2..ba76f3ce997f 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
/* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
- && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+ if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue;
smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
}
return;
}
- cpumask_copy(mask, mm_cpumask(mm));
/*
* It's possible that a vcpu may have a stale reference to our
@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
* look at its actual current cr3 value, and force it to flush
* if needed.
*/
+ cpumask_clear(mask);
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
cpumask_set_cpu(cpu, mask);
--
2.14.2

View File

@ -0,0 +1,340 @@
From e3c7bff633fc1210c6b19dd3ebcafb9f6716d586 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 24 Jul 2017 21:41:38 -0700
Subject: [PATCH 042/231] x86/mm: Implement PCID based optimization: try to
preserve old TLB entries using PCID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(backported from commit 10af6235e0d327d42e1bad974385197817923dc1)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d833a976288cdcf7fb1dabb48ebf614ebf6a311c)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/mmu_context.h | 3 ++
arch/x86/include/asm/processor-flags.h | 2 +
arch/x86/include/asm/tlbflush.h | 18 +++++++-
arch/x86/mm/init.c | 1 +
arch/x86/mm/tlb.c | 84 +++++++++++++++++++++++++---------
5 files changed, 85 insertions(+), 23 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d6b055b328f2..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -298,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..791b60199aa4 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -35,6 +35,7 @@
/* Mask off the address space ID bits. */
#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
#define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH (1UL << 63)
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -42,6 +43,7 @@
*/
#define CR3_ADDR_MASK 0xFFFFFFFFull
#define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6397275008db..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -95,6 +101,8 @@ struct tlb_state {
* mode even if we've already switched back to swapper_pg_dir.
*/
struct mm_struct *loaded_mm;
+ u16 loaded_mm_asid;
+ u16 next_asid;
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -104,7 +112,8 @@ struct tlb_state {
/*
* This is a list of all contexts that might exist in the TLB.
- * Since we don't yet use PCID, there is only one context.
+ * There is one per ASID that we use, and the ASID (what the
+ * CPU calls PCID) is the index into ctxts.
*
* For each context, ctx_id indicates which mm the TLB's user
* entries came from. As an invariant, the TLB will never
@@ -114,8 +123,13 @@ struct tlb_state {
* To be clear, this means that it's legal for the TLB code to
* flush the TLB without updating tlb_gen. This can happen
* (for now, at least) due to paravirt remote flushes.
+ *
+ * NB: context 0 is a bit special, since it's also used by
+ * various bits of init code. This is fine -- code that
+ * isn't aware of PCID will end up harmlessly flushing
+ * context 0.
*/
- struct tlb_context ctxs[1];
+ struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c86dc071bb10..af5c1ed21d43 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -849,6 +849,7 @@ void __init zone_sizes_init(void)
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
+ .next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0982c997d36f..57943b4d8f2e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,40 @@
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+ u16 *new_asid, bool *need_flush)
+{
+ u16 asid;
+
+ if (!static_cpu_has(X86_FEATURE_PCID)) {
+ *new_asid = 0;
+ *need_flush = true;
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+ next->context.ctx_id)
+ continue;
+
+ *new_asid = asid;
+ *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+ next_tlb_gen);
+ return;
+ }
+
+ /*
+ * We don't currently own an ASID slot on this CPU.
+ * Allocate a slot.
+ */
+ *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+ if (*new_asid >= TLB_NR_DYN_ASIDS) {
+ *new_asid = 0;
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ }
+ *need_flush = true;
+}
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -66,6 +100,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
@@ -85,12 +120,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/*
* Verify that CR3 is what we think it is. This will catch
* hypothetical buggy code that directly switches to swapper_pg_dir
- * without going through leave_mm() / switch_mm_irqs_off().
+ * without going through leave_mm() / switch_mm_irqs_off() or that
+ * does something like write_cr3(read_cr3_pa()).
*/
- VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
+ VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
if (real_prev == next) {
- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
@@ -107,16 +143,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+ next_tlb_gen) {
/*
* Ideally, we'd have a flush_tlb() variant that
* takes the known CR3 value as input. This would
* be faster on Xen PV and on hypothetical CPUs
* on which INVPCID is fast.
*/
- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+ this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
next_tlb_gen);
- write_cr3(__pa(next->pgd));
+ write_cr3(__pa(next->pgd) | prev_asid);
/*
* This gets called via leave_mm() in the idle path
@@ -134,8 +171,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* are not reflected in tlb_gen.)
*/
} else {
- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
- next->context.ctx_id);
+ u16 new_asid;
+ bool need_flush;
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -162,18 +199,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- write_cr3(__pa(next->pgd));
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
- /*
- * This gets called via leave_mm() in the idle path where RCU
- * functions differently. Tracing normally uses RCU, so we
- * have to call the tracepoint specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ write_cr3(__pa(next->pgd) | new_asid);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ }
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}
load_mm_cr4(next);
@@ -200,13 +241,14 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* wants us to catch up to.
*/
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
@@ -294,7 +336,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
}
/* Both paths above update our state to mm_tlb_gen. */
- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
+ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
--
2.14.2

View File

@ -0,0 +1,176 @@
From ddb5e7b381d37d0f8bca61f0b761ae5c3a2f5ee0 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 17 Sep 2017 09:03:48 -0700
Subject: [PATCH 043/231] x86/mm: Factor out CR3-building code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Current, the code that assembles a value to load into CR3 is
open-coded everywhere. Factor it out into helpers build_cr3() and
build_cr3_noflush().
This makes one semantic change: __get_current_cr3_fast() was wrong
on SME systems. No one noticed because the only caller is in the
VMX code, and there are no CPUs with both SME and VMX.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <Thomas.Lendacky@amd.com>
Link: http://lkml.kernel.org/r/ce350cf11e93e2842d14d0b95b0199c7d881f527.1505663533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(backported from commit 47061a24e2ee5bd8a40d473d47a5bd823fa0081f)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 72be211bac7be521f128d419d63cae38ba60ace8)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/mmu_context.h | 15 ++++++---
arch/x86/mm/tlb.c | 68 +++++++++++++++++++++++++++++++++++---
2 files changed, 75 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 7ae318c340d9..a999ba6b721f 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -286,6 +286,15 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
+static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
+{
+ return __sme_pa(mm->pgd) | asid;
+}
+
+static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
+{
+ return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH;
+}
/*
* This can be used from process context to figure out what the value of
@@ -296,10 +305,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
*/
static inline unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
-
- if (static_cpu_has(X86_FEATURE_PCID))
- cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 57943b4d8f2e..440400316c8a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -123,7 +123,23 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* without going through leave_mm() / switch_mm_irqs_off() or that
* does something like write_cr3(read_cr3_pa()).
*/
- VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
+#ifdef CONFIG_DEBUG_VM
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+ /*
+ * If we were to BUG here, we'd be very likely to kill
+ * the system so hard that we don't see the call trace.
+ * Try to recover instead by ignoring the error and doing
+ * a global flush to minimize the chance of corruption.
+ *
+ * (This is far from being a fully correct recovery.
+ * Architecturally, the CPU could prefetch something
+ * back into an incorrect ASID slot and leave it there
+ * to cause trouble down the road. It's better than
+ * nothing, though.)
+ */
+ __flush_tlb_all();
+ }
+#endif
if (real_prev == next) {
VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
@@ -153,7 +169,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*/
this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
next_tlb_gen);
- write_cr3(__pa(next->pgd) | prev_asid);
+ write_cr3(build_cr3(next, prev_asid));
/*
* This gets called via leave_mm() in the idle path
@@ -204,12 +220,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(__pa(next->pgd) | new_asid);
+ write_cr3(build_cr3(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+ write_cr3(build_cr3_noflush(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -221,6 +237,50 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
switch_ldt(real_prev, next);
}
+/*
+ * Call this when reinitializing a CPU. It fixes the following potential
+ * problems:
+ *
+ * - The ASID changed from what cpu_tlbstate thinks it is (most likely
+ * because the CPU was taken down and came back up with CR3's PCID
+ * bits clear. CPU hotplug can do this.
+ *
+ * - The TLB contains junk in slots corresponding to inactive ASIDs.
+ *
+ * - The CPU went so far out to lunch that it may have missed a TLB
+ * flush.
+ */
+void initialize_tlbstate_and_flush(void)
+{
+ int i;
+ struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long cr3 = __read_cr3();
+
+ /* Assert that CR3 already references the right mm. */
+ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
+
+ /*
+ * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
+ * doesn't work like other CR4 bits because it can only be set from
+ * long mode.)
+ */
+ WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
+ !(cr4_read_shadow() & X86_CR4_PCIDE));
+
+ /* Force ASID 0 and force a TLB flush. */
+ write_cr3(build_cr3(mm, 0));
+
+ /* Reinitialize tlbstate. */
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+
+ for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
+ this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
+}
+
/*
* flush_tlb_func_common()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
--
2.14.2

View File

@ -0,0 +1,86 @@
From b5143e55b3bf018b3ad2598e677ceb5e155eba6f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 17 Sep 2017 09:03:49 -0700
Subject: [PATCH 044/231] x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware
code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Putting the logical ASID into CR3's PCID bits directly means that we
have two cases to consider separately: ASID == 0 and ASID != 0.
This means that bugs that only hit in one of these cases trigger
nondeterministically.
There were some bugs like this in the past, and I think there's
still one in current kernels. In particular, we have a number of
ASID-unware code paths that save CR3, write some special value, and
then restore CR3. This includes suspend/resume, hibernate, kexec,
EFI, and maybe other things I've missed. This is currently
dangerous: if ASID != 0, then this code sequence will leave garbage
in the TLB tagged for ASID 0. We could potentially see corruption
when switching back to ASID 0. In principle, an
initialize_tlbstate_and_flush() call after these sequences would
solve the problem, but EFI, at least, does not call this. (And it
probably shouldn't -- initialize_tlbstate_and_flush() is rather
expensive.)
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/cdc14bbe5d3c3ef2a562be09a6368ffe9bd947a6.1505663533.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 52a2af400c1075219b3f0ce5c96fc961da44018a)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 15e474753e66e44da1365049f465427053a453ba)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/mmu_context.h | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index a999ba6b721f..c120b5db178a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -286,14 +286,31 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
+/*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
+ * bits. This serves two purposes. It prevents a nasty situation in
+ * which PCID-unaware code saves CR3, loads some other value (with PCID
+ * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
+ * the saved ASID was nonzero. It also means that any bugs involving
+ * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
+ * deterministically.
+ */
+
static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
{
- return __sme_pa(mm->pgd) | asid;
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ VM_WARN_ON_ONCE(asid > 4094);
+ return __sme_pa(mm->pgd) | (asid + 1);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(mm->pgd);
+ }
}
static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
{
- return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH;
+ VM_WARN_ON_ONCE(asid > 4094);
+ return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
}
/*
--
2.14.2

View File

@ -0,0 +1,401 @@
From d1ffadc67e2eee2d5f8626dca6646e70e3aa9d76 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 9 Oct 2017 09:50:49 -0700
Subject: [PATCH 045/231] x86/mm: Flush more aggressively in lazy TLB mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(backported from commit b956575bed91ecfb136a8300742ecbbf451471ab)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit a4bb9409c548ece51ec246fc5113a32b8d130142)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/mmu_context.h | 8 +-
arch/x86/include/asm/tlbflush.h | 24 ++++++
arch/x86/mm/tlb.c | 160 +++++++++++++++++++++++++------------
3 files changed, 136 insertions(+), 56 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index c120b5db178a..3c856a15b98e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
DEBUG_LOCKS_WARN_ON(preemptible());
}
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
- int cpu = smp_processor_id();
-
- if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
- cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d23e61dc0640..6533da3036c9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+/*
+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
+ * to init_mm when we switch to a kernel thread (e.g. the idle thread). If
+ * it's false, then we immediately switch CR3 when entering a kernel thread.
+ */
+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
/*
* 6 because 6 should be plenty and struct tlb_state will fit in
* two cache lines.
@@ -104,6 +111,23 @@ struct tlb_state {
u16 loaded_mm_asid;
u16 next_asid;
+ /*
+ * We can be in one of several states:
+ *
+ * - Actively using an mm. Our CPU's bit will be set in
+ * mm_cpumask(loaded_mm) and is_lazy == false;
+ *
+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+ *
+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit
+ * is set in mm_cpumask(loaded_mm), but is_lazy == true.
+ * We're heuristically guessing that the CR3 load we
+ * skipped more than makes up for the overhead added by
+ * lazy mode.
+ */
+ bool is_lazy;
+
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 440400316c8a..b27aceaf7ed1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,8 @@
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
u16 *new_asid, bool *need_flush)
{
@@ -80,7 +82,7 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
- WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
@@ -140,52 +142,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
+ this_cpu_write(cpu_tlbstate.is_lazy, false);
if (real_prev == next) {
VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
- if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * There's nothing to do: we weren't lazy, and we
- * aren't changing our mm. We don't need to flush
- * anything, nor do we need to update CR3, CR4, or
- * LDTR.
- */
- return;
- }
-
- /* Resume remote flushes and then read tlb_gen. */
- cpumask_set_cpu(cpu, mm_cpumask(next));
- next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
- next_tlb_gen) {
- /*
- * Ideally, we'd have a flush_tlb() variant that
- * takes the known CR3 value as input. This would
- * be faster on Xen PV and on hypothetical CPUs
- * on which INVPCID is fast.
- */
- this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
- next_tlb_gen);
- write_cr3(build_cr3(next, prev_asid));
-
- /*
- * This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we have to call the tracepoint
- * specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
- TLB_FLUSH_ALL);
- }
-
/*
- * We just exited lazy mode, which means that CR4 and/or LDTR
- * may be stale. (Changes to the required CR4 and LDTR states
- * are not reflected in tlb_gen.)
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
*/
+ if (WARN_ON_ONCE(real_prev != &init_mm &&
+ !cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ return;
} else {
u16 new_asid;
bool need_flush;
@@ -204,10 +178,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/* Stop remote flushes for the previous mm */
- if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
- VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
/*
* Start remote flushes and then read tlb_gen.
@@ -237,6 +210,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
switch_ldt(real_prev, next);
}
+/*
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+ return;
+
+ if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+ /*
+ * There's a significant optimization that may be possible
+ * here. We have accurate enough TLB flush tracking that we
+ * don't need to maintain coherence of TLB per se when we're
+ * lazy. We do, however, need to maintain coherence of
+ * paging-structure caches. We could, in principle, leave our
+ * old mm loaded and only switch to init_mm when
+ * tlb_remove_page() happens.
+ */
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
+ } else {
+ switch_mm(NULL, &init_mm, NULL);
+ }
+}
+
/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
@@ -308,16 +312,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ if (unlikely(loaded_mm == &init_mm))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+ if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
- * We're in lazy mode -- don't flush. We can get here on
- * remote flushes due to races and on local flushes if a
- * kernel thread coincidentally flushes the mm it's lazily
- * still using.
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
*/
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
@@ -616,3 +624,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
+
+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[2];
+
+ buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+ buf[1] = '\n';
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t tlblazy_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ bool val;
+
+ if (kstrtobool_from_user(user_buf, count, &val))
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&tlb_use_lazy_mode);
+ else
+ static_branch_disable(&tlb_use_lazy_mode);
+
+ return count;
+}
+
+static const struct file_operations fops_tlblazy = {
+ .read = tlblazy_read_file,
+ .write = tlblazy_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init init_tlb_use_lazy_mode(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * Heuristic: with PCID on, switching to and from
+ * init_mm is reasonably fast, but remote flush IPIs
+ * as expensive as ever, so turn off lazy TLB mode.
+ *
+ * We can't do this in setup_pcid() because static keys
+ * haven't been initialized yet, and it would blow up
+ * badly.
+ */
+ static_branch_disable(&tlb_use_lazy_mode);
+ }
+
+ debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_tlblazy);
+ return 0;
+}
+late_initcall(init_tlb_use_lazy_mode);
--
2.14.2

View File

@ -0,0 +1,101 @@
From 1ef06119163f106fc0de4990e7ae559e9a5a8169 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sat, 4 Nov 2017 04:16:12 -0700
Subject: [PATCH 046/231] Revert "x86/mm: Stop calling leave_mm() in idle code"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This reverts commit 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5.
The reason I removed the leave_mm() calls in question is because the
heuristic wasn't needed after that patch. With the original version
of my PCID series, we never flushed a "lazy cpu" (i.e. a CPU running
kernel thread) due a flush on the loaded mm.
Unfortunately, that caused architectural issues, so now I've
reinstated these flushes on non-PCID systems in:
commit b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode").
That, in turn, gives us a power management and occasionally
performance regression as compared to old kernels: a process that
goes into a deep idle state on a given CPU and gets its mm flushed
due to activity on a different CPU will wake the idle CPU.
Reinstate the old ugly heuristic: if a CPU goes into ACPI C3 or an
intel_idle state that is likely to cause a TLB flush gets its mm
switched to init_mm before going idle.
FWIW, this heuristic is lousy. Whether we should change CR3 before
idle isn't a good hint except insofar as the performance hit is a bit
lower if the TLB is getting flushed by the idle code anyway. What we
really want to know is whether we anticipate being idle long enough
that the mm is likely to be flushed before we wake up. This is more a
matter of the expected latency than the idle state that gets chosen.
This heuristic also completely fails on systems that don't know
whether the TLB will be flushed (e.g. AMD systems?). OTOH it may be a
bit obsolete anyway -- PCID systems don't presently benefit from this
heuristic at all.
We also shouldn't do this callback from innermost bit of the idle code
due to the RCU nastiness it causes. All the information need is
available before rcu_idle_enter() needs to happen.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 43858b4f25cf "x86/mm: Stop calling leave_mm() in idle code"
Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 675357362aeba19688440eb1aaa7991067f73b12)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit b607843145fd0593fcd87e2596d1dc5a1d5f79a5)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/mm/tlb.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index b27aceaf7ed1..ed06f1593390 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -194,12 +194,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next, new_asid));
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
- TLB_FLUSH_ALL);
+
+ /*
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next, new_asid));
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
this_cpu_write(cpu_tlbstate.loaded_mm, next);
--
2.14.2

View File

@ -0,0 +1,86 @@
From f2c1440e8f0b728d48ee8ce295f4dfe495949e1f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 3 Oct 2017 08:51:43 -0500
Subject: [PATCH 047/231] kprobes/x86: Set up frame pointer in kprobe
trampoline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Richard Weinberger saw an unwinder warning when running bcc's opensnoop:
WARNING: kernel stack frame pointer at ffff99ef4076bea0 in opensnoop:2008 has bad value 0000000000000008
unwind stack type:0 next_sp: (null) mask:0x2 graph_idx:0
...
ffff99ef4076be88: ffff99ef4076bea0 (0xffff99ef4076bea0)
ffff99ef4076be90: ffffffffac442721 (optimized_callback +0x81/0x90)
...
A lockdep stack trace was initiated from inside a kprobe handler, when
the unwinder noticed a bad frame pointer on the stack. The bad frame
pointer is related to the fact that the kprobe optprobe trampoline
doesn't save the frame pointer before calling into optimized_callback().
Reported-and-tested-by: Richard Weinberger <richard@sigma-star.at>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/7aef2f8ecd75c2f505ef9b80490412262cf4a44c.1507038547.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit ee213fc72fd67d0988525af501534f4cb924d1e9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 0f7d5518c91335584b16c7bed1c54c10b78ea76a)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/kprobes/common.h | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index db2182d63ed0..3fc0f9a794cb 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -3,6 +3,15 @@
/* Kprobes and Optprobes common header */
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
+ " mov %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
+#endif
+
#ifdef CONFIG_X86_64
#define SAVE_REGS_STRING \
/* Skip cs, ip, orig_ax. */ \
@@ -17,7 +26,7 @@
" pushq %r10\n" \
" pushq %r11\n" \
" pushq %rbx\n" \
- " pushq %rbp\n" \
+ SAVE_RBP_STRING \
" pushq %r12\n" \
" pushq %r13\n" \
" pushq %r14\n" \
@@ -48,7 +57,7 @@
" pushl %es\n" \
" pushl %ds\n" \
" pushl %eax\n" \
- " pushl %ebp\n" \
+ SAVE_RBP_STRING \
" pushl %edi\n" \
" pushl %esi\n" \
" pushl %edx\n" \
--
2.14.2

View File

@ -0,0 +1,140 @@
From 179faefa769caa263bc88b1f7292be7a60df4298 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 Aug 2017 08:47:21 +0200
Subject: [PATCH 048/231] x86/tracing: Introduce a static key for exception
tracing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Switching the IDT just for avoiding tracepoints creates a completely
impenetrable macro/inline/ifdef mess.
There is no point in avoiding tracepoints for most of the traps/exceptions.
For the more expensive tracepoints, like pagefaults, this can be handled with
an explicit static key.
Preparatory patch to remove the tracing IDT.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170828064956.593094539@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 2feb1b316d48004d905278c02a55902cab0be8be)
Signed-off-by: Andy Whitcroft <apw@kathleen.maas>
(cherry picked from commit 15e0ff2a63fdd93f8881e2ebba5c048c5b601e57)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d58a56e851c339d8d9d311dc9b4fad6abbf8bf19)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/trace/common.h | 15 +++++++++++++++
arch/x86/include/asm/trace/exceptions.h | 4 +---
arch/x86/include/asm/trace/irq_vectors.h | 4 +---
arch/x86/kernel/tracepoint.c | 9 ++++++++-
4 files changed, 25 insertions(+), 7 deletions(-)
create mode 100644 arch/x86/include/asm/trace/common.h
diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h
new file mode 100644
index 000000000000..b1eb7b18ee8a
--- /dev/null
+++ b/arch/x86/include/asm/trace/common.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_TRACE_COMMON_H
+#define _ASM_TRACE_COMMON_H
+
+extern int trace_irq_vector_regfunc(void);
+extern void trace_irq_vector_unregfunc(void);
+
+#ifdef CONFIG_TRACING
+DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key);
+#define trace_irqvectors_enabled() \
+ static_branch_unlikely(&trace_irqvectors_key)
+#else
+static inline bool trace_irqvectors_enabled(void) { return false; }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2422b14c50a7..960a5b50ac3b 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -5,9 +5,7 @@
#define _TRACE_PAGE_FAULT_H
#include <linux/tracepoint.h>
-
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+#include <asm/trace/common.h>
DECLARE_EVENT_CLASS(x86_exceptions,
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 32dd6a9e343c..7825b4426e7e 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -5,9 +5,7 @@
#define _TRACE_IRQ_VECTORS_H
#include <linux/tracepoint.h>
-
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+#include <asm/trace/common.h>
DECLARE_EVENT_CLASS(x86_irq_vector,
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 15515132bf0d..dd4aa04bb95c 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -4,9 +4,11 @@
* Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
*
*/
+#include <linux/jump_label.h>
+#include <linux/atomic.h>
+
#include <asm/hw_irq.h>
#include <asm/desc.h>
-#include <linux/atomic.h>
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
@@ -15,6 +17,7 @@ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
/* No need to be aligned, but done to keep all IDTs defined the same way. */
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+DEFINE_STATIC_KEY_FALSE(trace_irqvectors_key);
static int trace_irq_vector_refcount;
static DEFINE_MUTEX(irq_vector_mutex);
@@ -36,6 +39,8 @@ static void switch_idt(void *arg)
int trace_irq_vector_regfunc(void)
{
+ static_branch_inc(&trace_irqvectors_key);
+
mutex_lock(&irq_vector_mutex);
if (!trace_irq_vector_refcount) {
set_trace_idt_ctr(1);
@@ -49,6 +54,8 @@ int trace_irq_vector_regfunc(void)
void trace_irq_vector_unregfunc(void)
{
+ static_branch_dec(&trace_irqvectors_key);
+
mutex_lock(&irq_vector_mutex);
trace_irq_vector_refcount--;
if (!trace_irq_vector_refcount) {
--
2.14.2

View File

@ -0,0 +1,189 @@
From 0b7f51014f5219ece1ca55662495bd036f3bd00d Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:33 -0500
Subject: [PATCH 049/231] x86/boot: Add early cmdline parsing for options with
arguments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Add a cmdline_find_option() function to look for cmdline options that
take arguments. The argument is returned in a supplied buffer and the
argument length (regardless of whether it fits in the supplied buffer)
is returned, with -1 indicating not found.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit e505371dd83963caae1a37ead9524e8d997341be)
Signed-off-by: Andy Whitcroft <apw@kathleen.maas>
(cherry picked from commit 37569cd003aa69a57d5666530436c2d973a57b26)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit b9f03418aa9b8ecbb1c7f32ac2bfe68fd21de4f5)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/cmdline.h | 2 +
arch/x86/lib/cmdline.c | 105 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 107 insertions(+)
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
#define _ASM_X86_CMDLINE_H
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+ char *buffer, int bufsize);
#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 5cc78bf57232..3261abb21ef4 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
return 0; /* Buffer overrun */
}
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+ const char *option, char *buffer, int bufsize)
+{
+ char c;
+ int pos = 0, len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy, /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos++ < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ if (!c)
+ break;
+
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if ((c == '=') && !*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for, prepare to
+ * copy the argument.
+ */
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ /*
+ * Increment len, but don't overrun the
+ * supplied buffer and leave room for the
+ * NULL terminator.
+ */
+ if (++len < bufsize)
+ *bufptr++ = c;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
int cmdline_find_option_bool(const char *cmdline, const char *option)
{
return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+ int bufsize)
+{
+ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+ buffer, bufsize);
+}
--
2.14.2

View File

@ -0,0 +1,192 @@
From 7c5d42f31bf68647dd00ac2fef9057d113e8072d Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Sat, 9 Sep 2017 00:56:03 +0300
Subject: [PATCH 050/231] mm, x86/mm: Fix performance regression in
get_user_pages_fast()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The 0-day test bot found a performance regression that was tracked down to
switching x86 to the generic get_user_pages_fast() implementation:
http://lkml.kernel.org/r/20170710024020.GA26389@yexl-desktop
The regression was caused by the fact that we now use local_irq_save() +
local_irq_restore() in get_user_pages_fast() to disable interrupts.
In x86 implementation local_irq_disable() + local_irq_enable() was used.
The fix is to make get_user_pages_fast() use local_irq_disable(),
leaving local_irq_save() for __get_user_pages_fast() that can be called
with interrupts disabled.
Numbers for pinning a gigabyte of memory, one page a time, 20 repeats:
Before: Average: 14.91 ms, stddev: 0.45 ms
After: Average: 10.76 ms, stddev: 0.18 ms
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Cc: linux-mm@kvack.org
Fixes: e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation")
Link: http://lkml.kernel.org/r/20170908215603.9189-3-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 5b65c4677a57a1d4414212f9995aa0e46a21ff80)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 5241f4b2c68284612e34910305f3234e4a64701b)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
mm/gup.c | 97 ++++++++++++++++++++++++++++++++++++++--------------------------
1 file changed, 58 insertions(+), 39 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..4a789f1c6a27 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1618,6 +1618,47 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}
+static void gup_pgd_range(unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pgd_t *pgdp;
+
+ pgdp = pgd_offset(current->mm, addr);
+ do {
+ pgd_t pgd = READ_ONCE(*pgdp);
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ return;
+ if (unlikely(pgd_huge(pgd))) {
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ pages, nr))
+ return;
+ } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+ if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
+ PGDIR_SHIFT, next, write, pages, nr))
+ return;
+ } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+ return;
+ } while (pgdp++, addr = next, addr != end);
+}
+
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ return end >= start;
+}
+#endif
+
/*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
* the regular GUP. It will only return non-negative values.
@@ -1625,10 +1666,8 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
- unsigned long next, flags;
- pgd_t *pgdp;
+ unsigned long flags;
int nr = 0;
start &= PAGE_MASK;
@@ -1652,45 +1691,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* block IPIs that come from THPs splitting.
*/
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = READ_ONCE(*pgdp);
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
- pages, &nr))
- break;
- } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
- if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, write, pages, &nr))
- break;
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
+ if (gup_fast_permitted(start, nr_pages, write)) {
+ local_irq_save(flags);
+ gup_pgd_range(addr, end, write, pages, &nr);
+ local_irq_restore(flags);
+ }
return nr;
}
-#ifndef gup_fast_permitted
-/*
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
- * we need to fall back to the slow version:
- */
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
-{
- unsigned long len, end;
-
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- return end >= start;
-}
-#endif
-
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -1710,12 +1719,22 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
+ unsigned long addr, len, end;
int nr = 0, ret = 0;
start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ (void __user *)start, len)))
+ return 0;
if (gup_fast_permitted(start, nr_pages, write)) {
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ local_irq_disable();
+ gup_pgd_range(addr, end, write, pages, &nr);
+ local_irq_enable();
ret = nr;
}
--
2.14.2

View File

@ -0,0 +1,149 @@
From 7d7ea8398b5f0cf22b8faec46c95543031c5fe94 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Wed, 6 Sep 2017 17:18:08 +0200
Subject: [PATCH 051/231] x86/asm: Remove unnecessary \n\t in front of CC_SET()
from asm templates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
There is no need for \n\t in front of CC_SET(), as the macro already includes these two.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170906151808.5634-1-ubizjak@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(backported from commit 3c52b5c64326d9dcfee4e10611c53ec1b1b20675)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 1c3f29ec5586e3aecfde2c6f83b8786e1aecd9ac)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/archrandom.h | 8 ++++----
arch/x86/include/asm/bitops.h | 10 +++++-----
arch/x86/include/asm/percpu.h | 2 +-
arch/x86/include/asm/rmwcc.h | 2 +-
4 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index 5b0579abb398..3ac991d81e74 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v)
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
- asm volatile(RDRAND_LONG "\n\t"
+ asm volatile(RDRAND_LONG
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v)
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
do {
- asm volatile(RDRAND_INT "\n\t"
+ asm volatile(RDRAND_INT
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
if (ok)
@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v)
static inline bool rdseed_long(unsigned long *v)
{
bool ok;
- asm volatile(RDSEED_LONG "\n\t"
+ asm volatile(RDSEED_LONG
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;
@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v)
static inline bool rdseed_int(unsigned int *v)
{
bool ok;
- asm volatile(RDSEED_INT "\n\t"
+ asm volatile(RDSEED_INT
CC_SET(c)
: CC_OUT(c) (ok), "=a" (*v));
return ok;
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 854022772c5b..8cee8db6dffb 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -142,7 +142,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
bool negative;
- asm volatile(LOCK_PREFIX "andb %2,%1\n\t"
+ asm volatile(LOCK_PREFIX "andb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), ADDR
: "ir" ((char) ~(1 << nr)) : "memory");
@@ -245,7 +245,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
{
bool oldbit;
- asm("bts %2,%1\n\t"
+ asm("bts %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
@@ -285,7 +285,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
{
bool oldbit;
- asm volatile("btr %2,%1\n\t"
+ asm volatile("btr %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr));
@@ -297,7 +297,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
{
bool oldbit;
- asm volatile("btc %2,%1\n\t"
+ asm volatile("btc %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit), ADDR
: "Ir" (nr) : "memory");
@@ -328,7 +328,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
{
bool oldbit;
- asm volatile("bt %2,%1\n\t"
+ asm volatile("bt %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 9fa03604b2b3..b21a475fd7ed 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -525,7 +525,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
{
bool oldbit;
- asm volatile("bt "__percpu_arg(2)",%1\n\t"
+ asm volatile("bt "__percpu_arg(2)",%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 661dd305694a..dd7ba5aa8dca 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -28,7 +28,7 @@ cc_label: \
#define __GEN_RMWcc(fullop, var, cc, ...) \
do { \
bool c; \
- asm volatile (fullop ";" CC_SET(cc) \
+ asm volatile (fullop CC_SET(cc) \
: "+m" (var), CC_OUT(cc) (c) \
: __VA_ARGS__ : "memory"); \
return c; \
--
2.14.2

View File

@ -0,0 +1,58 @@
From 985d8e62ef5f1b006da5e175858e552c0dbda771 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:30 -0500
Subject: [PATCH 052/231] objtool: Don't report end of section error after an
empty unwind hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
If asm code specifies an UNWIND_HINT_EMPTY hint, don't warn if the
section ends unexpectedly. This can happen with the xen-head.S code
because the hypercall_page is "text" but it's all zeros.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/ddafe199dd8797e40e3c2777373347eba1d65572.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 00d96180dc38ef872ac471c2d3e14b067cbd895d)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 9d22f903bba24f2ac86de8a81dc1788f9957aca8)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
tools/objtool/check.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 368275de5f23..0a86fd0ac082 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1652,11 +1652,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
if (insn->dead_end)
return 0;
- insn = next_insn;
- if (!insn) {
+ if (!next_insn) {
+ if (state.cfa.base == CFI_UNDEFINED)
+ return 0;
WARN("%s: unexpected end of section", sec->name);
return 1;
}
+
+ insn = next_insn;
}
return 0;
--
2.14.2

View File

@ -0,0 +1,54 @@
From 109bbd8c905806e929b67ca0b2eaf57ff88f10c1 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:31 -0500
Subject: [PATCH 053/231] x86/head: Remove confusing comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This comment is actively wrong and confusing. It refers to the
registers' stack offsets after the pt_regs has been constructed on the
stack, but this code is *before* that.
At this point the stack just has the standard iret frame, for which no
comment should be needed.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a3c267b770fc56c9b86df9c11c552848248aace2.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 17270717e80de33a884ad328fea5f407d87f6d6a)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 49187e0108184688304260a75d29b789f36f3a2b)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/head_64.S | 4 ----
1 file changed, 4 deletions(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6225550883df..627c798b2f15 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -254,10 +254,6 @@ bad_address:
__INIT
ENTRY(early_idt_handler_array)
- # 104(%rsp) %rflags
- # 96(%rsp) %cs
- # 88(%rsp) %rip
- # 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
--
2.14.2

View File

@ -0,0 +1,48 @@
From 5ba2d2eca16a62a64166661ea849c4916ae2f44f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:32 -0500
Subject: [PATCH 054/231] x86/head: Remove unused 'bad_address' code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
It's no longer possible for this code to be executed, so remove it.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/32a46fe92d2083700599b36872b26e7dfd7b7965.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit a8b88e84d124bc92c4808e72b8b8c0e0bb538630)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d790ff35a3a49ef0942a3484f024551433fd2ddf)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/head_64.S | 3 ---
1 file changed, 3 deletions(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 627c798b2f15..37d9905d38d6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -249,9 +249,6 @@ ENDPROC(start_cpu0)
.quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
-bad_address:
- jmp bad_address
-
__INIT
ENTRY(early_idt_handler_array)
i = 0
--
2.14.2

View File

@ -0,0 +1,66 @@
From 2527d40adb84012c90cab350bd5ebbce65daaff7 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:33 -0500
Subject: [PATCH 055/231] x86/head: Fix head ELF function annotations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
These functions aren't callable C-type functions, so don't annotate them
as such.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/36eb182738c28514f8bf95e403d89b6413a88883.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 015a2ea5478680fc5216d56b7ff306f2a74efaf9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 707517a56928fed1c03eefdb4e00fa57dfddc4fd)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/head_64.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 37d9905d38d6..45b18b1a6417 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -218,7 +218,7 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
-ENDPROC(secondary_startup_64)
+END(secondary_startup_64)
#include "verify_cpu.S"
@@ -261,7 +261,7 @@ ENTRY(early_idt_handler_array)
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handler_array)
+END(early_idt_handler_array)
early_idt_handler_common:
/*
@@ -304,7 +304,7 @@ early_idt_handler_common:
20:
decl early_recursion_flag(%rip)
jmp restore_regs_and_iret
-ENDPROC(early_idt_handler_common)
+END(early_idt_handler_common)
__INITDATA
--
2.14.2

View File

@ -0,0 +1,53 @@
From 1b9783c7335f17e3f5bdb8776dd06de62dcfba81 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:34 -0500
Subject: [PATCH 056/231] x86/boot: Annotate verify_cpu() as a callable
function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
verify_cpu() is a callable function. Annotate it as such.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/293024b8a080832075312f38c07ccc970fc70292.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit e93db75a0054b23a874a12c63376753544f3fe9e)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 48a432c46026f864e194cdf9a8133e7c9109274e)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/verify_cpu.S | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 014ea59aa153..3d3c2f71f617 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -33,7 +33,7 @@
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
-verify_cpu:
+ENTRY(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
popf
@@ -139,3 +139,4 @@ verify_cpu:
popf # Restore caller passed flags
xorl %eax, %eax
ret
+ENDPROC(verify_cpu)
--
2.14.2

View File

@ -0,0 +1,56 @@
From 6f359bcacdf28ca9f6bfc29bd0aa4e22489eb34d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:35 -0500
Subject: [PATCH 057/231] x86/xen: Fix xen head ELF annotations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Mark the ends of the startup_xen and hypercall_page code sections.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/3a80a394d30af43d9cefa1a29628c45ed8420c97.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 2582d3df95c76d3b686453baf90b64d57e87d1e8)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit b9410861f1436c1e38958a9b85009ad252aad9f5)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/xen/xen-head.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 72a8e6adebe6..2f0cff2cc265 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -33,7 +33,7 @@ ENTRY(startup_xen)
mov $init_thread_union+THREAD_SIZE, %_ASM_SP
jmp xen_start_kernel
-
+END(startup_xen)
__FINIT
#endif
@@ -47,7 +47,7 @@ ENTRY(hypercall_page)
.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
#include <asm/xen-hypercalls.h>
#undef HYPERCALL
-
+END(hypercall_page)
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
--
2.14.2

View File

@ -0,0 +1,70 @@
From b90136e442c889a7344992acc34764729936ab92 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:36 -0500
Subject: [PATCH 058/231] x86/xen: Add unwind hint annotations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Add unwind hint annotations to the xen head code so the ORC unwinder can
read head_64.o.
hypercall_page needs empty annotations at 32-byte intervals to match the
'xen_hypercall_*' ELF functions at those locations.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/70ed2eb516fe9266be766d953f93c2571bca88cc.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit abbe1cac6214d81d2f4e149aba64a8760703144e)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 9f099a90cb39eaff9b3187e8a6d8151c8af53db1)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/xen/xen-head.S | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 2f0cff2cc265..ad189ab2c329 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -9,6 +9,7 @@
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page_types.h>
+#include <asm/unwind_hints.h>
#include <xen/interface/elfnote.h>
#include <xen/interface/features.h>
@@ -19,6 +20,7 @@
#ifdef CONFIG_XEN_PV
__INIT
ENTRY(startup_xen)
+ UNWIND_HINT_EMPTY
cld
/* Clear .bss */
@@ -40,7 +42,10 @@ END(startup_xen)
.pushsection .text
.balign PAGE_SIZE
ENTRY(hypercall_page)
- .skip PAGE_SIZE
+ .rept (PAGE_SIZE / 32)
+ UNWIND_HINT_EMPTY
+ .skip 32
+ .endr
#define HYPERCALL(n) \
.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
--
2.14.2

View File

@ -0,0 +1,134 @@
From 6ef121f444bab6ac294e1eda62eb727ee639c6d7 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Sep 2017 21:43:37 -0500
Subject: [PATCH 059/231] x86/head: Add unwind hint annotations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Jiri Slaby reported an ORC issue when unwinding from an idle task. The
stack was:
ffffffff811083c2 do_idle+0x142/0x1e0
ffffffff8110861d cpu_startup_entry+0x5d/0x60
ffffffff82715f58 start_kernel+0x3ff/0x407
ffffffff827153e8 x86_64_start_kernel+0x14e/0x15d
ffffffff810001bf secondary_startup_64+0x9f/0xa0
The ORC unwinder errored out at secondary_startup_64 because the head
code isn't annotated yet so there wasn't a corresponding ORC entry.
Fix that and any other head-related unwinding issues by adding unwind
hints to the head code.
Reported-by: Jiri Slaby <jslaby@suse.cz>
Tested-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/78ef000a2f68f545d6eef44ee912edceaad82ccf.1505764066.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 2704fbb672d0d9a19414907fda7949283dcef6a1)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit b63a868e404e64172afefea553c6a40963a151db)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/Makefile | 1 -
arch/x86/kernel/head_64.S | 14 ++++++++++++--
2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 287eac7d207f..e2315aecc441 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,7 +26,6 @@ KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
-OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 45b18b1a6417..d081bc7a027d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -49,6 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
@@ -81,6 +82,7 @@ startup_64:
movq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
@@ -116,6 +118,7 @@ ENTRY(secondary_startup_64)
movq $1f, %rax
jmp *%rax
1:
+ UNWIND_HINT_EMPTY
/* Check if nx is implemented */
movl $0x80000001, %eax
@@ -230,6 +233,7 @@ END(secondary_startup_64)
*/
ENTRY(start_cpu0)
movq initial_stack(%rip), %rsp
+ UNWIND_HINT_EMPTY
jmp .Ljump_to_C_code
ENDPROC(start_cpu0)
#endif
@@ -254,13 +258,18 @@ ENTRY(early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
- pushq $0 # Dummy error code, to make stack frame uniform
+ UNWIND_HINT_IRET_REGS
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
.endif
pushq $i # 72(%rsp) Vector number
jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
+ UNWIND_HINT_IRET_REGS offset=16
END(early_idt_handler_array)
early_idt_handler_common:
@@ -289,6 +298,7 @@ early_idt_handler_common:
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
cmpq $14,%rsi /* Page fault? */
jnz 10f
@@ -411,7 +421,7 @@ ENTRY(phys_base)
EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
-
+
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
--
2.14.2

View File

@ -0,0 +1,43 @@
From 012bd636105426b93026d594261663e8a728dcc1 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@suse.com>
Date: Mon, 25 Sep 2017 02:06:19 -0600
Subject: [PATCH 060/231] ACPI / APEI: adjust a local variable type in
ghes_ioremap_pfn_irq()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Match up with what 7edda0886b ("acpi: apei: handle SEA notification
type for ARMv8") did for ghes_ioremap_pfn_nmi().
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
(cherry picked from commit 095f613c6b386a1704b73a549e9ba66c1d5381ae)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 0a5c092882b0ead111dc3a6bbaa870665b54d796)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/acpi/apei/ghes.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d661d452b238..3628078ee351 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -174,7 +174,8 @@ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
{
- unsigned long vaddr, paddr;
+ unsigned long vaddr;
+ phys_addr_t paddr;
pgprot_t prot;
vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);
--
2.14.2

View File

@ -0,0 +1,44 @@
From a405dd2b4172c310101f96c2152598bc24e9e6f8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 12 Oct 2017 09:24:30 +0200
Subject: [PATCH 061/231] x86/unwinder: Make CONFIG_UNWINDER_ORC=y the default
in the 64-bit defconfig
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Increase testing coverage by turning on the primary x86 unwinder for
the 64-bit defconfig.
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 1e4078f0bba46ad61b69548abe6a6faf63b89380)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit ebcba768c005dce435721f6c998e3afdf5534666)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/configs/x86_64_defconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 4a4b16e56d35..eb65c248708d 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
+CONFIG_ORC_UNWINDER=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
--
2.14.2

View File

@ -0,0 +1,66 @@
From bc21c74b922871588bf6626bff34fa084ed60d71 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 12 Oct 2017 18:06:19 -0400
Subject: [PATCH 062/231] x86/fpu/debug: Remove unused 'x86_fpu_state' and
'x86_fpu_deactivate_state' tracepoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Commit:
d1898b733619 ("x86/fpu: Add tracepoints to dump FPU state at key points")
... added the 'x86_fpu_state' and 'x86_fpu_deactivate_state' trace points,
but never used them. Today they are still not used. As they take up
and waste memory, remove them.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171012180619.670b68b6@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 127a1bea40f7f2a36bc7207ea4d51bb6b4e936fa)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit c7c367ddb6ffb6af2cfee287960e97c4aefc6548)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/trace/fpu.h | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 342e59789fcd..fed7d9ecae60 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -36,11 +36,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
)
);
-DEFINE_EVENT(x86_fpu, x86_fpu_state,
- TP_PROTO(struct fpu *fpu),
- TP_ARGS(fpu)
-);
-
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
@@ -76,11 +71,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state,
TP_ARGS(fpu)
);
-DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state,
- TP_PROTO(struct fpu *fpu),
- TP_ARGS(fpu)
-);
-
DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
TP_PROTO(struct fpu *fpu),
TP_ARGS(fpu)
--
2.14.2

View File

@ -0,0 +1,273 @@
From dcc61cf4d482d478979471795935733845fe757e Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 13 Oct 2017 15:02:00 -0500
Subject: [PATCH 063/231] x86/unwind: Rename unwinder config options to
'CONFIG_UNWINDER_*'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Rename the unwinder config options from:
CONFIG_ORC_UNWINDER
CONFIG_FRAME_POINTER_UNWINDER
CONFIG_GUESS_UNWINDER
to:
CONFIG_UNWINDER_ORC
CONFIG_UNWINDER_FRAME_POINTER
CONFIG_UNWINDER_GUESS
... in order to give them a more logical config namespace.
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/73972fc7e2762e91912c6b9584582703d6f1b8cc.1507924831.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 11af847446ed0d131cf24d16a7ef3d5ea7a49554)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 27ab2a240a797b073ce63385b1d5db06e44fc3ae)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
Documentation/x86/orc-unwinder.txt | 2 +-
Makefile | 4 ++--
arch/x86/kernel/Makefile | 6 +++---
scripts/Makefile.build | 2 +-
arch/x86/include/asm/module.h | 2 +-
arch/x86/include/asm/unwind.h | 8 ++++----
include/asm-generic/vmlinux.lds.h | 2 +-
arch/x86/Kconfig | 2 +-
arch/x86/Kconfig.debug | 10 +++++-----
arch/x86/configs/tiny.config | 4 ++--
arch/x86/configs/x86_64_defconfig | 2 +-
lib/Kconfig.debug | 2 +-
12 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt
index af0c9a4c65a6..cd4b29be29af 100644
--- a/Documentation/x86/orc-unwinder.txt
+++ b/Documentation/x86/orc-unwinder.txt
@@ -4,7 +4,7 @@ ORC unwinder
Overview
--------
-The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
+The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is
similar in concept to a DWARF unwinder. The difference is that the
format of the ORC data is much simpler than DWARF, which in turn allows
the ORC unwinder to be much simpler and faster.
diff --git a/Makefile b/Makefile
index 490ce18685ea..b740e3dc9ff8 100644
--- a/Makefile
+++ b/Makefile
@@ -965,8 +965,8 @@ ifdef CONFIG_STACK_VALIDATION
ifeq ($(has_libelf),1)
objtool_target := tools/objtool FORCE
else
- ifdef CONFIG_ORC_UNWINDER
- $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+ ifdef CONFIG_UNWINDER_ORC
+ $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
else
$(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e2315aecc441..5bf0d5a473b4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -125,9 +125,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
-obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
-obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
-obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
###
# 64 bit specific files
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index ab2c8ef43cdb..436005392047 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1)
__objtool_obj := $(objtree)/tools/objtool/objtool
-objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check)
+objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check)
ifndef CONFIG_FRAME_POINTER
objtool_args += --no-fp
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 9eb7c718aaf8..9f05a1002aa9 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -5,7 +5,7 @@
#include <asm/orc_types.h>
struct mod_arch_specific {
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
unsigned int num_orcs;
int *orc_unwind_ip;
struct orc_entry *orc_unwind;
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e9f793e2df7a..35d67dc7b69f 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,11 +12,11 @@ struct unwind_state {
struct task_struct *task;
int graph_idx;
bool error;
-#if defined(CONFIG_ORC_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC)
bool signal, full_regs;
unsigned long sp, bp, ip;
struct pt_regs *regs;
-#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
+#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
struct pt_regs *regs;
@@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
__unwind_start(state, task, regs, first_frame);
}
-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))
@@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
}
#endif
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
void unwind_init(void);
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
void *orc, size_t orc_size);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 9fdb54a95976..e71e42432360 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -686,7 +686,7 @@
#define BUG_TABLE
#endif
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
#define ORC_UNWIND_TABLE \
. = ALIGN(4); \
.orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3a0b8cb57caf..bf9f03740c30 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -168,7 +168,7 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
select HAVE_STACK_VALIDATION if X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c441b5d65ec8..5435a943f894 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -358,13 +358,13 @@ config PUNIT_ATOM_DEBUG
choice
prompt "Choose kernel unwinder"
- default FRAME_POINTER_UNWINDER
+ default UNWINDER_FRAME_POINTER
---help---
This determines which method will be used for unwinding kernel stack
traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
livepatch, lockdep, and more.
-config FRAME_POINTER_UNWINDER
+config UNWINDER_FRAME_POINTER
bool "Frame pointer unwinder"
select FRAME_POINTER
---help---
@@ -379,7 +379,7 @@ config FRAME_POINTER_UNWINDER
consistency model, as this is currently the only way to get a
reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
-config ORC_UNWINDER
+config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
select STACK_VALIDATION
@@ -396,7 +396,7 @@ config ORC_UNWINDER
Enabling this option will increase the kernel's runtime memory usage
by roughly 2-4MB, depending on your kernel config.
-config GUESS_UNWINDER
+config UNWINDER_GUESS
bool "Guess unwinder"
depends on EXPERT
---help---
@@ -411,7 +411,7 @@ config GUESS_UNWINDER
endchoice
config FRAME_POINTER
- depends on !ORC_UNWINDER && !GUESS_UNWINDER
+ depends on !UNWINDER_ORC && !UNWINDER_GUESS
bool
endmenu
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 550cd5012b73..66c9e2aab16c 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,5 +1,5 @@
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
-CONFIG_GUESS_UNWINDER=y
-# CONFIG_FRAME_POINTER_UNWINDER is not set
+CONFIG_UNWINDER_GUESS=y
+# CONFIG_UNWINDER_FRAME_POINTER is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index eb65c248708d..e32fc1f274d8 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -299,7 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
-CONFIG_ORC_UNWINDER=y
+CONFIG_UNWINDER_ORC=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0b4d1b3880b0..4f6ca5f60f7e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -375,7 +375,7 @@ config STACK_VALIDATION
that runtime stack traces are more reliable.
This is also a prerequisite for generation of ORC unwind data, which
- is needed for CONFIG_ORC_UNWINDER.
+ is needed for CONFIG_UNWINDER_ORC.
For more information, see
tools/objtool/Documentation/stack-validation.txt.
--
2.14.2

View File

@ -0,0 +1,90 @@
From a8ec58033a185db5d8c180d3508d34b8ae3a1c89 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 13 Oct 2017 15:02:01 -0500
Subject: [PATCH 064/231] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in
kconfig for 64-bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The ORC unwinder has been stable in testing so far. Give it much wider
testing by making it the default in kconfig for x86_64. It's not yet
supported for 32-bit, so leave frame pointers as the default there.
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/9b1237bbe7244ed9cdf8db2dcb1253e37e1c341e.1507924831.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit fc72ae40e30327aa24eb88a24b9c7058f938bd36)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit aff8d5169f46ae6ac0eb26a5ba745aaf9afa0704)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/Kconfig.debug | 33 +++++++++++++++++----------------
1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 5435a943f894..7d88e9878a75 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -358,27 +358,13 @@ config PUNIT_ATOM_DEBUG
choice
prompt "Choose kernel unwinder"
- default UNWINDER_FRAME_POINTER
+ default UNWINDER_ORC if X86_64
+ default UNWINDER_FRAME_POINTER if X86_32
---help---
This determines which method will be used for unwinding kernel stack
traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
livepatch, lockdep, and more.
-config UNWINDER_FRAME_POINTER
- bool "Frame pointer unwinder"
- select FRAME_POINTER
- ---help---
- This option enables the frame pointer unwinder for unwinding kernel
- stack traces.
-
- The unwinder itself is fast and it uses less RAM than the ORC
- unwinder, but the kernel text size will grow by ~3% and the kernel's
- overall performance will degrade by roughly 5-10%.
-
- This option is recommended if you want to use the livepatch
- consistency model, as this is currently the only way to get a
- reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
-
config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
@@ -396,6 +382,21 @@ config UNWINDER_ORC
Enabling this option will increase the kernel's runtime memory usage
by roughly 2-4MB, depending on your kernel config.
+config UNWINDER_FRAME_POINTER
+ bool "Frame pointer unwinder"
+ select FRAME_POINTER
+ ---help---
+ This option enables the frame pointer unwinder for unwinding kernel
+ stack traces.
+
+ The unwinder itself is fast and it uses less RAM than the ORC
+ unwinder, but the kernel text size will grow by ~3% and the kernel's
+ overall performance will degrade by roughly 5-10%.
+
+ This option is recommended if you want to use the livepatch
+ consistency model, as this is currently the only way to get a
+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
config UNWINDER_GUESS
bool "Guess unwinder"
depends on EXPERT
--
2.14.2

View File

@ -0,0 +1,69 @@
From 2f76ec868c18486b60f1b76428339a2fa0c2e5d8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 13 Oct 2017 14:56:41 -0700
Subject: [PATCH 065/231] bitops: Add clear/set_bit32() to linux/bitops.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Add two simple wrappers around set_bit/clear_bit() that accept
the common case of an u32 array. This avoids writing
casts in all callers.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171013215645.23166-2-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit cbe96375025e14fc76f9ed42ee5225120d7210f8)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 06d31c11519ca0e8f9b7cab857f442ef44dfc1b2)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
include/linux/bitops.h | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index a83c822c35c2..eb257a96db6d 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -226,6 +226,32 @@ static inline unsigned long __ffs64(u64 word)
return __ffs((unsigned long)word);
}
+/*
+ * clear_bit32 - Clear a bit in memory for u32 array
+ * @nr: Bit to clear
+ * @addr: u32 * address of bitmap
+ *
+ * Same as clear_bit, but avoids needing casts for u32 arrays.
+ */
+
+static __always_inline void clear_bit32(long nr, volatile u32 *addr)
+{
+ clear_bit(nr, (volatile unsigned long *)addr);
+}
+
+/*
+ * set_bit32 - Set a bit in memory for u32 array
+ * @nr: Bit to clear
+ * @addr: u32 * address of bitmap
+ *
+ * Same as set_bit, but avoids needing casts for u32 arrays.
+ */
+
+static __always_inline void set_bit32(long nr, volatile u32 *addr)
+{
+ set_bit(nr, (volatile unsigned long *)addr);
+}
+
#ifdef __KERNEL__
#ifndef set_mask_bits
--
2.14.2

View File

@ -0,0 +1,221 @@
From d637e8b6db21d282cfb1fd789ae60807cc87c867 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 13 Oct 2017 14:56:42 -0700
Subject: [PATCH 066/231] x86/cpuid: Add generic table for CPUID dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Some CPUID features depend on other features. Currently it's
possible to to clear dependent features, but not clear the base features,
which can cause various interesting problems.
This patch implements a generic table to describe dependencies
between CPUID features, to be used by all code that clears
CPUID.
Some subsystems (like XSAVE) had an own implementation of this,
but it's better to do it all in a single place for everyone.
Then clear_cpu_cap and setup_clear_cpu_cap always look up
this table and clear all dependencies too.
This is intended to be a practical table: only for features
that make sense to clear. If someone for example clears FPU,
or other features that are essentially part of the required
base feature set, not much is going to work. Handling
that is right now out of scope. We're only handling
features which can be usefully cleared.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jonathan McDowell <noodles@earth.li>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171013215645.23166-3-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 0b00de857a648dafe7020878c7a27cf776f5edf4)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 35672522f2fc9a2e116ed1766f190bc08ef5582a)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/cpu/Makefile | 1 +
arch/x86/include/asm/cpufeature.h | 9 ++-
arch/x86/include/asm/cpufeatures.h | 5 ++
arch/x86/kernel/cpu/cpuid-deps.c | 113 +++++++++++++++++++++++++++++++++++++
4 files changed, 123 insertions(+), 5 deletions(-)
create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index e17942c131c8..de260fae1017 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -22,6 +22,7 @@ obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
+obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index d59c15c3defd..225fd8374fae 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
-#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
-#define setup_clear_cpu_cap(bit) do { \
- clear_cpu_cap(&boot_cpu_data, bit); \
- set_bit(bit, (unsigned long *)cpu_caps_cleared); \
-} while (0)
+
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+
#define setup_force_cpu_cap(bit) do { \
set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5a28e8e55e36..f4e145c4b06f 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -21,6 +21,11 @@
* this feature bit is not displayed in /proc/cpuinfo at all.
*/
+/*
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c
+ */
+
/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..e48eb7313120
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,113 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ {}
+};
+
+static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit)
+{
+ clear_bit32(bit, c->x86_capability);
+}
+
+static inline void __setup_clear_cpu_cap(unsigned int bit)
+{
+ clear_cpu_cap(&boot_cpu_data, bit);
+ set_bit32(bit, cpu_caps_cleared);
+}
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ if (!c)
+ __setup_clear_cpu_cap(feature);
+ else
+ __clear_cpu_cap(c, feature);
+}
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ bool changed;
+ DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8);
+ const struct cpuid_dep *d;
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
--
2.14.2

View File

@ -0,0 +1,97 @@
From df469cffe07c84906be43e89d33f2a8a5312e60f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 13 Oct 2017 14:56:43 -0700
Subject: [PATCH 067/231] x86/fpu: Parse clearcpuid= as early XSAVE argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
With a followon patch we want to make clearcpuid affect the XSAVE
configuration. But xsave is currently initialized before arguments
are parsed. Move the clearcpuid= parsing into the special
early xsave argument parsing code.
Since clearcpuid= contains a = we need to keep the old __setup
around as a dummy, otherwise it would end up as a environment
variable in init's environment.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171013215645.23166-4-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 0c2a3913d6f50503f7c59d83a6219e39508cc898)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 27deb452eb0d27c406f3817ab057201aa8767abe)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/cpu/common.c | 16 +++++++---------
arch/x86/kernel/fpu/init.c | 11 +++++++++++
2 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4be7b209a3d6..ef7b1ba56363 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1293,18 +1293,16 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(")\n");
}
-static __init int setup_disablecpuid(char *arg)
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
{
- int bit;
-
- if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
-
return 1;
}
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr __ro_after_init = {
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index d5d44c452624..07f0ab877f49 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
+ char arg[32];
+ char *argptr = arg;
+ int bit;
+
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+ sizeof(arg)) &&
+ get_option(&argptr, &bit) &&
+ bit >= 0 &&
+ bit < NCAPINTS * 32)
+ setup_clear_cpu_cap(bit);
}
/*
--
2.14.2

View File

@ -0,0 +1,90 @@
From 1b88ea4170f72b4fed72e9235c88b6121f153b21 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 13 Oct 2017 14:56:44 -0700
Subject: [PATCH 068/231] x86/fpu: Make XSAVE check the base CPUID features
before enabling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Before enabling XSAVE, not only check the XSAVE specific CPUID bits,
but also the base CPUID features of the respective XSAVE feature.
This allows to disable individual XSAVE states using the existing
clearcpuid= option, which can be useful for performance testing
and debugging, and also in general avoids inconsistencies.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171013215645.23166-5-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit ccb18db2ab9d923df07e7495123fe5fb02329713)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 2efda26f9ee0eeb9919772e90ca30dbe59008dc8)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c24ac1efb12d..3abe85b08234 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -15,6 +15,7 @@
#include <asm/fpu/xstate.h>
#include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
/*
* Although we spell it out in here, the Processor Trace
@@ -36,6 +37,19 @@ static const char *xfeature_names[] =
"unknown xstate feature" ,
};
+static short xsave_cpuid_features[] __initdata = {
+ X86_FEATURE_FPU,
+ X86_FEATURE_XMM,
+ X86_FEATURE_AVX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_INTEL_PT,
+ X86_FEATURE_PKU,
+};
+
/*
* Mask of xstate features supported by the CPU and the kernel:
*/
@@ -702,6 +716,7 @@ void __init fpu__init_system_xstate(void)
unsigned int eax, ebx, ecx, edx;
static int on_boot_cpu __initdata = 1;
int err;
+ int i;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -735,6 +750,14 @@ void __init fpu__init_system_xstate(void)
goto out_disable;
}
+ /*
+ * Clear XSAVE features that are disabled in the normal CPUID.
+ */
+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+ if (!boot_cpu_has(xsave_cpuid_features[i]))
+ xfeatures_mask &= ~BIT(i);
+ }
+
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
--
2.14.2

View File

@ -0,0 +1,70 @@
From 9b2405a12593b1ba7894cf249ddeada92a658463 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 13 Oct 2017 14:56:45 -0700
Subject: [PATCH 069/231] x86/fpu: Remove the explicit clearing of XSAVE
dependent features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Clearing a CPU feature with setup_clear_cpu_cap() clears all features
which depend on it. Expressing feature dependencies in one place is
easier to maintain than keeping functions like
fpu__xstate_clear_all_cpu_caps() up to date.
The features which depend on XSAVE have their dependency expressed in the
dependency table, so its sufficient to clear X86_FEATURE_XSAVE.
Remove the explicit clearing of XSAVE dependent features.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20171013215645.23166-6-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit af445f9ba8bb30b47ccb5247b8f5ba28c9f2be3e)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/fpu/xstate.c | 20 --------------------
1 file changed, 20 deletions(-)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 3abe85b08234..fd6882c42246 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -73,26 +73,6 @@ unsigned int fpu_user_xstate_size;
void fpu__xstate_clear_all_cpu_caps(void)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- setup_clear_cpu_cap(X86_FEATURE_AVX512F);
- setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
- setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
- setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
- setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
- setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
- setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
- setup_clear_cpu_cap(X86_FEATURE_PKU);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
}
/*
--
2.14.2

View File

@ -0,0 +1,57 @@
From 6d96a02c961d41d82738bce9806c430d99acc9f8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 16 Oct 2017 16:22:31 -0700
Subject: [PATCH 070/231] x86/platform/UV: Convert timers to use timer_setup()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
In preparation for unconditionally passing the struct timer_list pointer to
all timer callbacks, switch to using the new timer_setup() and from_timer()
to pass the timer pointer explicitly.
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dimitri Sivanich <sivanich@hpe.com>
Cc: Russ Anderson <rja@hpe.com>
Cc: Mike Travis <mike.travis@hpe.com>
Link: https://lkml.kernel.org/r/20171016232231.GA100493@beast
(cherry picked from commit 376f3bcebdc999cc737d9052109cc33b573b3a8b)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 869cbd2b31024e70d574527b8c6851bf2ebbe483)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/apic/x2apic_uv_x.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 0d57bb9079c9..c0b694810ff4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -920,9 +920,8 @@ static __init void uv_rtc_init(void)
/*
* percpu heartbeat timer
*/
-static void uv_heartbeat(unsigned long ignored)
+static void uv_heartbeat(struct timer_list *timer)
{
- struct timer_list *timer = &uv_scir_info->timer;
unsigned char bits = uv_scir_info->state;
/* Flip heartbeat bit: */
@@ -947,7 +946,7 @@ static int uv_heartbeat_enable(unsigned int cpu)
struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
- setup_pinned_timer(timer, uv_heartbeat, cpu);
+ timer_setup(timer, uv_heartbeat, TIMER_PINNED);
timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
add_timer_on(timer, cpu);
uv_cpu_scir_info(cpu)->enabled = 1;
--
2.14.2

View File

@ -0,0 +1,73 @@
From ca358ca3d22248f099a09d65ee25410cf3beebc5 Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Date: Sat, 14 Oct 2017 20:17:54 +0530
Subject: [PATCH 071/231] objtool: Print top level commands on incorrect usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Print top-level objtool commands, along with the error on incorrect
command line usage. Objtool command line parser exit's with code 129,
for incorrect usage. Convert the cmd_usage() exit code also, to maintain
consistency across objtool.
After the patch:
$ ./objtool -j
Unknown option: -j
usage: objtool COMMAND [ARGS]
Commands:
check Perform stack metadata validation on an object file
orc Generate in-place ORC unwind tables for an object file
$ echo $?
129
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1507992474-16142-1-git-send-email-kamalesh@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 6a93bb7e4a7d6670677d5b0eb980936eb9cc5d2e)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit cd75c9c55a5f288e1d3f20c48c5c4c2caf3966e8)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
tools/objtool/objtool.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index 31e0f9143840..07f329919828 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -70,7 +70,7 @@ static void cmd_usage(void)
printf("\n");
- exit(1);
+ exit(129);
}
static void handle_options(int *argc, const char ***argv)
@@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv)
break;
} else {
fprintf(stderr, "Unknown option: %s\n", cmd);
- fprintf(stderr, "\n Usage: %s\n",
- objtool_usage_string);
- exit(1);
+ cmd_usage();
}
(*argv)++;
--
2.14.2

View File

@ -0,0 +1,65 @@
From a827c0ac43c2dc1e5e0528ebd4b2ca2d74534e18 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Oct 2017 19:39:35 +0200
Subject: [PATCH 072/231] x86/cpuid: Prevent out of bound access in
do_clear_cpu_cap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature
dependencies. That bitmap is sized NCAPINTS * BITS_PER_INIT. The possible
'features' which can be handed in are larger than this, because after the
capabilities the bug 'feature' bits occupy another 32bit. Not really
obvious...
So clearing any of the misfeature bits, as 32bit does for the F00F bug,
accesses that bitmap out of bounds thereby corrupting the stack.
Size the bitmap proper and add a sanity check to catch accidental out of
bound access.
Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies")
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop
(cherry picked from commit 57b8b1a1856adaa849d02d547411a553a531022b)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 4b3a90bd20b35a97fd9ca6f6a71131f4417782e4)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/cpu/cpuid-deps.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index e48eb7313120..c1d49842a411 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
__clear_cpu_cap(c, feature);
}
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
{
- bool changed;
- DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8);
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
clear_feature(c, feature);
--
2.14.2

View File

@ -0,0 +1,125 @@
From 1e3688f9e76b3d8b218ed1afa292585a91b0b0c6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 18 Oct 2017 10:21:07 -0700
Subject: [PATCH 073/231] x86/entry: Use SYSCALL_DEFINE() macros for
sys_modify_ldt()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
We do not have tracepoints for sys_modify_ldt() because we define
it directly instead of using the normal SYSCALL_DEFINEx() macros.
However, there is a reason sys_modify_ldt() does not use the macros:
it has an 'int' return type instead of 'unsigned long'. This is
a bug, but it's a bug cemented in the ABI.
What does this mean? If we return -EINVAL from a function that
returns 'int', we have 0x00000000ffffffea in %rax. But, if we
return -EINVAL from a function returning 'unsigned long', we end
up with 0xffffffffffffffea in %rax, which is wrong.
To work around this and maintain the 'int' behavior while using
the SYSCALL_DEFINEx() macros, so we add a cast to 'unsigned int'
in both implementations of sys_modify_ldt().
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Brian Gerst <brgerst@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171018172107.1A79C532@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit da20ab35180780e4a6eadc804544f1fa967f3567)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d865f635f4b2c3307e79de9be5c49ea8bd4c43a6)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/syscalls.h | 2 +-
arch/x86/kernel/ldt.c | 16 +++++++++++++---
arch/x86/um/ldt.c | 7 +++++--
3 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 91dfcafe27a6..bad25bb80679 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
asmlinkage long sys_iopl(unsigned int);
/* kernel/ldt.c */
-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+asmlinkage long sys_modify_ldt(int, void __user *, unsigned long);
/* kernel/signal.c */
asmlinkage long sys_rt_sigreturn(void);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index f0e64db18ac8..0402d44deb4d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -294,8 +295,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
return error;
}
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
- unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
int ret = -ENOSYS;
@@ -313,5 +314,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
ret = write_ldt(ptr, bytecount, 0);
break;
}
- return ret;
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but tht ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
}
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 836a1eb5df43..3ee234b6234d 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <os.h>
@@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm)
mm->arch.ldt.entry_count = 0;
}
-int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
- return do_modify_ldt_skas(func, ptr, bytecount);
+ /* See non-um modify_ldt() for why we do this cast */
+ return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount);
}
--
2.14.2

View File

@ -0,0 +1,141 @@
From cc87e9d44044fb3ae4145d6ad9574697439b03bf Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 29 Sep 2017 17:08:16 +0300
Subject: [PATCH 074/231] mm/sparsemem: Allocate mem_section at runtime for
CONFIG_SPARSEMEM_EXTREME=y
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Size of the mem_section[] array depends on the size of the physical address space.
In preparation for boot-time switching between paging modes on x86-64
we need to make the allocation of mem_section[] dynamic, because otherwise
we waste a lot of RAM: with CONFIG_NODE_SHIFT=10, mem_section[] size is 32kB
for 4-level paging and 2MB for 5-level paging mode.
The patch allocates the array on the first call to sparse_memory_present_with_active_regions().
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@suse.de>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170929140821.37654-2-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 83e3c48729d9ebb7af5a31a504f3fd6aff0348c4)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit c70f71e01a0ae5d884abae0424618abe90b82011)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
include/linux/mmzone.h | 6 +++++-
mm/page_alloc.c | 10 ++++++++++
mm/sparse.c | 17 +++++++++++------
3 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b3f6ce..9c6c001a8c6c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1137,13 +1137,17 @@ struct mem_section {
#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
#ifdef CONFIG_SPARSEMEM_EXTREME
-extern struct mem_section *mem_section[NR_SECTION_ROOTS];
+extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (!mem_section)
+ return NULL;
+#endif
if (!mem_section[SECTION_NR_TO_ROOT(nr)])
return NULL;
return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1423da8dd16f..66eb23ab658d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5707,6 +5707,16 @@ void __init sparse_memory_present_with_active_regions(int nid)
unsigned long start_pfn, end_pfn;
int i, this_nid;
+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (!mem_section) {
+ unsigned long size, align;
+
+ size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_virt_alloc(size, align);
+ }
+#endif
+
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
memory_present(this_nid, start_pfn, end_pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index cdce7a7bb3f3..308a0789d1bb 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -22,8 +22,7 @@
* 1) mem_section - memory sections, mem_map's for valid memory
*/
#ifdef CONFIG_SPARSEMEM_EXTREME
-struct mem_section *mem_section[NR_SECTION_ROOTS]
- ____cacheline_internodealigned_in_smp;
+struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
____cacheline_internodealigned_in_smp;
@@ -104,7 +103,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
int __section_nr(struct mem_section* ms)
{
unsigned long root_nr;
- struct mem_section* root;
+ struct mem_section *root = NULL;
for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
@@ -115,7 +114,7 @@ int __section_nr(struct mem_section* ms)
break;
}
- VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
+ VM_BUG_ON(!root);
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
@@ -333,11 +332,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
{
unsigned long usemap_snr, pgdat_snr;
- static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
- static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
+ static unsigned long old_usemap_snr;
+ static unsigned long old_pgdat_snr;
struct pglist_data *pgdat = NODE_DATA(nid);
int usemap_nid;
+ /* First call */
+ if (!old_usemap_snr) {
+ old_usemap_snr = NR_MEM_SECTIONS;
+ old_pgdat_snr = NR_MEM_SECTIONS;
+ }
+
usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
--
2.14.2

View File

@ -0,0 +1,244 @@
From f6bb8e560b2229af5dcf3127fc92e732539b4823 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 29 Sep 2017 17:08:18 +0300
Subject: [PATCH 075/231] x86/kasan: Use the same shadow offset for 4- and
5-level paging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
We are going to support boot-time switching between 4- and 5-level
paging. For KASAN it means we cannot have different KASAN_SHADOW_OFFSET
for different paging modes: the constant is passed to gcc to generate
code and cannot be changed at runtime.
This patch changes KASAN code to use 0xdffffc0000000000 as shadow offset
for both 4- and 5-level paging.
For 5-level paging it means that shadow memory region is not aligned to
PGD boundary anymore and we have to handle unaligned parts of the region
properly.
In addition, we have to exclude paravirt code from KASAN instrumentation
as we now use set_pgd() before KASAN is fully ready.
[kirill.shutemov@linux.intel.com: clenaup, changelog message]
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@suse.de>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170929140821.37654-4-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 12a8cc7fcf54a8575f094be1e99032ec38aa045c)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 2ce428150e002623aa0ed2a1ab840fde5f860f32)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
Documentation/x86/x86_64/mm.txt | 2 +-
arch/x86/kernel/Makefile | 3 +-
arch/x86/mm/kasan_init_64.c | 101 +++++++++++++++++++++++++++++++---------
arch/x86/Kconfig | 1 -
4 files changed, 83 insertions(+), 24 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index b0798e281aa6..3448e675b462 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -34,7 +34,7 @@ ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
-ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
+ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5bf0d5a473b4..aa059806201d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,7 +24,8 @@ endif
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
-KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..464089f33e80 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,8 @@
extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_MAX_ENTRIES];
+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
static int __init map_range(struct range *range)
{
unsigned long start;
@@ -30,8 +32,10 @@ static void __init clear_pgds(unsigned long start,
unsigned long end)
{
pgd_t *pgd;
+ /* See comment in kasan_init() */
+ unsigned long pgd_end = end & PGDIR_MASK;
- for (; start < end; start += PGDIR_SIZE) {
+ for (; start < pgd_end; start += PGDIR_SIZE) {
pgd = pgd_offset_k(start);
/*
* With folded p4d, pgd_clear() is nop, use p4d_clear()
@@ -42,29 +46,61 @@ static void __init clear_pgds(unsigned long start,
else
pgd_clear(pgd);
}
+
+ pgd = pgd_offset_k(start);
+ for (; start < end; start += P4D_SIZE)
+ p4d_clear(p4d_offset(pgd, start));
+}
+
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+ unsigned long p4d;
+
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return (p4d_t *)pgd;
+
+ p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
+ p4d += __START_KERNEL_map - phys_base;
+ return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
+{
+ pgd_t pgd_entry;
+ p4d_t *p4d, p4d_entry;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+ set_pgd(pgd, pgd_entry);
+ }
+
+ p4d = early_p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_none(*p4d))
+ continue;
+
+ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+ set_p4d(p4d, p4d_entry);
+ } while (p4d++, addr = next, addr != end && p4d_none(*p4d));
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
{
- int i;
- unsigned long start = KASAN_SHADOW_START;
+ /* See comment in kasan_init() */
+ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
- for (i = pgd_index(start); start < end; i++) {
- switch (CONFIG_PGTABLE_LEVELS) {
- case 4:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
- _KERNPG_TABLE);
- break;
- case 5:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
- _KERNPG_TABLE);
- break;
- default:
- BUILD_BUG();
- }
- start += PGDIR_SIZE;
- }
+ pgd += pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_early_p4d_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
}
#ifdef CONFIG_KASAN_INLINE
@@ -101,7 +137,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
@@ -117,12 +153,35 @@ void __init kasan_init(void)
#endif
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+ /*
+ * We use the same shadow offset for 4- and 5-level paging to
+ * facilitate boot-time switching between paging modes.
+ * As result in 5-level paging mode KASAN_SHADOW_START and
+ * KASAN_SHADOW_END are not aligned to PGD boundary.
+ *
+ * KASAN_SHADOW_START doesn't share PGD with anything else.
+ * We claim whole PGD entry to make things easier.
+ *
+ * KASAN_SHADOW_END lands in the last PGD entry and it collides with
+ * bunch of things like kernel code, modules, EFI mapping, etc.
+ * We need to take extra steps to not overwrite them.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ void *ptr;
+
+ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+ }
+
load_cr3(early_top_pgt);
__flush_tlb_all();
- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
kasan_mem_to_shadow((void *)PAGE_OFFSET));
for (i = 0; i < E820_MAX_ENTRIES; i++) {
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bf9f03740c30..67d07802ae95 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -300,7 +300,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
- default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000
config HAVE_INTEL_TXT
--
2.14.2

View File

@ -0,0 +1,80 @@
From cac8711a9ba742e97090cc5ec522360f1549c584 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 29 Sep 2017 17:08:19 +0300
Subject: [PATCH 076/231] x86/xen: Provide pre-built page tables only for
CONFIG_XEN_PV=y and CONFIG_XEN_PVH=y
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Looks like we only need pre-built page tables in the CONFIG_XEN_PV=y and
CONFIG_XEN_PVH=y cases.
Let's not provide them for other configurations.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@suse.de>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170929140821.37654-5-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 4375c29985f155d7eb2346615d84e62d1b673682)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit a883ee7f3c1dc64a8c946543ac598399353d1b03)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/kernel/head_64.S | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d081bc7a027d..12daaa0b187f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,11 +37,12 @@
*
*/
-#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
+#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -348,10 +349,7 @@ NEXT_PAGE(early_dynamic_pgts)
.data
-#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
- .fill 512,8,0
-#else
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
@@ -368,6 +366,9 @@ NEXT_PAGE(level2_ident_pgt)
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PAGE(init_top_pgt)
+ .fill 512,8,0
#endif
#ifdef CONFIG_X86_5LEVEL
--
2.14.2

View File

@ -0,0 +1,316 @@
From ed422950e50aeb9a05920e7387b4dd7c8dc2fc67 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 29 Sep 2017 17:08:20 +0300
Subject: [PATCH 077/231] x86/xen: Drop 5-level paging support code from the
XEN_PV code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
It was decided 5-level paging is not going to be supported in XEN_PV.
Let's drop the dead code from the XEN_PV code.
Tested-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@suse.de>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170929140821.37654-6-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 773dd2fca581b0a80e5a33332cc8ee67e5a79cba)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 3fd0b7ef0094fd8bb3c8172d9b137ebe0d81ecbc)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/xen/mmu_pv.c | 159 +++++++++++++++++++-------------------------------
1 file changed, 60 insertions(+), 99 deletions(-)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index ba76f3ce997f..45bb2d462e44 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -469,7 +469,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
-#if CONFIG_PGTABLE_LEVELS == 4
+#ifdef CONFIG_X86_64
__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
@@ -558,7 +558,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_X86_64 */
static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
@@ -600,21 +600,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
bool last, unsigned long limit)
{
- int i, nr, flush = 0;
+ int flush = 0;
+ pud_t *pud;
- nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
- for (i = 0; i < nr; i++) {
- pud_t *pud;
- if (p4d_none(p4d[i]))
- continue;
+ if (p4d_none(*p4d))
+ return flush;
- pud = pud_offset(&p4d[i], 0);
- if (PTRS_PER_PUD > 1)
- flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
- flush |= xen_pud_walk(mm, pud, func,
- last && i == nr - 1, limit);
- }
+ pud = pud_offset(p4d, 0);
+ if (PTRS_PER_PUD > 1)
+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
+ flush |= xen_pud_walk(mm, pud, func, last, limit);
return flush;
}
@@ -664,8 +660,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
continue;
p4d = p4d_offset(&pgd[i], 0);
- if (PTRS_PER_P4D > 1)
- flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}
@@ -1196,22 +1190,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr)
{
pgd_t *pgd;
p4d_t *p4d;
- unsigned int i;
bool unpin;
unpin = (vaddr == 2 * PGDIR_SIZE);
vaddr &= PMD_MASK;
pgd = pgd_offset_k(vaddr);
p4d = p4d_offset(pgd, 0);
- for (i = 0; i < PTRS_PER_P4D; i++) {
- if (p4d_none(p4d[i]))
- continue;
- xen_cleanmfnmap_p4d(p4d + i, unpin);
- }
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- set_pgd(pgd, __pgd(0));
- xen_cleanmfnmap_free_pgtbl(p4d, unpin);
- }
+ if (!p4d_none(*p4d))
+ xen_cleanmfnmap_p4d(p4d, unpin);
}
static void __init xen_pagetable_p2m_free(void)
@@ -1717,7 +1703,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2054,13 +2040,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
*/
void __init xen_relocate_p2m(void)
{
- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
- int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
- p4d_t *p4d = NULL;
pgd_t *pgd;
unsigned long *new_p2m;
int save_pud;
@@ -2070,11 +2055,7 @@ void __init xen_relocate_p2m(void)
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
- if (PTRS_PER_P4D > 1)
- n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
- else
- n_p4d = 0;
- n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
+ n_frames = n_pte + n_pt + n_pmd + n_pud;
new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
@@ -2090,76 +2071,56 @@ void __init xen_relocate_p2m(void)
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
- p4d_phys = new_area;
- pud_phys = p4d_phys + PFN_PHYS(n_p4d);
+ pud_phys = new_area;
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
- idx_p4d = 0;
save_pud = n_pud;
- do {
- if (n_p4d > 0) {
- p4d = early_memremap(p4d_phys, PAGE_SIZE);
- clear_page(p4d);
- n_pud = min(save_pud, PTRS_PER_P4D);
- }
- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
- pud = early_memremap(pud_phys, PAGE_SIZE);
- clear_page(pud);
- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
- idx_pmd++) {
- pmd = early_memremap(pmd_phys, PAGE_SIZE);
- clear_page(pmd);
- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
- idx_pt++) {
- pt = early_memremap(pt_phys, PAGE_SIZE);
- clear_page(pt);
- for (idx_pte = 0;
- idx_pte < min(n_pte, PTRS_PER_PTE);
- idx_pte++) {
- set_pte(pt + idx_pte,
- pfn_pte(p2m_pfn, PAGE_KERNEL));
- p2m_pfn++;
- }
- n_pte -= PTRS_PER_PTE;
- early_memunmap(pt, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pt_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
- PFN_DOWN(pt_phys));
- set_pmd(pmd + idx_pt,
- __pmd(_PAGE_TABLE | pt_phys));
- pt_phys += PAGE_SIZE;
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
}
- n_pt -= PTRS_PER_PMD;
- early_memunmap(pmd, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pmd_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
- PFN_DOWN(pmd_phys));
- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
- pmd_phys += PAGE_SIZE;
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
}
- n_pmd -= PTRS_PER_PUD;
- early_memunmap(pud, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pud_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
- if (n_p4d > 0)
- set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
- else
- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
- pud_phys += PAGE_SIZE;
- }
- if (n_p4d > 0) {
- save_pud -= PTRS_PER_P4D;
- early_memunmap(p4d, PAGE_SIZE);
- make_lowmem_page_readonly(__va(p4d_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
- set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
- p4d_phys += PAGE_SIZE;
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
}
- } while (++idx_p4d < n_p4d);
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
+ }
/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
@@ -2386,7 +2347,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
pv_mmu_ops.set_p4d = xen_set_p4d;
#endif
@@ -2396,7 +2357,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@@ -2460,14 +2421,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_p4d = xen_set_p4d_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_X86_64 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
--
2.14.2

View File

@ -0,0 +1,88 @@
From 8edefc098e2ff088d8c22d034bf8a5adf76b7295 Mon Sep 17 00:00:00 2001
From: Dongjiu Geng <gengdongjiu@huawei.com>
Date: Tue, 17 Oct 2017 16:02:20 +0800
Subject: [PATCH 078/231] ACPI / APEI: remove the unused dead-code for SEA/NMI
notification type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
For the SEA notification, the two functions ghes_sea_add() and
ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA
is defined. If not, it will return errors in the ghes_probe()
and not continue. If the probe is failed, the ghes_sea_remove()
also has no chance to be called. Hence, remove the unnecessary
handling when CONFIG_ACPI_APEI_SEA is not defined.
For the NMI notification, it has the same issue as SEA notification,
so also remove the unused dead-code for it.
Signed-off-by: Dongjiu Geng <gengdongjiu@huawei.com>
Tested-by: Tyler Baicar <tbaicar@codeaurora.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
(cherry picked from commit c49870e89f4d2c21c76ebe90568246bb0f3572b7)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 55f73c32ba6438e8886f348722d2b25aef129d40)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
drivers/acpi/apei/ghes.c | 33 +++++----------------------------
1 file changed, 5 insertions(+), 28 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 3628078ee351..4827176f838d 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -850,17 +850,8 @@ static void ghes_sea_remove(struct ghes *ghes)
synchronize_rcu();
}
#else /* CONFIG_ACPI_APEI_SEA */
-static inline void ghes_sea_add(struct ghes *ghes)
-{
- pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n",
- ghes->generic->header.source_id);
-}
-
-static inline void ghes_sea_remove(struct ghes *ghes)
-{
- pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n",
- ghes->generic->header.source_id);
-}
+static inline void ghes_sea_add(struct ghes *ghes) { }
+static inline void ghes_sea_remove(struct ghes *ghes) { }
#endif /* CONFIG_ACPI_APEI_SEA */
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
@@ -1062,23 +1053,9 @@ static void ghes_nmi_init_cxt(void)
init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
}
#else /* CONFIG_HAVE_ACPI_APEI_NMI */
-static inline void ghes_nmi_add(struct ghes *ghes)
-{
- pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n",
- ghes->generic->header.source_id);
- BUG();
-}
-
-static inline void ghes_nmi_remove(struct ghes *ghes)
-{
- pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n",
- ghes->generic->header.source_id);
- BUG();
-}
-
-static inline void ghes_nmi_init_cxt(void)
-{
-}
+static inline void ghes_nmi_add(struct ghes *ghes) { }
+static inline void ghes_nmi_remove(struct ghes *ghes) { }
+static inline void ghes_nmi_init_cxt(void) { }
#endif /* CONFIG_HAVE_ACPI_APEI_NMI */
static int ghes_probe(struct platform_device *ghes_dev)
--
2.14.2

View File

@ -0,0 +1,78 @@
From 05096d194a52721b3f4add5f854fc62296b82e72 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 20 Oct 2017 11:21:35 -0500
Subject: [PATCH 079/231] x86/asm: Don't use the confusing '.ifeq' directive
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
I find the '.ifeq <expression>' directive to be confusing. Reading it
quickly seems to suggest its opposite meaning, or that it's missing an
argument.
Improve readability by replacing all of its x86 uses with
'.if <expression> == 0'.
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andrei Vagin <avagin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/757da028e802c7e98d23fbab8d234b1063e161cf.1508516398.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 82c62fa0c49aa305104013cee4468772799bb391)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 981dedac1061fb47d0b04e07f6752be195d7e41a)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 2 +-
arch/x86/kernel/head_32.S | 2 +-
arch/x86/kernel/head_64.S | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 2e4fc6425f47..34adfe0221d2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -830,7 +830,7 @@ ENTRY(\sym)
ASM_CLAC
- .ifeq \has_error_code
+ .if \has_error_code == 0
pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 1f85ee8f9439..337a65377baf 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -435,7 +435,7 @@ ENTRY(early_idt_handler_array)
# 24(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 12daaa0b187f..a2d8541b1da4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -258,7 +258,7 @@ ENDPROC(start_cpu0)
ENTRY(early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
UNWIND_HINT_IRET_REGS
pushq $0 # Dummy error code, to make stack frame uniform
.else
--
2.14.2

View File

@ -0,0 +1,62 @@
From 183c7a0eddfea6359e977cc5216972b4cc875e0d Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 27 Oct 2017 13:11:10 +0900
Subject: [PATCH 080/231] x86/build: Beautify build log of syscall headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This makes the build log look nicer.
Before:
SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_32.h
SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_32_ia32.h
SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_64_x32.h
SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_64.h
SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_32.h
SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_64.h
SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_x32.h
After:
SYSTBL arch/x86/include/generated/asm/syscalls_32.h
SYSHDR arch/x86/include/generated/asm/unistd_32_ia32.h
SYSHDR arch/x86/include/generated/asm/unistd_64_x32.h
SYSTBL arch/x86/include/generated/asm/syscalls_64.h
SYSHDR arch/x86/include/generated/uapi/asm/unistd_32.h
SYSHDR arch/x86/include/generated/uapi/asm/unistd_64.h
SYSHDR arch/x86/include/generated/uapi/asm/unistd_x32.h
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-kbuild@vger.kernel.org
Link: http://lkml.kernel.org/r/1509077470-2735-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit af8e947079a7dab0480b5d6db6b093fd04b86fc9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d945957924e9b1a469516b4029fd384138c2cb69)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/syscalls/Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
index 57aa59fd140c..e34c7a931994 100644
--- a/arch/x86/entry/syscalls/Makefile
+++ b/arch/x86/entry/syscalls/Makefile
@@ -1,5 +1,5 @@
-out := $(obj)/../../include/generated/asm
-uapi := $(obj)/../../include/generated/uapi/asm
+out := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
# Create output directory if not already present
_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
--
2.14.2

View File

@ -0,0 +1,90 @@
From 32cae4ea1b3927843b18c32e8e1cdfab8a0b2c19 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sat, 28 Oct 2017 09:30:38 +0800
Subject: [PATCH 081/231] x86/mm/64: Rename the register_page_bootmem_memmap()
'size' parameter to 'nr_pages'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
register_page_bootmem_memmap()'s 3rd 'size' parameter is named
in a somewhat misleading fashion - rename it to 'nr_pages' which
makes the units of it much clearer.
Meanwhile rename the existing local variable 'nr_pages' to
'nr_pmd_pages', a more expressive name, to avoid conflict with
new function parameter 'nr_pages'.
(Also clean up the unnecessary parentheses in which get_order() is called.)
Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: akpm@linux-foundation.org
Link: http://lkml.kernel.org/r/1509154238-23250-1-git-send-email-bhe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 15670bfe19905b1dcbb63137f40d718b59d84479)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d73ad1d31ef8a44c6e5977c5123cbaa6d02e2035)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
include/linux/mm.h | 2 +-
arch/x86/mm/init_64.c | 10 +++++-----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 07630442bbf2..97f6ca707010 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2475,7 +2475,7 @@ void vmemmap_populate_print_last(void);
void vmemmap_free(unsigned long start, unsigned long end);
#endif
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
- unsigned long size);
+ unsigned long nr_pages);
enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..902983c8ea8c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1418,16 +1418,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
void register_page_bootmem_memmap(unsigned long section_nr,
- struct page *start_page, unsigned long size)
+ struct page *start_page, unsigned long nr_pages)
{
unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- unsigned int nr_pages;
+ unsigned int nr_pmd_pages;
struct page *page;
for (; addr < end; addr = next) {
@@ -1474,9 +1474,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
if (pmd_none(*pmd))
continue;
- nr_pages = 1 << (get_order(PMD_SIZE));
+ nr_pmd_pages = 1 << get_order(PMD_SIZE);
page = pmd_page(*pmd);
- while (nr_pages--)
+ while (nr_pmd_pages--)
get_page_bootmem(section_nr, page++,
SECTION_INFO);
}
--
2.14.2

View File

@ -0,0 +1,87 @@
From 59557ab0237e7474402d4240c55f119a86dadc7d Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Mon, 30 Oct 2017 18:20:29 -0700
Subject: [PATCH 082/231] x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU
features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Add a few new SSE/AVX/AVX512 instruction groups/features for enumeration
in /proc/cpuinfo: AVX512_VBMI2, GFNI, VAES, VPCLMULQDQ, AVX512_VNNI,
AVX512_BITALG.
CPUID.(EAX=7,ECX=0):ECX[bit 6] AVX512_VBMI2
CPUID.(EAX=7,ECX=0):ECX[bit 8] GFNI
CPUID.(EAX=7,ECX=0):ECX[bit 9] VAES
CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ
CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI
CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG
Detailed information of CPUID bits for these features can be found
in the Intel Architecture Instruction Set Extensions and Future Features
Programming Interface document (refer to Table 1-1. and Table 1-2.).
A copy of this document is available at
https://bugzilla.kernel.org/show_bug.cgi?id=197239
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Ricardo Neri <ricardo.neri@intel.com>
Cc: Yang Zhong <yang.zhong@intel.com>
Cc: bp@alien8.de
Link: http://lkml.kernel.org/r/1509412829-23380-1-git-send-email-gayatri.kammela@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit c128dbfa0f879f8ce7b79054037889b0b2240728)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit b29eb29c5aca4708d66fa977db40c779366636a2)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/cpufeatures.h | 6 ++++++
arch/x86/kernel/cpu/cpuid-deps.c | 6 ++++++
2 files changed, 12 insertions(+)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f4e145c4b06f..c465bd6613ed 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -297,6 +297,12 @@
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index c1d49842a411..c21f22d836ad 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -50,6 +50,12 @@ const static struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
--
2.14.2

View File

@ -0,0 +1,363 @@
From 9e6bc95ae1c4b92d9838ee8d2ee8b0e65f4e4469 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Fri, 27 Oct 2017 13:25:28 -0700
Subject: [PATCH 083/231] x86/mm: Relocate page fault error codes to traps.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Up to this point, only fault.c used the definitions of the page fault error
codes. Thus, it made sense to keep them within such file. Other portions of
code might be interested in those definitions too. For instance, the User-
Mode Instruction Prevention emulation code will use such definitions to
emulate a page fault when it is unable to successfully copy the results
of the emulated instructions to user space.
While relocating the error code enumeration, the prefix X86_ is used to
make it consistent with the rest of the definitions in traps.h. Of course,
code using the enumeration had to be updated as well. No functional changes
were performed.
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: ricardo.neri@intel.com
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Chen Yucong <slaoub@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Link: https://lkml.kernel.org/r/1509135945-13762-2-git-send-email-ricardo.neri-calderon@linux.intel.com
(cherry picked from commit 1067f030994c69ca1fba8c607437c8895dcf8509)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit a85a07ab9111e3c78797c20b60a664dbd5db4981)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/traps.h | 18 +++++++++
arch/x86/mm/fault.c | 88 +++++++++++++++++---------------------------
2 files changed, 52 insertions(+), 54 deletions(-)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index feb89dbe359d..8e5bf86f87e5 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -162,4 +162,22 @@ enum {
X86_TRAP_IRET = 32, /* 32, IRET Exception */
};
+/*
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
+ */
+enum x86_pf_error_code {
+ X86_PF_PROT = 1 << 0,
+ X86_PF_WRITE = 1 << 1,
+ X86_PF_USER = 1 << 2,
+ X86_PF_RSVD = 1 << 3,
+ X86_PF_INSTR = 1 << 4,
+ X86_PF_PK = 1 << 5,
+};
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4ee9eb916826..d3a57e7ad311 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -28,26 +28,6 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
-/*
- * Page fault error code bits:
- *
- * bit 0 == 0: no page found 1: protection fault
- * bit 1 == 0: read access 1: write access
- * bit 2 == 0: kernel-mode access 1: user-mode access
- * bit 3 == 1: use of reserved bit detected
- * bit 4 == 1: fault was an instruction fetch
- * bit 5 == 1: protection keys block access
- */
-enum x86_pf_error_code {
-
- PF_PROT = 1 << 0,
- PF_WRITE = 1 << 1,
- PF_USER = 1 << 2,
- PF_RSVD = 1 << 3,
- PF_INSTR = 1 << 4,
- PF_PK = 1 << 5,
-};
-
/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
@@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
*/
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
return 0;
instr = (void *)convert_ip_to_linear(current, regs);
@@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* siginfo so userspace can discover which protection key was set
* on the PTE.
*
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled a X86_PF_PK
* fault and that there was a VMA once we got in the fault
* handler. It does *not* guarantee that the VMA we find here
* was the one that we faulted on.
@@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
/*
* force_sig_info_fault() is called from a number of
* contexts, some of which have a VMA and some of which
- * do not. The PF_PK handing happens after we have a
+ * do not. The X86_PF_PK handing happens after we have a
* valid VMA, so we should never reach this without a
* valid VMA.
*/
@@ -693,7 +673,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (!oops_may_print())
return;
- if (error_code & PF_INSTR) {
+ if (error_code & X86_PF_INSTR) {
unsigned int level;
pgd_t *pgd;
pte_t *pte;
@@ -775,7 +755,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
*/
if (current->thread.sig_on_uaccess_err && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
- tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.error_code = error_code | X86_PF_USER;
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
@@ -894,7 +874,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* It's possible to have interrupts off here:
*/
@@ -915,7 +895,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
- if (unlikely((error_code & PF_INSTR) &&
+ if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
@@ -928,7 +908,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* are always protection faults.
*/
if (address >= TASK_SIZE_MAX)
- error_code |= PF_PROT;
+ error_code |= X86_PF_PROT;
if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
@@ -989,11 +969,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return false;
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return true;
/* this checks permission keys on the VMA: */
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return true;
return false;
}
@@ -1021,7 +1001,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
int code = BUS_ADRERR;
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -1049,14 +1029,14 @@ static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, unsigned int fault)
{
- if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
return;
}
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
@@ -1081,16 +1061,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
- if ((error_code & PF_WRITE) && !pte_write(*pte))
+ if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
- if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
- * changes, so no spurious fault will ever set PF_PK.
+ * changes, so no spurious fault will ever set X86_PF_PK.
*/
- if ((error_code & PF_PK))
+ if ((error_code & X86_PF_PK))
return 1;
return 1;
@@ -1136,8 +1116,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
* change, so user accesses are not expected to cause spurious
* faults.
*/
- if (error_code != (PF_WRITE | PF_PROT)
- && error_code != (PF_INSTR | PF_PROT))
+ if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+ error_code != (X86_PF_INSTR | X86_PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address);
@@ -1197,19 +1177,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return 1;
/*
* Make sure to check the VMA so that we do not perform
- * faults just to hit a PF_PK as soon as we fill in a
+ * faults just to hit a X86_PF_PK as soon as we fill in a
* page.
*/
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return 1;
- if (error_code & PF_WRITE) {
+ if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
@@ -1217,7 +1197,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
}
/* read, present: */
- if (unlikely(error_code & PF_PROT))
+ if (unlikely(error_code & X86_PF_PROT))
return 1;
/* read, not present: */
@@ -1240,7 +1220,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
if (!static_cpu_has(X86_FEATURE_SMAP))
return false;
- if (error_code & PF_USER)
+ if (error_code & X86_PF_USER)
return false;
if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1293,7 +1273,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
@@ -1321,7 +1301,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(kprobes_fault(regs)))
return;
- if (unlikely(error_code & PF_RSVD))
+ if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
@@ -1347,7 +1327,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
- error_code |= PF_USER;
+ error_code |= X86_PF_USER;
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1356,9 +1336,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
@@ -1378,7 +1358,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if ((error_code & PF_USER) == 0 &&
+ if (!(error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
@@ -1405,7 +1385,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
bad_area(regs, error_code, address);
return;
}
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
--
2.14.2

View File

@ -0,0 +1,104 @@
From e0cef0182f7d13edb48119653a4fc225b0287b5a Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Fri, 27 Oct 2017 13:25:29 -0700
Subject: [PATCH 084/231] x86/boot: Relocate definition of the initial state of
CR0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Both head_32.S and head_64.S utilize the same value to initialize the
control register CR0. Also, other parts of the kernel might want to access
this initial definition (e.g., emulation code for User-Mode Instruction
Prevention uses this state to provide a sane dummy value for CR0 when
emulating the smsw instruction). Thus, relocate this definition to a
header file from which it can be conveniently accessed.
Suggested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: ricardo.neri@intel.com
Cc: linux-mm@kvack.org
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-arch@vger.kernel.org
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Chen Yucong <slaoub@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lkml.kernel.org/r/1509135945-13762-3-git-send-email-ricardo.neri-calderon@linux.intel.com
(cherry picked from commit b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 27c31a88c22edab269abe17c0ac7db0351d26c5f)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/uapi/asm/processor-flags.h | 3 +++
arch/x86/kernel/head_32.S | 3 ---
arch/x86/kernel/head_64.S | 3 ---
3 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 185f3d10c194..39946d0a1d41 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -151,5 +151,8 @@
#define CX86_ARR_BASE 0xc4
#define CX86_RCR_BASE 0xdc
+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+ X86_CR0_PG)
#endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 337a65377baf..7bbcdb1ea31a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -213,9 +213,6 @@ ENTRY(startup_32_smp)
#endif
.Ldefault_entry:
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $(CR0_STATE & ~X86_CR0_PG),%eax
movl %eax,%cr0
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a2d8541b1da4..4117c1e0b3d2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -137,9 +137,6 @@ ENTRY(secondary_startup_64)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
--
2.14.2

View File

@ -0,0 +1,93 @@
From 34b5c16ae093e5663c398c87569793bfbec1c7ca Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Fri, 27 Oct 2017 13:25:30 -0700
Subject: [PATCH 085/231] ptrace,x86: Make user_64bit_mode() available to
32-bit builds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
In its current form, user_64bit_mode() can only be used when CONFIG_X86_64
is selected. This implies that code built with CONFIG_X86_64=n cannot use
it. If a piece of code needs to be built for both CONFIG_X86_64=y and
CONFIG_X86_64=n and wants to use this function, it needs to wrap it in
an #ifdef/#endif; potentially, in multiple places.
This can be easily avoided with a single #ifdef/#endif pair within
user_64bit_mode() itself.
Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: ricardo.neri@intel.com
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Qiaowei Ren <qiaowei.ren@intel.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Chen Yucong <slaoub@gmail.com>
Cc: Adam Buchbinder <adam.buchbinder@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Garnier <thgarnie@google.com>
Link: https://lkml.kernel.org/r/1509135945-13762-4-git-send-email-ricardo.neri-calderon@linux.intel.com
(cherry picked from commit e27c310af5c05cf876d9cad006928076c27f54d4)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 20ddf08f867d3d96788299cd2fb7676590d64250)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/ptrace.h | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2b5d686ea9f3..ea78a8438a8a 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -115,9 +115,9 @@ static inline int v8086_mode(struct pt_regs *regs)
#endif
}
-#ifdef CONFIG_X86_64
static inline bool user_64bit_mode(struct pt_regs *regs)
{
+#ifdef CONFIG_X86_64
#ifndef CONFIG_PARAVIRT
/*
* On non-paravirt systems, this is the only long mode CPL 3
@@ -128,8 +128,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
/* Headers are too twisted for this to go in paravirt.h. */
return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
#endif
+#else /* !CONFIG_X86_64 */
+ return false;
+#endif
}
+#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
#endif
--
2.14.2

View File

@ -0,0 +1,75 @@
From befef5ef70f959cd51694298c4370557e5d846cf Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:58:58 -0700
Subject: [PATCH 086/231] x86/entry/64: Remove the restore_c_regs_and_iret
label
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The only user was the 64-bit opportunistic SYSRET failure path, and
that path didn't really need it. This change makes the
opportunistic SYSRET code a bit more straightforward and gets rid of
the label.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/be3006a7ad3326e3458cf1cc55d416252cbe1986.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 9da78ba6b47b46428cfdfc0851511ab29c869798)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 629c8b858cbe72e88e7f44a8f10e1b434ab80721)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 34adfe0221d2..fac354ddf056 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -245,7 +245,6 @@ entry_SYSCALL64_slow_path:
call do_syscall_64 /* returns with IRQs disabled */
return_from_SYSCALL_64:
- RESTORE_EXTRA_REGS
TRACE_IRQS_IRETQ /* we're about to change IF */
/*
@@ -314,6 +313,7 @@ return_from_SYSCALL_64:
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
+ RESTORE_EXTRA_REGS
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
UNWIND_HINT_EMPTY
@@ -321,7 +321,7 @@ syscall_return_via_sysret:
opportunistic_sysret_failed:
SWAPGS
- jmp restore_c_regs_and_iret
+ jmp restore_regs_and_iret
END(entry_SYSCALL_64)
ENTRY(stub_ptregs_64)
@@ -638,7 +638,6 @@ retint_kernel:
*/
GLOBAL(restore_regs_and_iret)
RESTORE_EXTRA_REGS
-restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
--
2.14.2

View File

@ -0,0 +1,135 @@
From 6c390918ecf72524840bc174fb5f9d007db5a9a8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:58:59 -0700
Subject: [PATCH 087/231] x86/entry/64: Split the IRET-to-user and
IRET-to-kernel paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
These code paths will diverge soon.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 64adfba0aeb668304d171c383ac80b22158ec128)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 34 +++++++++++++++++++++++++---------
arch/x86/entry/entry_64_compat.S | 2 +-
arch/x86/kernel/head_64.S | 2 +-
3 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index fac354ddf056..e546441fbec3 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -321,7 +321,7 @@ syscall_return_via_sysret:
opportunistic_sysret_failed:
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode
END(entry_SYSCALL_64)
ENTRY(stub_ptregs_64)
@@ -423,7 +423,7 @@ ENTRY(ret_from_fork)
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode
1:
/* kernel thread */
@@ -612,7 +612,20 @@ GLOBAL(retint_user)
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
SWAPGS
- jmp restore_regs_and_iret
+
+GLOBAL(restore_regs_and_return_to_usermode)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates user mode. */
+ testl $3, CS(%rsp)
+ jnz 1f
+ ud2
+1:
+#endif
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+ INTERRUPT_RETURN
+
/* Returning to kernel space */
retint_kernel:
@@ -632,11 +645,14 @@ retint_kernel:
*/
TRACE_IRQS_IRETQ
-/*
- * At this label, code paths which return to kernel and to user,
- * which come from interrupts/exception and from syscalls, merge.
- */
-GLOBAL(restore_regs_and_iret)
+GLOBAL(restore_regs_and_return_to_kernel)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates kernel mode. */
+ testl $3, CS(%rsp)
+ jz 1f
+ ud2
+1:
+#endif
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1340,7 +1356,7 @@ ENTRY(nmi)
* work, because we don't want to enable interrupts.
*/
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode
.Lnmi_from_kernel:
/*
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index d8468ba24be0..2b3a88feaa2b 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -337,7 +337,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode
END(entry_INT80_compat)
ALIGN
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 4117c1e0b3d2..e785734980ad 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -311,7 +311,7 @@ early_idt_handler_common:
20:
decl early_recursion_flag(%rip)
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_kernel
END(early_idt_handler_common)
__INITDATA
--
2.14.2

View File

@ -0,0 +1,156 @@
From 271bc7d0577bef9f344187eb45ba8682eed242f9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:00 -0700
Subject: [PATCH 088/231] x86/entry/64: Move SWAPGS into the common
IRET-to-usermode path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
All of the code paths that ended up doing IRET to usermode did
SWAPGS immediately beforehand. Move the SWAPGS into the common
code.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/27fd6f45b7cd640de38fb9066fd0349bcd11f8e1.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 8a055d7f411d41755ce30db5bb65b154777c4b78)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 62a85594f9be3baeb2495089f1c2980bc497d03b)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 32 ++++++++++++++------------------
arch/x86/entry/entry_64_compat.S | 3 +--
2 files changed, 15 insertions(+), 20 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e546441fbec3..7c8258e3ad2d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -249,12 +249,14 @@ return_from_SYSCALL_64:
/*
* Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
+ * a completely clean 64-bit userspace context. If we're not,
+ * go to the slow exit path.
*/
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
- cmpq %rcx, %r11 /* RCX == RIP */
- jne opportunistic_sysret_failed
+
+ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
+ jne swapgs_restore_regs_and_return_to_usermode
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
@@ -272,14 +274,14 @@ return_from_SYSCALL_64:
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
movq R11(%rsp), %r11
cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
/*
* SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
@@ -300,12 +302,12 @@ return_from_SYSCALL_64:
* would never get past 'stuck_here'.
*/
testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
+ jnz swapgs_restore_regs_and_return_to_usermode
/* nothing to check for RSP */
cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode
/*
* We win! This label is here just for ease of understanding
@@ -318,10 +320,6 @@ syscall_return_via_sysret:
movq RSP(%rsp), %rsp
UNWIND_HINT_EMPTY
USERGS_SYSRET64
-
-opportunistic_sysret_failed:
- SWAPGS
- jmp restore_regs_and_return_to_usermode
END(entry_SYSCALL_64)
ENTRY(stub_ptregs_64)
@@ -422,8 +420,7 @@ ENTRY(ret_from_fork)
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
- SWAPGS
- jmp restore_regs_and_return_to_usermode
+ jmp swapgs_restore_regs_and_return_to_usermode
1:
/* kernel thread */
@@ -611,9 +608,8 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
- SWAPGS
-GLOBAL(restore_regs_and_return_to_usermode)
+GLOBAL(swapgs_restore_regs_and_return_to_usermode)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates user mode. */
testl $3, CS(%rsp)
@@ -621,6 +617,7 @@ GLOBAL(restore_regs_and_return_to_usermode)
ud2
1:
#endif
+ SWAPGS
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1355,8 +1352,7 @@ ENTRY(nmi)
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
*/
- SWAPGS
- jmp restore_regs_and_return_to_usermode
+ jmp swapgs_restore_regs_and_return_to_usermode
.Lnmi_from_kernel:
/*
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 2b3a88feaa2b..be745b7a3e3e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -336,8 +336,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
- SWAPGS
- jmp restore_regs_and_return_to_usermode
+ jmp swapgs_restore_regs_and_return_to_usermode
END(entry_INT80_compat)
ALIGN
--
2.14.2

View File

@ -0,0 +1,103 @@
From 958fcb45b64535b87e3cfaef15a5cb41595e4187 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:01 -0700
Subject: [PATCH 089/231] x86/entry/64: Simplify reg restore code in the
standard IRET paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
The old code restored all the registers with movq instead of pop.
In theory, this was done because some CPUs have higher movq
throughput, but any gain there would be tiny and is almost certainly
outweighed by the higher text size.
This saves 96 bytes of text.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/ad82520a207ccd851b04ba613f4f752b33ac05f7.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit e872045bfd9c465a8555bab4b8567d56a4d2d3bb)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit f926575cd370de4052e89477582b349af5664a56)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/calling.h | 21 +++++++++++++++++++++
arch/x86/entry/entry_64.S | 12 ++++++------
2 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 640aafebdc00..0b9dd8123701 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -151,6 +151,27 @@ For 32-bit we have the following conventions - kernel is built with
UNWIND_HINT_REGS offset=\offset extra=0
.endm
+ .macro POP_EXTRA_REGS
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ .endm
+
+ .macro POP_C_REGS
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ .endm
+
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
movq 6*8(%rsp), %r11
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7c8258e3ad2d..a1a86e782a0e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -618,9 +618,9 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
1:
#endif
SWAPGS
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
+ POP_EXTRA_REGS
+ POP_C_REGS
+ addq $8, %rsp /* skip regs->orig_ax */
INTERRUPT_RETURN
@@ -650,9 +650,9 @@ GLOBAL(restore_regs_and_return_to_kernel)
ud2
1:
#endif
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
+ POP_EXTRA_REGS
+ POP_C_REGS
+ addq $8, %rsp /* skip regs->orig_ax */
INTERRUPT_RETURN
ENTRY(native_iret)
--
2.14.2

View File

@ -0,0 +1,70 @@
From d8fdea47d7fc1177aa0843a49dc89422ac6f4fea Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:02 -0700
Subject: [PATCH 090/231] x86/entry/64: Shrink paranoid_exit_restore and make
labels local
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
paranoid_exit_restore was a copy of restore_regs_and_return_to_kernel.
Merge them and make the paranoid_exit internal labels local.
Keeping .Lparanoid_exit makes the code a bit shorter because it
allows a 2-byte jnz instead of a 5-byte jnz.
Saves 96 bytes of text.
( This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS
kernel, but fixing that would make the code rather messy. )
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/510d66a1895cda9473c84b1086f0bb974f22de6a.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit e53178328c9b96fbdbc719e78c93b5687ee007c3)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit fb53fe10add935c3d0eb63199e43426eaf3b4299)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a1a86e782a0e..6995f7e08aa1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1136,17 +1136,14 @@ ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
- jnz paranoid_exit_no_swapgs
+ jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
-paranoid_exit_no_swapgs:
+ jmp .Lparanoid_exit_restore
+.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
- INTERRUPT_RETURN
+.Lparanoid_exit_restore:
+ jmp restore_regs_and_return_to_kernel
END(paranoid_exit)
/*
--
2.14.2

View File

@ -0,0 +1,61 @@
From d4b1f2361974bfffa04a528fb2ad393a55d13841 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:03 -0700
Subject: [PATCH 091/231] x86/entry/64: Use pop instead of movq in
syscall_return_via_sysret
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Saves 64 bytes.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/6609b7f74ab31c36604ad746e019ea8495aec76c.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 4fbb39108f972437c44e5ffa781b56635d496826)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 1e9a9d5ef9f65eeb26eb8f0974dd3e693894baf1)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6995f7e08aa1..33a416c7df2d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -315,10 +315,18 @@ return_from_SYSCALL_64:
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RSP(%rsp), %rsp
UNWIND_HINT_EMPTY
+ POP_EXTRA_REGS
+ popq %rsi /* skip r11 */
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rsi /* skip rcx */
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ movq RSP-ORIG_RAX(%rsp), %rsp
USERGS_SYSRET64
END(entry_SYSCALL_64)
--
2.14.2

View File

@ -0,0 +1,60 @@
From c48697564de6da427f1e97a38192f4d456223942 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:04 -0700
Subject: [PATCH 092/231] x86/entry/64: Merge the fast and slow SYSRET paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
They did almost the same thing. Remove a bunch of pointless
instructions (mostly hidden in macros) and reduce cognitive load by
merging them.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1204e20233fcab9130a1ba80b3b1879b5db3fc1f.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit a512210643da8082cb44181dba8b18e752bd68f0)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 7c4575d8bb2d01960ba9b9840fa22460e0179eca)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 33a416c7df2d..87be1cd1fa88 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -220,10 +220,9 @@ entry_SYSCALL_64_fastpath:
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RSP(%rsp), %rsp
+ addq $6*8, %rsp /* skip extra regs -- they were preserved */
UNWIND_HINT_EMPTY
- USERGS_SYSRET64
+ jmp .Lpop_c_regs_except_rcx_r11_and_sysret
1:
/*
@@ -317,6 +316,7 @@ syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
UNWIND_HINT_EMPTY
POP_EXTRA_REGS
+.Lpop_c_regs_except_rcx_r11_and_sysret:
popq %rsi /* skip r11 */
popq %r10
popq %r9
--
2.14.2

View File

@ -0,0 +1,57 @@
From c801c4e1ba695ba230e97f626abaeb0c16393b09 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:05 -0700
Subject: [PATCH 093/231] x86/entry/64: Use POP instead of MOV to restore regs
on NMI return
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This gets rid of the last user of the old RESTORE_..._REGS infrastructure.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/652a260f17a160789bc6a41d997f98249b73e2ab.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 471ee4832209e986029b9fabdaad57b1eecb856b)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 3c5771a43d8f00e53081871027fea891a091ff5e)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 87be1cd1fa88..4eff3aca54ed 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1572,11 +1572,14 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
+ POP_EXTRA_REGS
+ POP_C_REGS
- /* Point RSP at the "iret" frame. */
- REMOVE_PT_GPREGS_FROM_STACK 6*8
+ /*
+ * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
+ * at the "iret" frame.
+ */
+ addq $6*8, %rsp
/*
* Clear "NMI executing". Set DF first so that we can easily
--
2.14.2

View File

@ -0,0 +1,105 @@
From 8837585aa116d1aa832e524442a1e9953d17a196 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:06 -0700
Subject: [PATCH 094/231] x86/entry/64: Remove the RESTORE_..._REGS
infrastructure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
All users of RESTORE_EXTRA_REGS, RESTORE_C_REGS and such, and
REMOVE_PT_GPREGS_FROM_STACK are gone. Delete the macros.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c32672f6e47c561893316d48e06c7656b1039a36.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit c39858de696f0cc160a544455e8403d663d577e9)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit d248c62028c5467cd5a5ce06d344e3fb330da3ec)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/calling.h | 52 ------------------------------------------------
1 file changed, 52 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 0b9dd8123701..1895a685d3dd 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -141,16 +141,6 @@ For 32-bit we have the following conventions - kernel is built with
UNWIND_HINT_REGS offset=\offset
.endm
- .macro RESTORE_EXTRA_REGS offset=0
- movq 0*8+\offset(%rsp), %r15
- movq 1*8+\offset(%rsp), %r14
- movq 2*8+\offset(%rsp), %r13
- movq 3*8+\offset(%rsp), %r12
- movq 4*8+\offset(%rsp), %rbp
- movq 5*8+\offset(%rsp), %rbx
- UNWIND_HINT_REGS offset=\offset extra=0
- .endm
-
.macro POP_EXTRA_REGS
popq %r15
popq %r14
@@ -172,48 +162,6 @@ For 32-bit we have the following conventions - kernel is built with
popq %rdi
.endm
- .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
- .if \rstor_r11
- movq 6*8(%rsp), %r11
- .endif
- .if \rstor_r8910
- movq 7*8(%rsp), %r10
- movq 8*8(%rsp), %r9
- movq 9*8(%rsp), %r8
- .endif
- .if \rstor_rax
- movq 10*8(%rsp), %rax
- .endif
- .if \rstor_rcx
- movq 11*8(%rsp), %rcx
- .endif
- .if \rstor_rdx
- movq 12*8(%rsp), %rdx
- .endif
- movq 13*8(%rsp), %rsi
- movq 14*8(%rsp), %rdi
- UNWIND_HINT_IRET_REGS offset=16*8
- .endm
- .macro RESTORE_C_REGS
- RESTORE_C_REGS_HELPER 1,1,1,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_RAX
- RESTORE_C_REGS_HELPER 0,1,1,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_RCX
- RESTORE_C_REGS_HELPER 1,0,1,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_R11
- RESTORE_C_REGS_HELPER 1,1,0,1,1
- .endm
- .macro RESTORE_C_REGS_EXCEPT_RCX_R11
- RESTORE_C_REGS_HELPER 1,0,0,1,1
- .endm
-
- .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
- subq $-(15*8+\addskip), %rsp
- .endm
-
.macro icebp
.byte 0xf1
.endm
--
2.14.2

View File

@ -0,0 +1,105 @@
From 47b64e9de8bba4e6ccd0976bce6cf99446daf82e Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 2 Nov 2017 00:59:07 -0700
Subject: [PATCH 095/231] xen, x86/entry/64: Add xen NMI trap entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Instead of trying to execute any NMI via the bare metal's NMI trap
handler use a Xen specific one for PV domains, like we do for e.g.
debug traps. As in a PV domain the NMI is handled via the normal
kernel stack this is the correct thing to do.
This will enable us to get rid of the very fragile and questionable
dependencies between the bare metal NMI handler and Xen assumptions
believed to be broken anyway.
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/5baf5c0528d58402441550c5770b98e7961e7680.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 43e4111086a70c78bedb6ad990bee97f17b27a6e)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 20c970e03b42141abf6c45938ce6d4fdc3555921)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/traps.h | 2 +-
arch/x86/xen/enlighten_pv.c | 2 +-
arch/x86/entry/entry_64.S | 2 +-
arch/x86/xen/xen-asm_64.S | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 8e5bf86f87e5..b052a7621ca1 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -55,9 +55,9 @@ asmlinkage void simd_coprocessor_error(void);
#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xennmi(void);
asmlinkage void xen_xendebug(void);
asmlinkage void xen_xenint3(void);
-asmlinkage void xen_nmi(void);
asmlinkage void xen_overflow(void);
asmlinkage void xen_bounds(void);
asmlinkage void xen_invalid_op(void);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 69b9deff7e5c..8da4eff19c2a 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -600,7 +600,7 @@ static struct trap_array_entry trap_array[] = {
#ifdef CONFIG_X86_MCE
{ machine_check, xen_machine_check, true },
#endif
- { nmi, xen_nmi, true },
+ { nmi, xen_xennmi, true },
{ overflow, xen_overflow, false },
#ifdef CONFIG_IA32_EMULATION
{ entry_INT80_compat, xen_entry_INT80_compat, false },
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4eff3aca54ed..5a6aba7cf3bd 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1091,6 +1091,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1
#ifdef CONFIG_XEN
+idtentry xennmi do_nmi has_error_code=0
idtentry xendebug do_debug has_error_code=0
idtentry xenint3 do_int3 has_error_code=0
#endif
@@ -1253,7 +1254,6 @@ ENTRY(error_exit)
END(error_exit)
/* Runs on exception stack */
-/* XXX: broken on Xen PV */
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
/*
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index dae2cc33afb5..286ecc198562 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -29,7 +29,7 @@ xen_pv_trap debug
xen_pv_trap xendebug
xen_pv_trap int3
xen_pv_trap xenint3
-xen_pv_trap nmi
+xen_pv_trap xennmi
xen_pv_trap overflow
xen_pv_trap bounds
xen_pv_trap invalid_op
--
2.14.2

View File

@ -0,0 +1,117 @@
From 4a112915e611296f0d196bb6cb2baa99af0e9148 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:08 -0700
Subject: [PATCH 096/231] x86/entry/64: De-Xen-ify our NMI code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
Xen PV is fundamentally incompatible with our fancy NMI code: it
doesn't use IST at all, and Xen entries clobber two stack slots
below the hardware frame.
Drop Xen PV support from our NMI code entirely.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/bfbe711b5ae03f672f8848999a8eb2711efc7f98.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 929bacec21478a72c78e4f29f98fb799bd00105a)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit ffc372909c1701c4fdd2bde7861692573ef381a7)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/entry/entry_64.S | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 5a6aba7cf3bd..05501c781c20 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1253,9 +1253,13 @@ ENTRY(error_exit)
jmp retint_user
END(error_exit)
-/* Runs on exception stack */
+/*
+ * Runs on exception stack. Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ */
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
+
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
@@ -1313,7 +1317,7 @@ ENTRY(nmi)
* stacks lest we corrupt the "NMI executing" variable.
*/
- SWAPGS_UNSAFE_STACK
+ swapgs
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1478,7 +1482,7 @@ nested_nmi_out:
popq %rdx
/* We are returning to kernel mode, so this cannot result in a fault. */
- INTERRUPT_RETURN
+ iretq
first_nmi:
/* Restore rdx. */
@@ -1509,7 +1513,7 @@ first_nmi:
pushfq /* RFLAGS */
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
- INTERRUPT_RETURN /* continues at repeat_nmi below */
+ iretq /* continues at repeat_nmi below */
UNWIND_HINT_IRET_REGS
1:
#endif
@@ -1584,20 +1588,22 @@ nmi_restore:
/*
* Clear "NMI executing". Set DF first so that we can easily
* distinguish the remaining code between here and IRET from
- * the SYSCALL entry and exit paths. On a native kernel, we
- * could just inspect RIP, but, on paravirt kernels,
- * INTERRUPT_RETURN can translate into a jump into a
- * hypercall page.
+ * the SYSCALL entry and exit paths.
+ *
+ * We arguably should just inspect RIP instead, but I (Andy) wrote
+ * this code when I had the misapprehension that Xen PV supported
+ * NMIs, and Xen PV would break that approach.
*/
std
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
- * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
- * stack in a single instruction. We are returning to kernel
- * mode, so this cannot result in a fault.
+ * iretq reads the "iret" frame and exits the NMI stack in a
+ * single instruction. We are returning to kernel mode, so this
+ * cannot result in a fault. Similarly, we don't need to worry
+ * about espfix64 on the way back to kernel mode.
*/
- INTERRUPT_RETURN
+ iretq
END(nmi)
ENTRY(ignore_sysret)
--
2.14.2

View File

@ -0,0 +1,145 @@
From f16330a748c8b8db495673108d72fcfc2873d377 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:09 -0700
Subject: [PATCH 097/231] x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update
code out of native_load_sp0()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This causes the MSR_IA32_SYSENTER_CS write to move out of the
paravirt callback. This shouldn't affect Xen PV: Xen already ignores
MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support
vm86() in a useful way.
Note to any potential backporters: This patch won't break lguest, as
lguest didn't have any SYSENTER support at all.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit bd7dc5a6afac719d8ce4092391eef2c7e83c2a75)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 779e32d0da9a547f3b11fbecac8287e458ba67f5)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/processor.h | 7 -------
arch/x86/include/asm/switch_to.h | 12 ++++++++++++
arch/x86/kernel/process_32.c | 4 +++-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/vm86_32.c | 6 +++++-
5 files changed, 21 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 028245e1c42b..ee37fb86900a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -513,13 +513,6 @@ static inline void
native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
{
tss->x86_tss.sp0 = thread->sp0;
-#ifdef CONFIG_X86_32
- /* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
-#endif
}
static inline void native_swapgs(void)
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index fcc5cd387fd1..7ae8caffbada 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -72,4 +72,16 @@ do { \
((last) = __switch_to_asm((prev), (next))); \
} while (0)
+#ifdef CONFIG_X86_32
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+ return;
+
+ this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+}
+#endif
+
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 22802162eeb9..2e42b66b8ca4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Reload esp0 and cpu_current_top_of_stack. This changes
- * current_thread_info().
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
*/
load_sp0(tss, next);
+ refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1e7701c4cd80..565daaa6f18d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
this_cpu_write(current_task, next_p);
- /* Reload esp0 and ss1. This changes current_thread_info(). */
+ /* Reload sp0. */
load_sp0(tss, next);
/*
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 7924a5356c8a..5bc1c3ab6287 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -54,6 +54,7 @@
#include <asm/irq.h>
#include <asm/traps.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* Known problems:
@@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &tsk->thread);
+ refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
put_cpu();
@@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
/* make room for real-mode segments */
tsk->thread.sp0 += 16;
- if (static_cpu_has(X86_FEATURE_SEP))
+ if (static_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
+ refresh_sysenter_cs(&tsk->thread);
+ }
load_sp0(tss, &tsk->thread);
put_cpu();
--
2.14.2

View File

@ -0,0 +1,238 @@
From 3868ecbc68a9713951f3008ef3af3b9da7e67e60 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:10 -0700
Subject: [PATCH 098/231] x86/entry/64: Pass SP0 directly to load_sp0()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
load_sp0() had an odd signature:
void load_sp0(struct tss_struct *tss, struct thread_struct *thread);
Simplify it to:
void load_sp0(unsigned long sp0);
Also simplify a few get_cpu()/put_cpu() sequences to
preempt_disable()/preempt_enable().
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit da51da189a24bb9b7e2d5a123be096e51a4695a5)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 41f6a89b0be4d052a6af59df5e56102d4e4c79ef)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/paravirt.h | 5 ++---
arch/x86/include/asm/paravirt_types.h | 2 +-
arch/x86/include/asm/processor.h | 9 ++++-----
arch/x86/kernel/cpu/common.c | 4 ++--
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/vm86_32.c | 14 ++++++--------
arch/x86/xen/enlighten_pv.c | 7 +++----
8 files changed, 20 insertions(+), 25 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 12deec722cf0..43d4f90edebc 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -15,10 +15,9 @@
#include <linux/cpumask.h>
#include <asm/frame.h>
-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
{
- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+ PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
}
/* The paravirtualized CPUID instruction. */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 42873edd9f9d..e3953a1e2b57 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -133,7 +133,7 @@ struct pv_cpu_ops {
void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+ void (*load_sp0)(unsigned long sp0);
void (*set_iopl_mask)(unsigned mask);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ee37fb86900a..85ddfc1a9bb5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -510,9 +510,9 @@ static inline void native_set_iopl_mask(unsigned mask)
}
static inline void
-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+native_load_sp0(unsigned long sp0)
{
- tss->x86_tss.sp0 = thread->sp0;
+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
@@ -537,10 +537,9 @@ static inline unsigned long current_top_of_stack(void)
#else
#define __cpuid native_cpuid
-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
{
- native_load_sp0(tss, thread);
+ native_load_sp0(sp0);
}
#define set_iopl_mask native_set_iopl_mask
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ef7b1ba56363..6562acbfc4e0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1570,7 +1570,7 @@ void cpu_init(void)
BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
- load_sp0(t, &current->thread);
+ load_sp0(current->thread.sp0);
set_tss_desc(cpu, t);
load_TR_desc();
load_mm_ldt(&init_mm);
@@ -1624,7 +1624,7 @@ void cpu_init(void)
BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
- load_sp0(t, thread);
+ load_sp0(thread->sp0);
set_tss_desc(cpu, t);
load_TR_desc();
load_mm_ldt(&init_mm);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2e42b66b8ca4..48a3f240f565 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
- load_sp0(tss, next);
+ load_sp0(next->sp0);
refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 565daaa6f18d..37b933628a8b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -466,7 +466,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);
/* Reload sp0. */
- load_sp0(tss, next);
+ load_sp0(next->sp0);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5bc1c3ab6287..0f1d92cd20ad 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -94,7 +94,6 @@
void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86plus_struct __user *user;
struct vm86 *vm86 = current->thread.vm86;
@@ -146,13 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
do_exit(SIGSEGV);
}
- tss = &per_cpu(cpu_tss, get_cpu());
+ preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &tsk->thread);
+ load_sp0(tsk->thread.sp0);
refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
- put_cpu();
+ preempt_enable();
memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
@@ -238,7 +237,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86 *vm86 = tsk->thread.vm86;
struct kernel_vm86_regs vm86regs;
@@ -366,8 +364,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86->saved_sp0 = tsk->thread.sp0;
lazy_save_gs(vm86->regs32.gs);
- tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
+ preempt_disable();
tsk->thread.sp0 += 16;
if (static_cpu_has(X86_FEATURE_SEP)) {
@@ -375,8 +373,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
refresh_sysenter_cs(&tsk->thread);
}
- load_sp0(tss, &tsk->thread);
- put_cpu();
+ load_sp0(tsk->thread.sp0);
+ preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 8da4eff19c2a..e7b213047724 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
}
}
-static void xen_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static void xen_load_sp0(unsigned long sp0)
{
struct multicall_space mcs;
mcs = xen_mc_entry(0);
- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
- tss->x86_tss.sp0 = thread->sp0;
+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}
void xen_set_iopl_mask(unsigned mask)
--
2.14.2

View File

@ -0,0 +1,48 @@
From 181d224dabca9a9061a6955cf3d49a4eba7294bf Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:11 -0700
Subject: [PATCH 099/231] x86/entry: Add task_top_of_stack() to find the top of
a task's stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
This will let us get rid of a few places that hardcode accesses to
thread.sp0.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b49b3f95a8ff858c40c9b0f5b32be0355324327d.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 3500130b84a3cdc5b6796eba1daf178944935efe)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit f1078e10e361afaeb22ee72c54d5ad397e19728d)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/include/asm/processor.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 85ddfc1a9bb5..f83fbf1b6dd9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -788,6 +788,8 @@ static inline void spin_lock_prefetch(const void *x)
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
TOP_OF_KERNEL_STACK_PADDING)
+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
--
2.14.2

View File

@ -0,0 +1,99 @@
From 232ab20b1af958a04a82fb7290a1e54c3632f771 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 2 Nov 2017 00:59:12 -0700
Subject: [PATCH 100/231] x86/xen/64, x86/entry/64: Clean up SP code in
cpu_initialize_context()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CVE-2017-5754
I'm removing thread_struct::sp0, and Xen's usage of it is slightly
dubious and unnecessary. Use appropriate helpers instead.
While we're at at, reorder the code slightly to make it more obvious
what's going on.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d5b9a3da2b47c68325bd2bbe8f82d9554dee0d0f.1509609304.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit f16b3da1dc936c0f8121741d0a1731bf242f2f56)
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
(cherry picked from commit 27c60a1f6c49062151f67042458a523386cc3dc5)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
---
arch/x86/xen/smp_pv.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 51471408fdd1..8c0e047d0b80 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -13,6 +13,7 @@
* single-threaded.
*/
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/smp.h>
@@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
#endif
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+ /*
+ * Bring up the CPU in cpu_bringup_and_idle() with the stack
+ * pointing just below where pt_regs would be if it were a normal
+ * kernel entry.
+ */
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);
xen_copy_trap_info(ctxt->trap_ctxt);
@@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->gdt_frames[0] = gdt_mfn;
ctxt->gdt_ents = GDT_ENTRIES;
+ /*
+ * Set SS:SP that Xen will use when entering guest kernel mode
+ * from guest user mode. Subsequent calls to load_sp0() can
+ * change this value.
+ */
ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->kernel_sp = task_top_of_stack(idle);
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
@@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
(unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
- ctxt->user_regs.cs = __KERNEL_CS;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
BUG();
--
2.14.2

Some files were not shown because too many files have changed in this diff Show More