From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
To: speck@linutronix.de
Subject: [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
Date: Wed, 20 Jun 2018 16:57:02 -0400 [thread overview]
Message-ID: <20180620205701.GD28309@char.us.oracle.com> (raw)
In-Reply-To: <20180620204352.066357420@localhost.localdomain>
On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
>
> Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> If it is active, the displacement flush is unnecessary. Tested on
> a Coffee Lake machine.
Oops — that nice commit description you wrote, Paolo, got messed up. Sorry!
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
> arch/x86/include/asm/kvm_host.h | 7 ++++-
> arch/x86/kvm/mmu.c | 1 +
> arch/x86/kvm/svm.c | 3 ++-
> arch/x86/kvm/vmx.c | 46 +++++++++++++++++++++++++++++++-
> arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++++++++++--
> 5 files changed, 111 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c13cd28d9d1b..7d7626cffc21 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
>
> /* be preempted when it's in kernel-mode(cpl=0) */
> bool preempted_in_kernel;
> +
There is a giant blank space right above here. I believe Paolo's original patch had it too.
> + /* for L1 terminal fault vulnerability */
> + bool vcpu_unconfined;
> };
>
> struct kvm_lpage_info {
> @@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
> u64 signal_exits;
> u64 irq_window_exits;
> u64 nmi_window_exits;
> + u64 l1d_flush;
> u64 halt_exits;
> u64 halt_successful_poll;
> u64 halt_attempted_poll;
> @@ -939,7 +943,7 @@ struct kvm_x86_ops {
> void (*vcpu_free)(struct kvm_vcpu *vcpu);
> void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
>
> - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
> + void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
> void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
> void (*vcpu_put)(struct kvm_vcpu *vcpu);
>
> @@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>
> void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
> struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>
> static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
> {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d594690d8b95..4d4e3dc2494e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
> {
> int r = 1;
>
> + vcpu->arch.vcpu_unconfined = true;
> switch (vcpu->arch.apf.host_apf_reason) {
> default:
> trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f059a73f0fd0..302afb2643fa 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
> svm->asid_generation--;
> }
>
> -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> {
> + *need_l1d_flush = false;
> }
>
> static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..ba83d0cb9b03 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
> };
> MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
> +
> static bool __read_mostly enable_vpid = 1;
> module_param_named(vpid, enable_vpid, bool, 0444);
>
> @@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
> vmx->guest_msrs[i].mask);
> }
>
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> +{
> + vmx_save_host_state(vcpu);
> + if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {
and perhaps also add:
	|| !boot_cpu_has(X86_BUG_L1TF)
> + *need_l1d_flush = false;
> + return;
> + }
> +
> + switch (vmentry_l1d_flush) {
> + case 0:
> + *need_l1d_flush = false;
> + break;
> + case 1:
> + /*
> + * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> + * setting vcpu->arch.vcpu_unconfined. Currently this happens in the
> + * following cases:
> + * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> + * nested hypervisor *and* by KVM on bare metal, so we just do it
> + * on every nested entry. Nested hypervisors do not bother clearing
> + * the cache.
> + * - anything that runs the emulator (the slow paths for EPT misconfig
> + * or I/O instruction)
> + * - anything that can cause get_user_pages (EPT violation, and again
> + * the slow paths for EPT misconfig or I/O instruction)
> + * - anything that can run code outside KVM (external interrupt,
> + * which can run interrupt handlers or irqs; or the sched_in
> + * preempt notifier)
> + */
> + break;
> + case 2:
> + default:
> + *need_l1d_flush = true;
> + break;
> + }
> +}
> +
> static void __vmx_load_host_state(struct vcpu_vmx *vmx)
> {
> if (!vmx->host_state.loaded)
> @@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
> [ss]"i"(__KERNEL_DS),
> [cs]"i"(__KERNEL_CS)
> );
> + vcpu->arch.vcpu_unconfined = true;
> }
> }
> STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
> return ret;
> }
>
> + /* Hide L1D cache contents from the nested guest. */
> + vmx->vcpu.arch.vcpu_unconfined = true;
> +
> /*
> * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
> * by event injection, halt vcpu.
> @@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> .vcpu_free = vmx_free_vcpu,
> .vcpu_reset = vmx_vcpu_reset,
>
> - .prepare_guest_switch = vmx_save_host_state,
> + .prepare_guest_switch = vmx_prepare_guest_switch,
> .vcpu_load = vmx_vcpu_load,
> .vcpu_put = vmx_vcpu_put,
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1065d4e7c5fd..c0079aae38ba 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
> { "irq_injections", VCPU_STAT(irq_injections) },
> { "nmi_injections", VCPU_STAT(nmi_injections) },
> { "req_event", VCPU_STAT(req_event) },
> + { "l1d_flush", VCPU_STAT(l1d_flush) },
> { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
> { "mmu_pte_write", VM_STAT(mmu_pte_write) },
> { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
> bool writeback = true;
> bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>
> + vcpu->arch.vcpu_unconfined = true;
> +
> /*
> * Clear write_fault_to_shadow_pgtable here to ensure it is
> * never reused.
> @@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = {
> };
> #endif
>
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> + /* FIXME: could this be boot_cpu_data.x86_cache_size * 2? */
> + int size = PAGE_SIZE << L1D_CACHE_ORDER;
I am thinking we should have:
ASSERT(boot_cpu_has(X86_BUG_L1TF));
or such?
> + asm volatile(
> + /* First ensure the pages are in the TLB */
> + "xorl %%eax, %%eax\n\t"
> + "11: \n\t"
> + "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> + "addl $4096, %%eax\n\t"
> + "cmpl %%eax, %1\n\t"
> + "jne 11b\n\t"
> + "xorl %%eax, %%eax\n\t"
> + "cpuid\n\t"
> + /* Now fill the cache */
> + "xorl %%eax, %%eax\n\t"
> + "12:\n\t"
> + "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> + "addl $64, %%eax\n\t"
> + "cmpl %%eax, %1\n\t"
> + "jne 12b\n\t"
> + "lfence\n\t"
> + : : "r" (empty_zero_pages), "r" (size)
> + : "eax", "ebx", "ecx", "edx");
> +}
> +
> int kvm_arch_init(void *opaque)
> {
> int r;
> struct kvm_x86_ops *ops = opaque;
> + struct page *page;
>
> if (kvm_x86_ops) {
> printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque)
> "being able to snoop the host memory!");
> }
> r = -ENOMEM;
> + page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> + if (!page)
> + goto out;
> + empty_zero_pages = page_address(page);
> +
> shared_msrs = alloc_percpu(struct kvm_shared_msrs);
> if (!shared_msrs) {
> printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> - goto out;
> + goto out_free_zero_pages;
> }
>
> r = kvm_mmu_module_init();
> @@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque)
>
> return 0;
>
> +out_free_zero_pages:
> + free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
> out_free_percpu:
> free_percpu(shared_msrs);
> out:
> @@ -6627,6 +6672,7 @@ void kvm_arch_exit(void)
> #endif
> kvm_x86_ops = NULL;
> kvm_mmu_module_exit();
> + free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
> free_percpu(shared_msrs);
> }
>
> @@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> kvm_cpu_accept_dm_intr(vcpu);
>
> bool req_immediate_exit = false;
> + bool need_l1d_flush;
>
> if (kvm_request_pending(vcpu)) {
> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> @@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
> preempt_disable();
>
> - kvm_x86_ops->prepare_guest_switch(vcpu);
> + need_l1d_flush = vcpu->arch.vcpu_unconfined;
> + vcpu->arch.vcpu_unconfined = false;
> + kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
> + if (need_l1d_flush) {
> + vcpu->stat.l1d_flush++;
> + kvm_l1d_flush();
> + }
I was thinking that this kvm_l1d_flush should really happen right before
you do a VMENTER. I was thinking to change your patch for this exact reason,
but I am curious why you put it here?
>
> /*
> * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
> @@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
> struct kvm *kvm = vcpu->kvm;
>
> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> + vcpu->arch.vcpu_unconfined = true;
>
> for (;;) {
> if (kvm_vcpu_running(vcpu)) {
> @@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>
> void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
> {
> + vcpu->arch.vcpu_unconfined = true;
> kvm_x86_ops->sched_in(vcpu, cpu);
> }
>
> --
> 2.14.3
>
>
next prev parent reply other threads:[~2018-06-20 20:57 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-06-20 20:43 [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5 konrad.wilk
2018-06-20 20:57 ` Konrad Rzeszutek Wilk [this message]
2018-06-21 3:17 ` [MODERATED] " Konrad Rzeszutek Wilk
2018-06-21 14:04 ` Konrad Rzeszutek Wilk
2018-06-21 17:29 ` Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180620205701.GD28309@char.us.oracle.com \
--to=konrad.wilk@oracle.com \
--cc=speck@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).