From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
To: speck@linutronix.de
Subject: [MODERATED] Re: [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5
Date: Wed, 20 Jun 2018 16:57:02 -0400 [thread overview]
Message-ID: <20180620205701.GD28309@char.us.oracle.com> (raw)
In-Reply-To: <20180620204352.066357420@localhost.localdomain>
On Wed, Jun 20, 2018 at 04:43:01PM -0400, speck for konrad.wilk_at_oracle.com wrote:
> kvm: x86: mitigation for L1 cache terminal fault vulnerabilities
>
> Intel's new microcode adds a new feature bit in CPUID[7,0].EDX[28].
> If it is active, the displacement flush is unnecessary. Tested on
> a Coffee Lake machine.
Oops — that nice commit description you wrote, Paolo, got messed up. Sorry!
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
> arch/x86/include/asm/kvm_host.h | 7 ++++-
> arch/x86/kvm/mmu.c | 1 +
> arch/x86/kvm/svm.c | 3 ++-
> arch/x86/kvm/vmx.c | 46 +++++++++++++++++++++++++++++++-
> arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++++++++++--
> 5 files changed, 111 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c13cd28d9d1b..7d7626cffc21 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -713,6 +713,9 @@ struct kvm_vcpu_arch {
>
> /* be preempted when it's in kernel-mode(cpl=0) */
> bool preempted_in_kernel;
> +
There is a giant blank space right above here. I believe Paolo's original patch had it too.
> + /* for L1 terminal fault vulnerability */
> + bool vcpu_unconfined;
> };
>
> struct kvm_lpage_info {
> @@ -881,6 +884,7 @@ struct kvm_vcpu_stat {
> u64 signal_exits;
> u64 irq_window_exits;
> u64 nmi_window_exits;
> + u64 l1d_flush;
> u64 halt_exits;
> u64 halt_successful_poll;
> u64 halt_attempted_poll;
> @@ -939,7 +943,7 @@ struct kvm_x86_ops {
> void (*vcpu_free)(struct kvm_vcpu *vcpu);
> void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
>
> - void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
> + void (*prepare_guest_switch)(struct kvm_vcpu *vcpu, bool *need_l1d_flush);
> void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
> void (*vcpu_put)(struct kvm_vcpu *vcpu);
>
> @@ -1449,6 +1453,7 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>
> void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
> struct kvm_lapic_irq *irq);
> +void kvm_l1d_flush(void);
>
> static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
> {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d594690d8b95..4d4e3dc2494e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
> {
> int r = 1;
>
> + vcpu->arch.vcpu_unconfined = true;
> switch (vcpu->arch.apf.host_apf_reason) {
> default:
> trace_kvm_page_fault(fault_address, error_code);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index f059a73f0fd0..302afb2643fa 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5435,8 +5435,9 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
> svm->asid_generation--;
> }
>
> -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
> +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> {
> + *need_l1d_flush = false;
> }
>
> static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 559a12b6184d..ba83d0cb9b03 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
> };
> MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
>
> +static int __read_mostly vmentry_l1d_flush = 1;
> +module_param_named(vmentry_l1d_flush, vmentry_l1d_flush, int, 0644);
> +
> static bool __read_mostly enable_vpid = 1;
> module_param_named(vpid, enable_vpid, bool, 0444);
>
> @@ -2618,6 +2621,43 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
> vmx->guest_msrs[i].mask);
> }
>
> +static void vmx_prepare_guest_switch(struct kvm_vcpu *vcpu, bool *need_l1d_flush)
> +{
> + vmx_save_host_state(vcpu);
> + if (!enable_ept || static_cpu_has(X86_FEATURE_HYPERVISOR)) {
and perhaps also add:
	|| !boot_cpu_has(X86_BUG_L1TF)
> + *need_l1d_flush = false;
> + return;
> + }
> +
> + switch (vmentry_l1d_flush) {
> + case 0:
> + *need_l1d_flush = false;
> + break;
> + case 1:
> + /*
> + * If vmentry_l1d_flush is 1, each vmexit handler is responsible for
> + * setting vcpu->arch.vcpu_unconfined. Currently this happens in the
> + * following cases:
> + * - vmlaunch/vmresume: we do not want the cache to be cleared by a
> + * nested hypervisor *and* by KVM on bare metal, so we just do it
> + * on every nested entry. Nested hypervisors do not bother clearing
> + * the cache.
> + * - anything that runs the emulator (the slow paths for EPT misconfig
> + * or I/O instruction)
> + * - anything that can cause get_user_pages (EPT violation, and again
> + * the slow paths for EPT misconfig or I/O instruction)
> + * - anything that can run code outside KVM (external interrupt,
> + * which can run interrupt handlers or irqs; or the sched_in
> + * preempt notifier)
> + */
> + break;
> + case 2:
> + default:
> + *need_l1d_flush = true;
> + break;
> + }
> +}
> +
> static void __vmx_load_host_state(struct vcpu_vmx *vmx)
> {
> if (!vmx->host_state.loaded)
> @@ -9751,6 +9791,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
> [ss]"i"(__KERNEL_DS),
> [cs]"i"(__KERNEL_CS)
> );
> + vcpu->arch.vcpu_unconfined = true;
> }
> }
> STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
> @@ -11811,6 +11852,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
> return ret;
> }
>
> + /* Hide L1D cache contents from the nested guest. */
> + vmx->vcpu.arch.vcpu_unconfined = true;
> +
> /*
> * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
> * by event injection, halt vcpu.
> @@ -12928,7 +12972,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> .vcpu_free = vmx_free_vcpu,
> .vcpu_reset = vmx_vcpu_reset,
>
> - .prepare_guest_switch = vmx_save_host_state,
> + .prepare_guest_switch = vmx_prepare_guest_switch,
> .vcpu_load = vmx_vcpu_load,
> .vcpu_put = vmx_vcpu_put,
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1065d4e7c5fd..c0079aae38ba 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -199,6 +199,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
> { "irq_injections", VCPU_STAT(irq_injections) },
> { "nmi_injections", VCPU_STAT(nmi_injections) },
> { "req_event", VCPU_STAT(req_event) },
> + { "l1d_flush", VCPU_STAT(l1d_flush) },
> { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
> { "mmu_pte_write", VM_STAT(mmu_pte_write) },
> { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
> @@ -6054,6 +6055,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
> bool writeback = true;
> bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
>
> + vcpu->arch.vcpu_unconfined = true;
> +
> /*
> * Clear write_fault_to_shadow_pgtable here to ensure it is
> * never reused.
> @@ -6537,10 +6540,45 @@ static struct notifier_block pvclock_gtod_notifier = {
> };
> #endif
>
> +
> +/*
> + * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
> + * 64 KiB because the replacement algorithm is not exactly LRU.
> + */
> +#define L1D_CACHE_ORDER 4
> +static void *__read_mostly empty_zero_pages;
> +
> +void kvm_l1d_flush(void)
> +{
> + /* FIXME: could this be boot_cpu_data.x86_cache_size * 2? */
> + int size = PAGE_SIZE << L1D_CACHE_ORDER;
I am thinking we should have:
ASSERT(boot_cpu_has(X86_BUG_L1TF));
or such?
> + asm volatile(
> + /* First ensure the pages are in the TLB */
> + "xorl %%eax, %%eax\n\t"
> + "11: \n\t"
> + "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> + "addl $4096, %%eax\n\t"
> + "cmpl %%eax, %1\n\t"
> + "jne 11b\n\t"
> + "xorl %%eax, %%eax\n\t"
> + "cpuid\n\t"
> + /* Now fill the cache */
> + "xorl %%eax, %%eax\n\t"
> + "12:\n\t"
> + "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
> + "addl $64, %%eax\n\t"
> + "cmpl %%eax, %1\n\t"
> + "jne 12b\n\t"
> + "lfence\n\t"
> + : : "r" (empty_zero_pages), "r" (size)
> + : "eax", "ebx", "ecx", "edx");
> +}
> +
> int kvm_arch_init(void *opaque)
> {
> int r;
> struct kvm_x86_ops *ops = opaque;
> + struct page *page;
>
> if (kvm_x86_ops) {
> printk(KERN_ERR "kvm: already loaded the other module\n");
> @@ -6569,10 +6607,15 @@ int kvm_arch_init(void *opaque)
> "being able to snoop the host memory!");
> }
> r = -ENOMEM;
> + page = alloc_pages(GFP_ATOMIC, L1D_CACHE_ORDER);
> + if (!page)
> + goto out;
> + empty_zero_pages = page_address(page);
> +
> shared_msrs = alloc_percpu(struct kvm_shared_msrs);
> if (!shared_msrs) {
> printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
> - goto out;
> + goto out_free_zero_pages;
> }
>
> r = kvm_mmu_module_init();
> @@ -6603,6 +6646,8 @@ int kvm_arch_init(void *opaque)
>
> return 0;
>
> +out_free_zero_pages:
> + free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
> out_free_percpu:
> free_percpu(shared_msrs);
> out:
> @@ -6627,6 +6672,7 @@ void kvm_arch_exit(void)
> #endif
> kvm_x86_ops = NULL;
> kvm_mmu_module_exit();
> + free_pages((unsigned long)empty_zero_pages, L1D_CACHE_ORDER);
> free_percpu(shared_msrs);
> }
>
> @@ -7266,6 +7312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> kvm_cpu_accept_dm_intr(vcpu);
>
> bool req_immediate_exit = false;
> + bool need_l1d_flush;
>
> if (kvm_request_pending(vcpu)) {
> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> @@ -7405,7 +7452,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
> preempt_disable();
>
> - kvm_x86_ops->prepare_guest_switch(vcpu);
> + need_l1d_flush = vcpu->arch.vcpu_unconfined;
> + vcpu->arch.vcpu_unconfined = false;
> + kvm_x86_ops->prepare_guest_switch(vcpu, &need_l1d_flush);
> + if (need_l1d_flush) {
> + vcpu->stat.l1d_flush++;
> + kvm_l1d_flush();
> + }
I was thinking that this kvm_l1d_flush should really happen right before
you do a VMENTER. I was thinking to change your patch for this exact reason,
but I am curious why you put it here?
>
> /*
> * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
> @@ -7592,6 +7645,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
> struct kvm *kvm = vcpu->kvm;
>
> vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
> + vcpu->arch.vcpu_unconfined = true;
>
> for (;;) {
> if (kvm_vcpu_running(vcpu)) {
> @@ -8711,6 +8765,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>
> void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
> {
> + vcpu->arch.vcpu_unconfined = true;
> kvm_x86_ops->sched_in(vcpu, cpu);
> }
>
> --
> 2.14.3
>
>
next prev parent reply other threads:[~2018-06-20 20:57 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-06-20 20:43 [MODERATED] [PATCH v2.1 5/6] [PATCH v2.1 5/6] Patch #5 konrad.wilk
2018-06-20 20:57 ` Konrad Rzeszutek Wilk [this message]
2018-06-21 3:17 ` [MODERATED] " Konrad Rzeszutek Wilk
2018-06-21 14:04 ` Konrad Rzeszutek Wilk
2018-06-21 17:29 ` Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180620205701.GD28309@char.us.oracle.com \
--to=konrad.wilk@oracle.com \
--cc=speck@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).