From: Oliver Upton <oliver.upton@linux.dev>
To: Marc Zyngier <maz@kernel.org>
Cc: kvmarm@lists.linux.dev, kvm@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	James Morse <james.morse@arm.com>,
	Suzuki K Poulose <suzuki.poulose@arm.com>,
	Zenghui Yu <yuzenghui@huawei.com>,
	Joey Gouly <joey.gouly@arm.com>,
	Alexandru Elisei <alexandru.elisei@arm.com>,
	Christoffer Dall <christoffer.dall@arm.com>
Subject: Re: [PATCH 01/16] KVM: arm64: nv: Support multiple nested Stage-2 mmu structures
Date: Tue, 7 May 2024 06:17:13 +0000
Message-ID: <ZjnHaYjgMqcpfxdV@linux.dev>
In-Reply-To: <20240409175448.3507472-2-maz@kernel.org>

Hey Marc,

On Tue, Apr 09, 2024 at 06:54:33PM +0100, Marc Zyngier wrote:
> +static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
> +{
> +	return !(mmu->tlb_vttbr & 1);
> +}

More readable if you use VTTBR_CNP_BIT here.
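
Something like this, say (untested, but it's just substituting the
existing VTTBR_CNP_BIT definition for the literal):

static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
{
	return !(mmu->tlb_vttbr & VTTBR_CNP_BIT);
}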

>  struct kvm_arch_memory_slot {
>  };
>  
> @@ -264,6 +296,14 @@ struct kvm_arch {
>  	 */
>  	u64 fgu[__NR_FGT_GROUP_IDS__];
>  
> +	/*
> +	 * Stage 2 paging state for VMs with nested S2 using a virtual
> +	 * VMID.
> +	 */
> +	struct kvm_s2_mmu *nested_mmus;
> +	size_t nested_mmus_size;
> +	int nested_mmus_next;
> +
>  	/* Interrupt controller */
>  	struct vgic_dist	vgic;
>  
> @@ -1239,6 +1279,7 @@ void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu);
>  void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu);
>  
>  int __init kvm_set_ipa_limit(void);
> +u32 kvm_get_pa_bits(struct kvm *kvm);
>  
>  #define __KVM_HAVE_ARCH_VM_ALLOC
>  struct kvm *kvm_arch_alloc_vm(void);
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index d5e48d870461..b48c9e12e7ff 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -98,6 +98,7 @@ alternative_cb_end
>  #include <asm/mmu_context.h>
>  #include <asm/kvm_emulate.h>
>  #include <asm/kvm_host.h>
> +#include <asm/kvm_nested.h>
>  
>  void kvm_update_va_mask(struct alt_instr *alt,
>  			__le32 *origptr, __le32 *updptr, int nr_inst);
> @@ -165,6 +166,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
>  int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr);
>  void __init free_hyp_pgds(void);
>  
> +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size);
>  void stage2_unmap_vm(struct kvm *kvm);
>  int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
>  void kvm_uninit_stage2_mmu(struct kvm *kvm);
> @@ -326,5 +328,12 @@ static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
>  {
>  	return container_of(mmu->arch, struct kvm, arch);
>  }
> +
> +static inline u64 get_vmid(u64 vttbr)
> +{
> +	return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >>
> +		VTTBR_VMID_SHIFT;
> +}
> +
>  #endif /* __ASSEMBLY__ */
>  #endif /* __ARM64_KVM_MMU_H__ */
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index c77d795556e1..1a4004e7573d 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -60,6 +60,12 @@ static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0)
>  	return ttbr0 & ~GENMASK_ULL(63, 48);
>  }
>  
> +extern void kvm_init_nested(struct kvm *kvm);
> +extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
> +extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
> +extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
> +extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
> +extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
>  
>  int kvm_init_nv_sysregs(struct kvm *kvm);
>  
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index c4a0a35e02c7..bb19c53b6539 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -147,6 +147,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  	mutex_unlock(&kvm->lock);
>  #endif
>  
> +	kvm_init_nested(kvm);
> +
>  	ret = kvm_share_hyp(kvm, kvm + 1);
>  	if (ret)
>  		return ret;
> @@ -433,6 +435,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	struct kvm_s2_mmu *mmu;
>  	int *last_ran;
>  
> +	if (vcpu_has_nv(vcpu))
> +		kvm_vcpu_load_hw_mmu(vcpu);
> +
>  	mmu = vcpu->arch.hw_mmu;
>  	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
>  
> @@ -483,6 +488,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  	kvm_timer_vcpu_put(vcpu);
>  	kvm_vgic_put(vcpu);
>  	kvm_vcpu_pmu_restore_host(vcpu);
> +	if (vcpu_has_nv(vcpu))
> +		kvm_vcpu_put_hw_mmu(vcpu);
>  	kvm_arm_vmid_clear_active();
>  
>  	vcpu_clear_on_unsupported_cpu(vcpu);
> @@ -1346,6 +1353,10 @@ static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
>  	if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
>  		ret = kvm_arm_set_default_pmu(kvm);
>  
> +	/* Prepare for nested if required */
> +	if (!ret)
> +		ret = kvm_vcpu_init_nested(vcpu);
> +
>  	return ret;
>  }
>  
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index dc04bc767865..24a946814831 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -328,7 +328,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>  				   may_block));
>  }
>  
> -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
> +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
>  {
>  	__unmap_stage2_range(mmu, start, size, true);
>  }
> @@ -855,21 +855,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
>  	.icache_inval_pou	= invalidate_icache_guest_page,
>  };
>  
> -/**
> - * kvm_init_stage2_mmu - Initialise a S2 MMU structure
> - * @kvm:	The pointer to the KVM structure
> - * @mmu:	The pointer to the s2 MMU structure
> - * @type:	The machine type of the virtual machine
> - *
> - * Allocates only the stage-2 HW PGD level table(s).
> - * Note we don't need locking here as this is only called when the VM is
> - * created, which can only be done once.
> - */
> -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
> +static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
>  {
>  	u32 kvm_ipa_limit = get_kvm_ipa_limit();
> -	int cpu, err;
> -	struct kvm_pgtable *pgt;
>  	u64 mmfr0, mmfr1;
>  	u32 phys_shift;
>  
> @@ -896,11 +884,58 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>  	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
>  	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
>  
> +	return 0;
> +}
> +
> +/**
> + * kvm_init_stage2_mmu - Initialise a S2 MMU structure
> + * @kvm:	The pointer to the KVM structure
> + * @mmu:	The pointer to the s2 MMU structure
> + * @type:	The machine type of the virtual machine
> + *
> + * Allocates only the stage-2 HW PGD level table(s).
> + * Note we don't need locking here as this is only called in two cases:
> + *
> + * - when the VM is created, which can't race against anything
> + *
> + * - when secondary kvm_s2_mmu structures are initialised for NV
> + *   guests, and the caller must hold kvm->lock as this is called on a
> + *   per-vcpu basis.
> + */
> +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
> +{
> +	int cpu, err;
> +	struct kvm_pgtable *pgt;
> +
> +	/*
> +	 * If we already have our page tables in place, and that the
> +	 * MMU context is the canonical one, we have a bug somewhere,
> +	 * as this is only supposed to ever happen once per VM.
> +	 *
> +	 * Otherwise, we're building nested page tables, and that's
> +	 * probably because userspace called KVM_ARM_VCPU_INIT more
> +	 * than once on the same vcpu. Since that's actually legal,
> +	 * don't kick a fuss and leave gracefully.
> +	 */
>  	if (mmu->pgt != NULL) {
> +		if (&kvm->arch.mmu != mmu)

A helper might be a good idea; I see this check repeated several times:

static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
	return &kvm->arch.mmu != mmu;
}
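
Call sites like the one below would then read as
"if (kvm_is_nested_s2_mmu(kvm, mmu))", which is a little more
self-documenting than the open-coded pointer comparison.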

> +			return 0;
> +
>  		kvm_err("kvm_arch already initialized?\n");
>  		return -EINVAL;
>  	}
>  
> +	/*
> +	 * We only initialise the IPA range on the canonical MMU, so
> +	 * the type is meaningless in all other situations.
> +	 */
> +	if (&kvm->arch.mmu != mmu)
> +		type = kvm_get_pa_bits(kvm);

I'm not sure I follow this comment, because kvm_init_ipa_range() still
gets called on nested MMUs. Is this suggesting that the configured IPA
limit of the shadow MMUs doesn't matter as they can only ever map things
in the canonical IPA space?

> +	err = kvm_init_ipa_range(mmu, type);
> +	if (err)
> +		return err;
> +
>  	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
>  	if (!pgt)
>  		return -ENOMEM;
> @@ -925,6 +960,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>  
>  	mmu->pgt = pgt;
>  	mmu->pgd_phys = __pa(pgt->pgd);
> +
> +	if (&kvm->arch.mmu != mmu)
> +		kvm_init_nested_s2_mmu(mmu);
> +
>  	return 0;
>  
>  out_destroy_pgtable:
> @@ -976,7 +1015,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
>  
>  		if (!(vma->vm_flags & VM_PFNMAP)) {
>  			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
> -			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
> +			kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
>  		}
>  		hva = vm_end;
>  	} while (hva < reg_end);
> @@ -2054,11 +2093,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
>  {
>  }
>  
> -void kvm_arch_flush_shadow_all(struct kvm *kvm)
> -{
> -	kvm_uninit_stage2_mmu(kvm);
> -}
> -
>  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>  				   struct kvm_memory_slot *slot)
>  {
> @@ -2066,7 +2100,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>  	phys_addr_t size = slot->npages << PAGE_SHIFT;
>  
>  	write_lock(&kvm->mmu_lock);
> -	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
> +	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
>  	write_unlock(&kvm->mmu_lock);
>  }
>  
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index ced30c90521a..1f4f80a8c011 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -7,7 +7,9 @@
>  #include <linux/kvm.h>
>  #include <linux/kvm_host.h>
>  
> +#include <asm/kvm_arm.h>
>  #include <asm/kvm_emulate.h>
> +#include <asm/kvm_mmu.h>
>  #include <asm/kvm_nested.h>
>  #include <asm/sysreg.h>
>  
> @@ -16,6 +18,209 @@
>  /* Protection against the sysreg repainting madness... */
>  #define NV_FTR(r, f)		ID_AA64##r##_EL1_##f
>  
> +void kvm_init_nested(struct kvm *kvm)
> +{
> +	kvm->arch.nested_mmus = NULL;
> +	kvm->arch.nested_mmus_size = 0;
> +}
> +
> +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_s2_mmu *tmp;
> +	int num_mmus;
> +	int ret = -ENOMEM;
> +
> +	if (!test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->kvm->arch.vcpu_features))
> +		return 0;
> +
> +	if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
> +		return -EINVAL;

nitpick: maybe guard the call to kvm_vcpu_init_nested() with
vcpu_has_nv() and collapse these two checks into:

	if (!vcpu_has_nv(vcpu))
		return -EINVAL;
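
and, on the caller side, something along the lines of (sketch):

	/* Prepare for nested if required */
	if (!ret && vcpu_has_nv(vcpu))
		ret = kvm_vcpu_init_nested(vcpu);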

> +	/*
> +	 * Let's treat memory allocation failures as benign: If we fail to
> +	 * allocate anything, return an error and keep the allocated array
> +	 * alive. Userspace may try to recover by intializing the vcpu
> +	 * again, and there is no reason to affect the whole VM for this.
> +	 */

This code feels a bit tricky, and I'm not sure much will be done to
recover the VM in practice should this allocation / ioctl fail.

Is it possible to do this later, in kvm_arch_vcpu_run_pid_change(), and
have only the first vCPU that reaches the call do the initialization for
the whole VM? We could then dispose of the reallocation / fixup scheme
below.
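
Roughly the shape I have in mind (completely untested sketch;
kvm_init_nested_mmus() is a made-up name, and it assumes all vCPUs have
been created by the time the first one starts running):

static int kvm_init_nested_mmus(struct kvm *kvm)
{
	struct kvm_s2_mmu *mmus;
	int num_mmus, i, ret;

	lockdep_assert_held(&kvm->lock);

	/* Another vCPU already got here first */
	if (kvm->arch.nested_mmus)
		return 0;

	num_mmus = atomic_read(&kvm->online_vcpus) * 2;
	mmus = kcalloc(num_mmus, sizeof(*mmus), GFP_KERNEL_ACCOUNT);
	if (!mmus)
		return -ENOMEM;

	for (i = 0; i < num_mmus; i++) {
		ret = kvm_init_stage2_mmu(kvm, &mmus[i], 0);
		if (ret)
			goto out_free;
	}

	kvm->arch.nested_mmus = mmus;
	kvm->arch.nested_mmus_size = num_mmus;
	return 0;

out_free:
	while (i--)
		kvm_free_stage2_pgd(&mmus[i]);
	kfree(mmus);
	return ret;
}

called once under kvm->lock from the first-run path, and only for
vcpu_has_nv() vCPUs.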

If we keep this code...

> +	num_mmus = atomic_read(&kvm->online_vcpus) * 2;
> +	tmp = krealloc(kvm->arch.nested_mmus,
> +		       num_mmus * sizeof(*kvm->arch.nested_mmus),
> +		       GFP_KERNEL_ACCOUNT | __GFP_ZERO);

Just do an early 'return -ENOMEM' here to cut a level of indentation for
the rest that follows.
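
i.e.:

	if (!tmp)
		return -ENOMEM;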

> +	if (tmp) {
> +		/*
> +		 * If we went through a realocation, adjust the MMU
> +		 * back-pointers in the previously initialised
> +		 * pg_table structures.

nitpick: pgtable or kvm_pgtable structures

> +		 */
> +		if (kvm->arch.nested_mmus != tmp) {
> +			int i;
> +
> +			for (i = 0; i < num_mmus - 2; i++)
> +				tmp[i].pgt->mmu = &tmp[i];
> +		}
> +
> +		if (kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 1], 0) ||
> +		    kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 2], 0)) {
> +			kvm_free_stage2_pgd(&tmp[num_mmus - 1]);
> +			kvm_free_stage2_pgd(&tmp[num_mmus - 2]);
> +		} else {
> +			kvm->arch.nested_mmus_size = num_mmus;
> +			ret = 0;
> +		}
> +
> +		kvm->arch.nested_mmus = tmp;
> +	}
> +
> +	return ret;
> +}
> +
> +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
> +{
> +	bool nested_stage2_enabled;
> +	u64 vttbr, vtcr, hcr;
> +	struct kvm *kvm;
> +	int i;
> +
> +	kvm = vcpu->kvm;

nit: just do this when declaring the local.
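
i.e.:

	struct kvm *kvm = vcpu->kvm;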

> +	lockdep_assert_held_write(&kvm->mmu_lock);
> +
> +	vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
> +	vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
> +	hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
> +
> +	nested_stage2_enabled = hcr & HCR_VM;
> +
> +	/* Don't consider the CnP bit for the vttbr match */
> +	vttbr = vttbr & ~VTTBR_CNP_BIT;

nit: &=
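
i.e.:

	vttbr &= ~VTTBR_CNP_BIT;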

> +	/*
> +	 * Two possibilities when looking up a S2 MMU context:
> +	 *
> +	 * - either S2 is enabled in the guest, and we need a context that is
> +         *   S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
> +         *   which makes it safe from a TLB conflict perspective (a broken
> +         *   guest won't be able to generate them),
> +	 *
> +	 * - or S2 is disabled, and we need a context that is S2-disabled
> +         *   and matches the VMID only, as all TLBs are tagged by VMID even
> +         *   if S2 translation is disabled.
> +	 */

Looks like some spaces snuck in and made the indentation weird.

-- 
Thanks,
Oliver
