On 03/05/2016 07:27 PM, Huaitong Han wrote:
Protection keys define a new 4-bit protection key field (PKEY) in bits
62:59 of leaf entries of the page tables. The PKEY is an index into the
PKRU register (16 domains); every domain has 2 bits (a write-disable bit
and an access-disable bit).

The static logic has been produced in update_permission_bitmask; the
dynamic logic needs to read the pkey from the page table entries, get the
PKRU value, and deduce the correct result.

Signed-off-by: Huaitong Han <huaitong.han@xxxxxxxxx>
---
Changes in v4:
*Patch has been rebased on
 http://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/log/?h=mm/pkeys
 The pte_pkey function has been deleted in the kernel PKU support, so the
 KVM patches use pte_flags_pkey(pte_flags(pte)) instead of pte_pkey(pte).

 arch/x86/kvm/mmu.h         | 56 +++++++++++++++++++++++++++++++++++++++++-----
 arch/x86/kvm/paging_tmpl.h | 18 ++++++++++++---
 arch/x86/kvm/x86.c         |  2 +-
 3 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 55ffb7b..7a9f627 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -3,6 +3,7 @@

 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
+#include "x86.h"

 #define PT64_PT_BITS 9
 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -24,6 +25,7 @@
 #define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
+
 #define PT64_NX_SHIFT 63
 #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)

@@ -45,6 +47,10 @@
 #define PT_PAGE_TABLE_LEVEL 1
 #define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)

+#define PKRU_READ 0
+#define PKRU_WRITE 1
+#define PKRU_ATTRS 2
+
 static inline u64 rsvd_bits(int s, int e)
 {
 	return ((1ULL << (e - s + 1)) - 1) << s;
@@ -145,10 +151,50 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
  * fault with the given access (in ACC_* format)?
  */
 static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-				    unsigned pte_access, unsigned pfec)
+				    unsigned pte_access, unsigned pte_pkeys, unsigned pfec)
 {
-	int cpl = kvm_x86_ops->get_cpl(vcpu);
-	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+	unsigned long smap, rflags;
+	u32 pkru, pkru_bits;
+	int cpl, index;
+	bool wf, uf;
+
+	cpl = kvm_x86_ops->get_cpl(vcpu);
+	rflags = kvm_x86_ops->get_rflags(vcpu);
+
+	/*
+	 * PKU is computed dynamically in permission_fault.
+	 * 2nd and 6th conditions:
+	 * 2.EFER_LMA=1
+	 * 6.PKRU.AD=1
+	 *	or The access is a data write and PKRU.WD=1 and
+	 *	   either CR0.WP=1 or it is a user mode access
+	 */
+	pkru = is_long_mode(vcpu) ? read_pkru() : 0;
+	if (unlikely(pkru) && (pfec & PFERR_PK_MASK))
+	{
+		/*
+		 * PKRU defines 32 bits, there are 16 domains and 2 attribute bits per
+		 * domain in pkru, pkey is the index to a defined domain, so the value
+		 * of pkey * PKRU_ATTRS is offset of a defined domain.
+		 */
+		pkru_bits = (pkru >> (pte_pkeys * PKRU_ATTRS)) & 3;
+
+		wf = pfec & PFERR_WRITE_MASK;
+		uf = pfec & PFERR_USER_MASK;
+
+		/*
+		 * Ignore PKRU.WD if not relevant to this access (a read,
+		 * or a supervisor mode access if CR0.WP=0).
+		 * So 6th conditions is equivalent to "pkru_bits != 0"
+		 */
+		if (!wf || (!uf && !is_write_protection(vcpu)))
+			pkru_bits &= ~(1 << PKRU_WRITE);
+
+		/* Flip pfec on PK bit if pkru_bits is zero */
+		pfec ^= pkru_bits ? 0 : PFERR_PK_MASK;
+	}
+	else
+		pfec &= ~PFERR_PK_MASK;

 	/*
 	 * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
@@ -163,8 +209,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	 * but it will be one in index if SMAP checks are being overridden.
 	 * It is important to keep this branchless.
 	 */
-	unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
-	int index = (pfec >> 1) +
+	smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
+	index = (pfec >> 1) +
 		    (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
See my comments in the previous patch.
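For reference, the check this hunk encodes boils down to the standalone
sketch below (untested; the helper name and parameters are mine, not the
patch's): PKRU holds two bits per key, AD denies any data access to pages
tagged with that key, and WD additionally denies writes unless the write
is a supervisor access with CR0.WP=0.

#include <stdbool.h>
#include <stdint.h>

#define PKRU_AD_SHIFT 0	/* access-disable bit within each 2-bit pair */
#define PKRU_WD_SHIFT 1	/* write-disable bit within each 2-bit pair */

/*
 * Untested illustration only: does PKRU deny a data access to a page
 * tagged with 'pkey'?  AD denies any data access; WD denies writes,
 * but only when CR0.WP=1 or the access comes from user mode.
 */
static bool pkru_denies_access(uint32_t pkru, unsigned int pkey,
			       bool write, bool user, bool cr0_wp)
{
	uint32_t bits = (pkru >> (pkey * 2)) & 3;

	/* WD is irrelevant for reads and for supervisor writes with CR0.WP=0. */
	if (!write || (!user && !cr0_wp))
		bits &= ~(1u << PKRU_WD_SHIFT);

	return bits != 0;
}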
 	WARN_ON(pfec & PFERR_RSVD_MASK);

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6c9fed9..9afc066 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -253,6 +253,15 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 	}
 	return 0;
 }
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+	unsigned pkeys = 0;
+#if PTTYPE == 64
+	pte_t pte = {.pte = gpte};
+	pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
This path is also taken for PAE guests. That is not a problem, because the bits used by PKEY are reserved in PAE mode, so a gpte with them set is caught by is_rsvd_bits_set() before this call, and the PKEY is ignored anyway when EFER.LMA = 0. But please add a comment here explaining why it is safe.
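Something along these lines would do (just a suggested wording for that
comment; the code itself is unchanged from the patch):

static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned pkeys = 0;
#if PTTYPE == 64
	/*
	 * PTTYPE == 64 also covers PAE guests.  That is safe: bits 62:59
	 * are reserved in PAE ptes, so a gpte with them set is rejected
	 * by is_rsvd_bits_set() before we get here, and the pkey is
	 * ignored anyway when EFER.LMA = 0.
	 */
	pte_t pte = {.pte = gpte};
	pkeys = pte_flags_pkey(pte_flags(pte));
#endif
	return pkeys;
}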
+	return pkeys;
+}

 /*
  * Fetch a guest pte for a guest virtual address
@@ -265,12 +274,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
-	unsigned index, pt_access, pte_access, accessed_dirty;
+	unsigned index, pt_access, pte_access, accessed_dirty, pte_pkeys;
 	gpa_t pte_gpa;
 	int offset;
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault = access & PFERR_USER_MASK;
 	const int fetch_fault = access & PFERR_FETCH_MASK;
+	const int pk_fault = access & PFERR_PK_MASK;
 	u16 errcode = 0;
 	gpa_t real_gpa;
 	gfn_t gfn;
@@ -356,7 +366,9 @@ retry_walk:
 		walker->ptes[walker->level - 1] = pte;
 	} while (!is_last_gpte(mmu, walker->level, pte));

-	if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
+	pte_pkeys = FNAME(gpte_pkeys)(vcpu, pte);
+	if (unlikely(permission_fault(vcpu, mmu, pte_access, pte_pkeys,
+		access))) {
 		errcode |= PFERR_PRESENT_MASK;
 		goto error;
 	}
@@ -399,7 +411,7 @@ retry_walk:
 	return 1;

 error:
-	errcode |= write_fault | user_fault;
+	errcode |= write_fault | user_fault | pk_fault;
No. The pk_fault is not always caused by the guest page table, since it depends on CR0.WP, which is always set for the shadow page table.
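One possible direction, sketched below and untested: make
permission_fault() return the offending PFERR_* bits (0 when the access
is allowed) instead of a bool, so PFERR_PK_MASK only reaches the guest
error code when the pkey check itself failed under the guest's CR0.WP and
access mode, and drop pk_fault from the error path entirely:

	pte_pkeys = FNAME(gpte_pkeys)(vcpu, pte);
	/*
	 * Hypothetical variant of permission_fault() that returns the
	 * failed PFERR_* bits rather than a bool; PFERR_PK_MASK is set
	 * there only when the guest's own paging settings cause the
	 * pkey fault, not the always-on CR0.WP of the shadow pages.
	 */
	errcode = permission_fault(vcpu, mmu, pte_access, pte_pkeys, access);
	if (unlikely(errcode)) {
		errcode |= PFERR_PRESENT_MASK;
		goto error;
	}

	/* ... and at the error label, keep only write_fault | user_fault. */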
 	if (fetch_fault && (mmu->nx ||
 			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
 		errcode |= PFERR_FETCH_MASK;

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05e7838..b697a99 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4317,7 +4317,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,

 	if (vcpu_match_mmio_gva(vcpu, gva) &&
 	    !permission_fault(vcpu, vcpu->arch.walk_mmu,
-			      vcpu->arch.access, access)) {
+			      vcpu->arch.access, 0, access)) {
No. The pkey is not always 0. We should cache the PKEY for the MMIO access and use it here to check whether the access rights are adequate.
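Roughly like this, untested; the vcpu->arch.mmio_pkey field is made up
here and would have to be filled in wherever vcpu->arch.access is cached
for the MMIO gva:

	/*
	 * Sketch only: check the cached pkey of the MMIO gva instead of
	 * hard-coding 0, so the PKRU check uses the page's real
	 * protection key rather than key 0.
	 */
	if (vcpu_match_mmio_gva(vcpu, gva) &&
	    !permission_fault(vcpu, vcpu->arch.walk_mmu,
			      vcpu->arch.access, vcpu->arch.mmio_pkey,
			      access)) {
		/* reuse the cached MMIO translation as before */
	}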