This is my first patch: an attempt to implement Memory ROE, which I discussed earlier as a way to prevent rootkits. I have already explained the idea in detail in this thread:

https://www.mail-archive.com/kernelnewbies@xxxxxxxxxxxxxxxxx/msg18826.html

so I won't repeat all of it here.

The problem is that the code isn't working and I can't figure out why. I tried implementing the protection so that it follows behavior similar to KVM_MEM_READONLY, but at page (SPTE) granularity.

The current problem I am facing is that while handling the hypercall, vcpu->mode turns out to be OUTSIDE_GUEST_MODE, but KVM_REQ_TLB_FLUSH doesn't seem to be handled correctly. The KVM documentation says that requests for a VCPU that is not IN_GUEST_MODE are handled as soon as possible, and that kvm_vcpu_kick(vcpu) will even force that, but that doesn't seem to be the case for me. This is the kind of logging I am getting:

[3556.312299] kvm_mmu_slot_apply_flags: visited
[3556.312301] kvm_mmu_slot_apply_write_access: Flush = false
[3557.034243] gfn_is_readonly: test_bit = 0
[3557.034251] gfn_is_readonly: test_bit = 0
[3557.034254] gfn_is_readonly: test_bit = 0
[3557.034463] Hypercall received, page address 0x0
[3557.034466] gfn_is_readonly: test_bit = 0
[3557.034469] kvm_mroe: flush state = Done
[3557.034472] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034475] Setting page number 0 in slot number 0
[3557.034480] slot_rmap_apply_protection: The 0th page is readonly, Flush = True
[3557.034483] kvm_mmu_slot_apply_write_access: Flush = true
[3557.034486] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034488] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034490] kvm_mroe: flush state = Waiting

For some reason kvm_vcpu_kick() didn't force KVM_REQ_TLB_FLUSH to be serviced by the virtual CPU (I am talking about the last two lines).

I am aware that there is still a lot missing (like dealing with malicious guest remappings) and that the code quality is poor, but any ideas about what I could be doing wrong (or ideas in general) would be appreciated. I am already planning to redo everything cleanly once it works.

Thanks.
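For context, the guest-side trigger I have in mind looks roughly like the sketch below. This is only a test harness and not part of the patch; the local KVM_HC_HMROE define is duplicated by hand because a guest built against unpatched headers does not know the new hypercall number. The idea is that, once the hypercall succeeds, a later guest write to mroe_buf should end up write-faulting into KVM instead of completing silently.

/* Guest-side test sketch, not part of the patch. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <asm/kvm_para.h>

#define KVM_HC_HMROE 10 /* keep in sync with the host's include/uapi/linux/kvm_para.h */

/* One page of data that the host should make read-only. */
static char mroe_buf[PAGE_SIZE] __aligned(PAGE_SIZE) = "protect me";

static int __init mroe_test_init(void)
{
        long ret;

        /* kvm_mroe() expects a page-aligned guest virtual address. */
        ret = kvm_hypercall1(KVM_HC_HMROE, (unsigned long)mroe_buf);
        pr_info("mroe test: hypercall returned %ld\n", ret);
        return 0;
}
module_init(mroe_test_init);

MODULE_LICENSE("GPL");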
Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@xxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |   7 ++-
 arch/x86/kvm/Kconfig            |   7 +++
 arch/x86/kvm/mmu.c              | 127 +++++++++++++++++++++++++++-------------
 arch/x86/kvm/x86.c              |  83 ++++++++++++++++++++++++--
 include/linux/kvm_host.h        |  17 ++++++
 include/uapi/linux/kvm_para.h   |   4 +-
 virt/kvm/kvm_main.c             |  36 +++++++++---
 7 files changed, 226 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..c66e9245f750 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -235,7 +235,10 @@ struct kvm_mmu_memory_cache {
        int nobjs;
        void *objects[KVM_NR_MEM_OBJS];
 };
-
+struct kvm_write_access_data {
+       int i;
+       struct kvm_memory_slot *memslot;
+};
 /*
  * the pages used as guest page table on soft mmu are tracked by
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
@@ -1130,7 +1133,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
                                      struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 92fd433c50b9..8ae822a8dc7a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -96,6 +96,13 @@ config KVM_MMU_AUDIT
         This option adds a R/W kVM module parameter 'mmu_audit', which allows
         auditing of KVM MMU events at runtime.
 
+config KVM_MROE
+       bool "Hypercall Memory Read-Only Enforcement"
+       depends on KVM && X86
+       help
+        This option add KVM_HC_HMROE hypercall to kvm which as hardening
+        mechanism to protect memory pages from being edited.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..946545b8b8cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,7 +70,7 @@ enum {
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
-static bool dbg = 0;
+static bool dbg = 1;
 module_param(dbg, bool, 0644);
 
 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -1402,7 +1402,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
        u64 spte = *sptep;
-
        if (!is_writable_pte(spte) &&
              !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
                return false;
@@ -1418,15 +1417,23 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
 
 static bool __rmap_write_protect(struct kvm *kvm,
                                 struct kvm_rmap_head *rmap_head,
-                                bool pt_protect)
+                                bool pt_protect,
+                                struct kvm_write_access_data *d)
 {
        u64 *sptep;
        struct rmap_iterator iter;
        bool flush = false;
-
-       for_each_rmap_spte(rmap_head, &iter, sptep)
-               flush |= spte_write_protect(sptep, pt_protect);
-
+       if (d == NULL) {
+               for_each_rmap_spte(rmap_head, &iter, sptep) {
+                       flush |= spte_write_protect(sptep,
+                               !test_bit(d->i, d->memslot->mroe_bitmap)
+                               && pt_protect);
+                       d->i++;
+               }
+       } else {
+               for_each_rmap_spte(rmap_head, &iter, sptep)
+                       flush |= spte_write_protect(sptep, pt_protect);
+       }
        return flush;
 }
 
@@ -1457,7 +1464,8 @@ static bool wrprot_ad_disabled_spte(u64 *sptep)
  *     - W bit on ad-disabled SPTEs.
  * Returns true iff any D or W bits were cleared.
  */
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                               void *data)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1483,7 +1491,8 @@ static bool spte_set_dirty(u64 *sptep)
        return mmu_spte_update(sptep, spte);
 }
 
-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                               void *data)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1515,7 +1524,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
        while (mask) {
                rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                          PT_PAGE_TABLE_LEVEL, slot);
-               __rmap_write_protect(kvm, rmap_head, false);
+               __rmap_write_protect(kvm, rmap_head, false, NULL);
 
                /* clear the first set bit */
                mask &= mask - 1;
@@ -1541,7 +1550,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
        while (mask) {
                rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                          PT_PAGE_TABLE_LEVEL, slot);
-               __rmap_clear_dirty(kvm, rmap_head);
+               __rmap_clear_dirty(kvm, rmap_head, NULL);
 
                /* clear the first set bit */
                mask &= mask - 1;
@@ -1591,10 +1600,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
        struct kvm_rmap_head *rmap_head;
        int i;
        bool write_protected = false;
-
+       struct kvm_write_access_data data = {
+               .i = 0,
+               .memslot = slot,
+       };
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                rmap_head = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+               write_protected |= __rmap_write_protect(kvm, rmap_head, true,
+                               &data);
        }
 
        return write_protected;
@@ -1608,7 +1621,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
        return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
 }
 
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                         void *data)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1628,7 +1642,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
                           unsigned long data)
 {
-       return kvm_zap_rmapp(kvm, rmap_head);
+       return kvm_zap_rmapp(kvm, rmap_head, NULL);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -5086,13 +5100,15 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
                struct kvm_rmap_head *rmap_head, void *data);
 
 /* The caller should hold mmu-lock before calling this function. */
 static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        slot_level_handler fn, int start_level, int end_level,
-                       gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+                       gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb,
+                       void *data)
 {
        struct slot_rmap_walk_iterator iterator;
        bool flush = false;
@@ -5100,7 +5116,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
        for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
                        end_gfn, &iterator) {
                if (iterator.rmap)
-                       flush |= fn(kvm, iterator.rmap);
+                       flush |= fn(kvm, iterator.rmap, data);
 
                if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                        if (flush && lock_flush_tlb) {
@@ -5122,36 +5138,36 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
                  slot_level_handler fn, int start_level, int end_level,
-                 bool lock_flush_tlb)
+                 bool lock_flush_tlb, void *data)
 {
        return slot_handle_level_range(kvm, memslot, fn, start_level,
                        end_level, memslot->base_gfn,
                        memslot->base_gfn + memslot->npages - 1,
-                       lock_flush_tlb);
+                       lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                     slot_level_handler fn, bool lock_flush_tlb)
+                     slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       slot_level_handler fn, bool lock_flush_tlb)
+                       slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
-                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+                                PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                slot_level_handler fn, bool lock_flush_tlb)
+                slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-                                PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+                                PT_PAGE_TABLE_LEVEL, lock_flush_tlb, data);
 }
 
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -5173,7 +5189,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
                        slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
                                                PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
-                                               start, end - 1, true);
+                                               start, end - 1, true, NULL);
                }
        }
 
@@ -5181,23 +5197,52 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
-                                   struct kvm_rmap_head *rmap_head)
+                                   struct kvm_rmap_head *rmap_head,
+                                   void *data)
 {
-       return __rmap_write_protect(kvm, rmap_head, false);
+       return __rmap_write_protect(kvm, rmap_head, false,
+                       (struct kvm_write_access_data *)data);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+static bool slot_rmap_apply_protection(struct kvm *kvm,
+                                   struct kvm_rmap_head *rmap_head,
+                                   void *data)
+{
+       struct kvm_write_access_data *d = (struct kvm_write_access_data *) data;
+       unsigned long *protection = d->memslot->mroe_bitmap;
+       bool prot_mask = d->memslot->flags & KVM_MEM_READONLY;
+       u64 *sptep;
+       struct rmap_iterator iter;
+       bool flush = false;
+
+       for_each_rmap_spte(rmap_head, &iter, sptep) {
+               flush |= spte_write_protect(sptep,
+                               !(test_bit(d->i, protection) || prot_mask));
+               if (test_bit(d->i, protection)) {
+                       pr_info("%s: The %dth page is readonly, Flush = %s\n",
+                                       __func__, d->i, flush?"True" : "False");
+               }
+               d->i++;
+       }
+       return flush;
+}
+
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
                                      struct kvm_memory_slot *memslot)
 {
        bool flush;
-
+       struct kvm_write_access_data data = {
+               .i = 0,
+               .memslot = memslot,
+       };
        spin_lock(&kvm->mmu_lock);
-       flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
-                                     false);
+       flush = slot_handle_all_level(kvm, memslot, slot_rmap_apply_protection,
+                       false, &data);
+       pr_info("%s: Flush = %s\n", __func__, flush ? "true":"false");
        spin_unlock(&kvm->mmu_lock);
 
        /*
-        * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+        * kvm_mmu_slot_apply_write_access() and kvm_vm_ioctl_get_dirty_log()
         * which do tlb flush out of mmu-lock should be serialized by
         * kvm->slots_lock otherwise tlb flush would be missed.
         */
@@ -5219,7 +5264,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 }
 
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
-                                        struct kvm_rmap_head *rmap_head)
+                                        struct kvm_rmap_head *rmap_head,
+                                        void *data)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -5257,7 +5303,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
        /* FIXME: const-ify all uses of struct kvm_memory_slot. */
        spin_lock(&kvm->mmu_lock);
        slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-                        kvm_mmu_zap_collapsible_spte, true);
+                        kvm_mmu_zap_collapsible_spte, true, NULL);
        spin_unlock(&kvm->mmu_lock);
 }
 
@@ -5267,7 +5313,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
        bool flush;
 
        spin_lock(&kvm->mmu_lock);
-       flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+       flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false, NULL);
        spin_unlock(&kvm->mmu_lock);
 
        lockdep_assert_held(&kvm->slots_lock);
@@ -5290,10 +5336,10 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 
        spin_lock(&kvm->mmu_lock);
        flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
-                                       false);
+                                       false, NULL);
        spin_unlock(&kvm->mmu_lock);
 
-       /* see kvm_mmu_slot_remove_write_access */
+       /* see kvm_mmu_slot_apply_write_access */
        lockdep_assert_held(&kvm->slots_lock);
 
        if (flush)
@@ -5307,7 +5353,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
        bool flush;
 
        spin_lock(&kvm->mmu_lock);
-       flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+       flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false,
+                       NULL);
        spin_unlock(&kvm->mmu_lock);
 
        lockdep_assert_held(&kvm->slots_lock);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0046aa70205a..96e967199fda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,7 +55,7 @@
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
 #include <linux/mem_encrypt.h>
-
+#include <linux/mempolicy.h>
 #include <trace/events/kvm.h>
 
 #include <asm/debugreg.h>
@@ -4177,7 +4177,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 
        /*
         * All the TLBs can be flushed out of mmu lock, see the comments in
-        * kvm_mmu_slot_remove_write_access().
+        * kvm_mmu_slot_apply_write_access().
         */
        lockdep_assert_held(&kvm->slots_lock);
        if (is_dirty)
@@ -6669,7 +6669,74 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
        return ret;
 }
 #endif
+#ifdef CONFIG_KVM_MROE
+static int roe_protect_frame(struct kvm *kvm, gpa_t gpa)
+{
+       struct kvm_memory_slot *slot;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+
+       slot = gfn_to_memslot(kvm, gfn);
+       //XXX do some error checking dude.
+       if (gfn > slot->base_gfn + slot->npages) {
+               //XXX use a better language
+               pr_err("You have an overflow\n");
+               return -1;
+       }
+       pr_info("Setting page number %lld in slot number %d\n",
+                       gfn - slot->base_gfn, slot->id);
+       // something is wrong with the locking here
+       // you should lock the area before writing the bit
+       set_bit(gfn - slot->base_gfn, slot->mroe_bitmap);
+       kvm_mmu_slot_apply_write_access(kvm, slot);
+       return 0;
+}
+void debug_cpu_mode(struct kvm_vcpu *vcpu)
+{
+       char *mode = "Unknown";
+
+       if (vcpu->mode == OUTSIDE_GUEST_MODE)
+               mode = "OUTSIDE_GUEST_MODE";
+       else if (vcpu->mode == IN_GUEST_MODE)
+               mode = "IN_GUEST_MODE";
+       else if (vcpu->mode == EXITING_GUEST_MODE)
+               mode = "EXITING_GUEST_MODE";
+       else if (vcpu->mode == READING_SHADOW_PAGE_TABLES)
+               mode = "READING_SHADOW_PAGE_TABLES";
+       pr_info("kvm_mroe: cpu mode = %s\n", mode);
+}
+static int kvm_mroe(struct kvm_vcpu *vcpu, u64 gva)
+{
+       struct kvm *kvm = vcpu->kvm;
+       gpa_t gpa;
+       u64 hva;
+       int ret;
+       //XXX check that the hypercall is done from kernel mode
+       if (gva & ~PAGE_MASK)
+               return -EINVAL;
+       gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+       hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
+       //XXX This doesn't work but it will be ok to check that we can access
+       // the address and make sure that the mapping makes sense
+       if (!access_ok(VERIFY_WRITE, hva, PAGE_SIZE)) {
+               pr_info("Duplicate request\n");
+               return -KVM_EROEDUPLICATR;
+       }
+       pr_info("%s: flush state = %s\n", __func__,
+               kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+               "Done");
+       debug_cpu_mode(vcpu);
+       ret = roe_protect_frame(vcpu->kvm, gpa);
+       debug_cpu_mode(vcpu);
+       kvm_vcpu_kick(vcpu);
+       debug_cpu_mode(vcpu);
+       pr_info("%s: flush state = %s\n", __func__,
+               kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+               "Done");
+
+       return ret;
+}
+#endif
 
 /*
  * kvm_pv_kick_cpu_op: Kick a vcpu.
  *
@@ -6737,6 +6804,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        case KVM_HC_CLOCK_PAIRING:
                ret = kvm_pv_clock_pairing(vcpu, a0, a1);
                break;
+#endif
+#ifdef CONFIG_KVM_MROE
+       case KVM_HC_HMROE:
+               pr_info("Hypercall received, page address 0x%lx\n", a0);
+               ret = kvm_mroe(vcpu, a0);
+               break;
 #endif
        default:
                ret = -KVM_ENOSYS;
                break;
@@ -8971,8 +9044,10 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                                     struct kvm_memory_slot *new)
 {
        /* Still write protect RO slot */
+       pr_info("%s: visited\n", __func__);
+       kvm_mmu_slot_apply_write_access(kvm, new);
+       return;
        if (new->flags & KVM_MEM_READONLY) {
-               kvm_mmu_slot_remove_write_access(kvm, new);
                return;
        }
 
@@ -9010,7 +9085,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                if (kvm_x86_ops->slot_enable_log_dirty)
                        kvm_x86_ops->slot_enable_log_dirty(kvm, new);
                else
-                       kvm_mmu_slot_remove_write_access(kvm, new);
+                       kvm_mmu_slot_apply_write_access(kvm, new);
        } else {
                if (kvm_x86_ops->slot_disable_log_dirty)
                        kvm_x86_ops->slot_disable_log_dirty(kvm, new);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ee7bc548a83..1ca6db7b8931 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/hashtable.h>
 #include <linux/hardirq.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -297,6 +298,9 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 struct kvm_memory_slot {
        gfn_t base_gfn;
        unsigned long npages;
+#ifdef CONFIG_KVM_MROE
+       unsigned long *mroe_bitmap;
+#endif
        unsigned long *dirty_bitmap;
        struct kvm_arch_memory_slot arch;
        unsigned long userspace_addr;
@@ -387,6 +391,13 @@ struct kvm_memslots {
        int used_slots;
 };
 
+#ifdef CONFIG_KVM_MROE
+struct roe_page {
+       void *page_start;
+       struct hlist_node hash_list;
+};
+#endif
+
 struct kvm {
        spinlock_t mmu_lock;
        struct mutex slots_lock;
@@ -440,6 +451,12 @@ struct kvm {
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
 #endif
+
+#ifdef CONFIG_KVM_MROE
+       //TODO tune hash size;
+       #define KVM_MROE_HASH_SIZE 8
+       DECLARE_HASHTABLE(roe_pages, KVM_MROE_HASH_SIZE);
+#endif
        long tlbs_dirty;
        struct list_head devices;
        struct dentry *debugfs_dentry;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index dcf629dd2889..2be960477649 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -17,6 +17,8 @@
 #define KVM_EPERM              EPERM
 #define KVM_EOPNOTSUPP         95
 
+#define KVM_EROEDUPLICATR      1
+
 #define KVM_HC_VAPIC_POLL_IRQ          1
 #define KVM_HC_MMU_OP                  2
 #define KVM_HC_FEATURES                        3
@@ -26,7 +28,7 @@
 #define KVM_HC_MIPS_EXIT_VM            7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT     8
 #define KVM_HC_CLOCK_PAIRING           9
-
+#define KVM_HC_HMROE           10
 /*
  * hypercalls use architecture specific
  */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..ca1b95a16a8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -634,7 +634,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
        mutex_init(&kvm->slots_lock);
        refcount_set(&kvm->users_count, 1);
        INIT_LIST_HEAD(&kvm->devices);
-
        r = kvm_arch_init_vm(kvm, type);
        if (r)
                goto out_err_no_disable;
@@ -794,6 +793,17 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
        return 0;
 }
 
+static int kvm_init_mroe_bitmap(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_KVM_MROE
+       slot->mroe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
+                               sizeof(unsigned long), GFP_KERNEL);
+       if (!slot->mroe_bitmap)
+               return -ENOMEM;
+#endif
+       return 0;
+}
+
 /*
  * Insert memslot and re-sort memslots based on their GFN,
  * so binary search could be used to lookup GFN.
@@ -1011,7 +1021,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
                if (kvm_create_dirty_bitmap(&new) < 0)
                        goto out_free;
        }
-
+       if (kvm_init_mroe_bitmap(&new) < 0)
+               goto out_free;
        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
        if (!slots)
                goto out_free;
@@ -1263,16 +1274,25 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
 {
        return slot->flags & KVM_MEM_READONLY;
 }
-
+static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+#ifdef CONFIG_KVM_MROE
+       pr_info("%s: test_bit = %d", __func__,
+               test_bit(gfn - slot->base_gfn, slot->mroe_bitmap));
+       ///dump_stack();
+       return test_bit(gfn - slot->base_gfn, slot->mroe_bitmap) ||
+               memslot_is_readonly(slot);
+#else
+       return memslot_is_readonly(slot);
+#endif
+}
 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
                                       gfn_t *nr_pages, bool write)
 {
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                return KVM_HVA_ERR_BAD;
-
-       if (memslot_is_readonly(slot) && write)
+       if (gfn_is_readonly(slot, gfn) && write)
                return KVM_HVA_ERR_RO_BAD;
-
        if (nr_pages)
                *nr_pages = slot->npages - (gfn - slot->base_gfn);
 
@@ -1314,7 +1334,7 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
        unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
        if (!kvm_is_error_hva(hva) && writable)
-               *writable = !memslot_is_readonly(slot);
+               *writable = !gfn_is_readonly(slot, gfn);
 
        return hva;
 }
@@ -1554,7 +1574,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
        }
 
        /* Do not map writable pfn in the readonly memslot. */
-       if (writable && memslot_is_readonly(slot)) {
+       if (writable && gfn_is_readonly(slot, gfn)) {
                *writable = false;
                writable = NULL;
        }
-- 
2.16.4
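As a point of comparison while debugging the flush: the existing write-protect paths push new SPTE permissions out to every vCPU through kvm_flush_remote_tlbs(), which raises KVM_REQ_TLB_FLUSH on all vCPUs and kicks the ones that are currently in guest mode; a vCPU sitting in OUTSIDE_GUEST_MODE is not interrupted and only consumes the request on its next guest entry. The helper below is only a sketch of that pattern (the function name is made up and does not exist in the tree):

/*
 * Sketch only, not part of the patch above.  kvm_flush_remote_tlbs() sets
 * KVM_REQ_TLB_FLUSH on every vCPU and kicks those that are IN_GUEST_MODE;
 * a vCPU in OUTSIDE_GUEST_MODE services the request on its next entry, so
 * the request can legitimately still read as pending right after the
 * hypercall handler returns.  Note that kvm_check_request() clears the
 * request it tests, so using it purely for logging changes the state that
 * is being observed.
 */
static void mroe_flush_after_protect(struct kvm *kvm, bool flush)
{
        if (flush)
                kvm_flush_remote_tlbs(kvm);
}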