Patch "[PATCH v2 for-4.9 25/40] powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown" has been added to the 4.9-stable tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a note to let you know that I've just added the patch titled

    [PATCH v2 for-4.9 25/40] powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown

to the 4.9-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     powerpc-mm-iommu-vfio-spapr-put-pages-on-vfio-container-shutdown.patch
and it can be found in the queue-4.9 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.


>From foo@baz Mon Mar 20 11:41:01 CET 2017
From: alexander.levin@xxxxxxxxxxx
Date: Fri, 17 Mar 2017 00:48:27 +0000
Subject: [PATCH v2 for-4.9 25/40] powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown
To: "gregkh@xxxxxxxxxxxxxxxxxxx" <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: "stable@xxxxxxxxxxxxxxx" <stable@xxxxxxxxxxxxxxx>
Message-ID: <20170317004812.26960-25-alexander.levin@xxxxxxxxxxx>

From: Alexey Kardashevskiy <aik@xxxxxxxxx>

[ Upstream commit 4b6fad7097f883335b6d9627c883cb7f276d94c9 ]

At the moment the userspace tool is expected to request pinning of
the entire guest RAM when VFIO IOMMU SPAPR v2 driver is present.
When the userspace process finishes, all the pinned pages need to
be put; this is done as a part of the userspace memory context (MM)
destruction which happens on the very last mmdrop().

This approach has a problem that a MM of the userspace process
may live longer than the userspace process itself as kernel threads
use userspace process MMs which was runnning on a CPU where
the kernel thread was scheduled to. If this happened, the MM remains
referenced until this exact kernel thread wakes up again
and releases the very last reference to the MM, on an idle system this
can take even hours.

This moves preregistered regions tracking from MM to VFIO; insteads of
using mm_iommu_table_group_mem_t::used, tce_container::prereg_list is
added so each container releases regions which it has pre-registered.

This changes the userspace interface to return EBUSY if a memory
region is already registered in a container. However it should not
have any practical effect as the only userspace tool available now
does register memory region once per container anyway.

As tce_iommu_register_pages/tce_iommu_unregister_pages are called
under container->lock, this does not need additional locking.

Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
Reviewed-by: Nicholas Piggin <npiggin@xxxxxxxxx>
Acked-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Signed-off-by: Sasha Levin <alexander.levin@xxxxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
---
 arch/powerpc/mm/mmu_context_book3s64.c |    4 --
 arch/powerpc/mm/mmu_context_iommu.c    |   11 -----
 drivers/vfio/vfio_iommu_spapr_tce.c    |   61 ++++++++++++++++++++++++++++++++-
 3 files changed, 61 insertions(+), 15 deletions(-)

--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -156,13 +156,11 @@ static inline void destroy_pagetable_pag
 }
 #endif
 
-
 void destroy_context(struct mm_struct *mm)
 {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-	mm_iommu_cleanup(mm);
+	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
-
 #ifdef CONFIG_PPC_ICSWX
 	drop_cop(mm->context.acop, mm);
 	kfree(mm->context.cop_lockp);
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -365,14 +365,3 @@ void mm_iommu_init(struct mm_struct *mm)
 {
 	INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
 }
-
-void mm_iommu_cleanup(struct mm_struct *mm)
-{
-	struct mm_iommu_table_group_mem_t *mem, *tmp;
-
-	list_for_each_entry_safe(mem, tmp, &mm->context.iommu_group_mem_list,
-			next) {
-		list_del_rcu(&mem->next);
-		mm_iommu_do_free(mem);
-	}
-}
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -89,6 +89,15 @@ struct tce_iommu_group {
 };
 
 /*
+ * A container needs to remember which preregistered region  it has
+ * referenced to do proper cleanup at the userspace process exit.
+ */
+struct tce_iommu_prereg {
+	struct list_head next;
+	struct mm_iommu_table_group_mem_t *mem;
+};
+
+/*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
  * at the moment of initialization.
@@ -101,6 +110,7 @@ struct tce_container {
 	struct mm_struct *mm;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
 	struct list_head group_list;
+	struct list_head prereg_list;
 };
 
 static long tce_iommu_mm_set(struct tce_container *container)
@@ -117,10 +127,27 @@ static long tce_iommu_mm_set(struct tce_
 	return 0;
 }
 
+static long tce_iommu_prereg_free(struct tce_container *container,
+		struct tce_iommu_prereg *tcemem)
+{
+	long ret;
+
+	ret = mm_iommu_put(container->mm, tcemem->mem);
+	if (ret)
+		return ret;
+
+	list_del(&tcemem->next);
+	kfree(tcemem);
+
+	return 0;
+}
+
 static long tce_iommu_unregister_pages(struct tce_container *container,
 		__u64 vaddr, __u64 size)
 {
 	struct mm_iommu_table_group_mem_t *mem;
+	struct tce_iommu_prereg *tcemem;
+	bool found = false;
 
 	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 		return -EINVAL;
@@ -129,7 +156,17 @@ static long tce_iommu_unregister_pages(s
 	if (!mem)
 		return -ENOENT;
 
-	return mm_iommu_put(container->mm, mem);
+	list_for_each_entry(tcemem, &container->prereg_list, next) {
+		if (tcemem->mem == mem) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	return tce_iommu_prereg_free(container, tcemem);
 }
 
 static long tce_iommu_register_pages(struct tce_container *container,
@@ -137,16 +174,29 @@ static long tce_iommu_register_pages(str
 {
 	long ret = 0;
 	struct mm_iommu_table_group_mem_t *mem = NULL;
+	struct tce_iommu_prereg *tcemem;
 	unsigned long entries = size >> PAGE_SHIFT;
 
 	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 			((vaddr + size) < vaddr))
 		return -EINVAL;
 
+	mem = mm_iommu_find(container->mm, vaddr, entries);
+	if (mem) {
+		list_for_each_entry(tcemem, &container->prereg_list, next) {
+			if (tcemem->mem == mem)
+				return -EBUSY;
+		}
+	}
+
 	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
 	if (ret)
 		return ret;
 
+	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
+	tcemem->mem = mem;
+	list_add(&tcemem->next, &container->prereg_list);
+
 	container->enabled = true;
 
 	return 0;
@@ -333,6 +383,7 @@ static void *tce_iommu_open(unsigned lon
 
 	mutex_init(&container->lock);
 	INIT_LIST_HEAD_RCU(&container->group_list);
+	INIT_LIST_HEAD_RCU(&container->prereg_list);
 
 	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 
@@ -371,6 +422,14 @@ static void tce_iommu_release(void *iomm
 		tce_iommu_free_table(container, tbl);
 	}
 
+	while (!list_empty(&container->prereg_list)) {
+		struct tce_iommu_prereg *tcemem;
+
+		tcemem = list_first_entry(&container->prereg_list,
+				struct tce_iommu_prereg, next);
+		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
+	}
+
 	tce_iommu_disable(container);
 	if (container->mm)
 		mmdrop(container->mm);


Patches currently in stable-queue which might be from gregkh@xxxxxxxxxxxxxxxxxxx are

queue-4.9/pci-add-comments-about-rom-bar-updating.patch
queue-4.9/acpi-blacklist-make-dell-latitude-3350-ethernet-work.patch
queue-4.9/s390-zcrypt-introduce-cex6-toleration.patch
queue-4.9/dccp-tcp-fix-routing-redirect-race.patch
queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch
queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch
queue-4.9/block-allow-write_same-commands-with-the-sg_io-ioctl.patch
queue-4.9/strparser-destroy-workqueue-on-module-exit.patch
queue-4.9/powerpc-mm-fix-build-break-when-cma-n-spapr_tce_iommu-y.patch
queue-4.9/vfio-spapr-postpone-default-window-creation.patch
queue-4.9/vfio-spapr-add-a-helper-to-create-default-dma-window.patch
queue-4.9/pci-do-any-vf-bar-updates-before-enabling-the-bars.patch
queue-4.9/usb-gadget-udc-atmel-remove-memory-leak.patch
queue-4.9/x86-hyperv-handle-unknown-nmis-on-one-cpu-when-unknown_nmi_panic.patch
queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch
queue-4.9/serial-8250_pci-detach-low-level-driver-during-pci-error-recovery.patch
queue-4.9/powerpc-iommu-stop-using-current-in-mm_iommu_xxx.patch
queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch
queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch
queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch
queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch
queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch
queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch
queue-4.9/xen-do-not-re-use-pirq-number-cached-in-pci-device-msi-msg-data.patch
queue-4.9/vxlan-lock-rcu-on-tx-path.patch
queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch
queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch
queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch
queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch
queue-4.9/pci-separate-vf-bar-updates-from-standard-bar-updates.patch
queue-4.9/pci-ignore-bar-updates-on-virtual-functions.patch
queue-4.9/geneve-lock-rcu-on-tx-path.patch
queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch
queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch
queue-4.9/powerpc-mm-iommu-vfio-spapr-put-pages-on-vfio-container-shutdown.patch
queue-4.9/bpf-fix-state-equivalence.patch
queue-4.9/scsi-ibmvscsis-clean-up-properly-if-target_submit_cmd-tmr-fails.patch
queue-4.9/drm-nouveau-disp-gp102-fix-cursor-overlay-immediate-channel-indices.patch
queue-4.9/pci-update-bars-using-property-bits-appropriate-for-type.patch
queue-4.9/scsi-ibmvscsis-synchronize-cmds-at-remove-time.patch
queue-4.9/vfio-spapr-postpone-allocation-of-userspace-version-of-tce-table.patch
queue-4.9/ibmveth-calculate-gso_segs-for-large-packets.patch
queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch
queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch
queue-4.9/pci-don-t-update-vf-bars-while-vf-memory-space-is-enabled.patch
queue-4.9/ipv4-mask-tos-for-input-route.patch
queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch
queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch
queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch
queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch
queue-4.9/igb-workaround-for-igb-i210-firmware-issue.patch
queue-4.9/drivers-hv-ring_buffer-count-on-wrap-around-mappings-in-get_next_pkt_raw-v2.patch
queue-4.9/drm-nouveau-disp-nv50-specify-ctrl-user-separately-when-constructing-classes.patch
queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch
queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch
queue-4.9/pci-remove-pci_resource_bar-and-pci_iov_resource_bar.patch
queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch
queue-4.9/dmaengine-iota-ioat_alloc_chan_resources-should-not-perform-sleeping-allocations.patch
queue-4.9/scsi-ibmvscsis-return-correct-partition-name-to-client.patch
queue-4.9/vti6-return-gre_key-for-vti6.patch
queue-4.9/vfio-spapr-reference-mm-in-tce_container.patch
queue-4.9/scsi-ibmvscsis-rearrange-functions-for-future-patches.patch
queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch
queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch
queue-4.9/powerpc-iommu-pass-mm_struct-to-init-cleanup-helpers.patch
queue-4.9/slub-move-synchronize_sched-out-of-slab_mutex-on-shrink.patch
queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch
queue-4.9/pci-decouple-ioresource_rom_enable-and-pci_rom_address_enable.patch
queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch
queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch
queue-4.9/scsi-ibmvscsis-issues-from-dan-carpenter-smatch.patch
queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch
queue-4.9/acpi-blacklist-add-_rev-quirks-for-dell-precision-5520-and-3520.patch
queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch
queue-4.9/igb-add-i211-to-i210-phy-workaround.patch
queue-4.9/drm-nouveau-disp-nv50-split-chid-into-chid.ctrl-and-chid.user.patch
queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch
queue-4.9/scsi-ibmvscsis-synchronize-cmds-at-tpg_enable_store-time.patch
queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch
queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch
queue-4.9/uvcvideo-uvc_scan_fallback-for-webcams-with-broken-chain.patch



[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]