Hi, Le jeudi 11 décembre 2014 à 17:04 +0200, Haggai Eran a écrit : > From: Sagi Grimberg <sagig@xxxxxxxxxxxx> > > * Add a configuration option for enable on-demand paging support in the > infiniband subsystem (CONFIG_INFINIBAND_ON_DEMAND_PAGING). In a later patch, > this configuration option will select the MMU_NOTIFIER configuration option > to enable mmu notifiers. > * Add a flag for on demand paging (ODP) support in the IB device capabilities. > * Add a flag to request ODP MR in the access flags to reg_mr. > * Fail registrations done with the ODP flag when the low-level driver doesn't > support this. > * Change the conditions in which an MR will be writable to explicitly > specify the access flags. This is to avoid making an MR writable just > because it is an ODP MR. > * Add a ODP capabilities to the extended query device verb. > > Signed-off-by: Sagi Grimberg <sagig@xxxxxxxxxxxx> > Signed-off-by: Shachar Raindel <raindel@xxxxxxxxxxxx> > Signed-off-by: Haggai Eran <haggaie@xxxxxxxxxxxx> > --- > drivers/infiniband/Kconfig | 10 ++++++++++ > drivers/infiniband/core/umem.c | 8 +++++--- > drivers/infiniband/core/uverbs_cmd.c | 25 +++++++++++++++++++++++++ > include/rdma/ib_verbs.h | 28 ++++++++++++++++++++++++++-- > include/uapi/rdma/ib_user_verbs.h | 15 +++++++++++++++ > 5 files changed, 81 insertions(+), 5 deletions(-) > > diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig > index 77089399359b..089a2c2af329 100644 > --- a/drivers/infiniband/Kconfig > +++ b/drivers/infiniband/Kconfig > @@ -38,6 +38,16 @@ config INFINIBAND_USER_MEM > depends on INFINIBAND_USER_ACCESS != n > default y > > +config INFINIBAND_ON_DEMAND_PAGING > + bool "InfiniBand on-demand paging support" > + depends on INFINIBAND_USER_MEM > + default y > + ---help--- > + On demand paging support for the InfiniBand subsystem. > + Together with driver support this allows registration of > + memory regions without pinning their pages, fetching the > + pages on demand instead. 
> + > config INFINIBAND_ADDR_TRANS > bool > depends on INFINIBAND > diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c > index 6f152628e0d2..c328e4693d14 100644 > --- a/drivers/infiniband/core/umem.c > +++ b/drivers/infiniband/core/umem.c > @@ -107,13 +107,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, > umem->page_size = PAGE_SIZE; > umem->pid = get_task_pid(current, PIDTYPE_PID); > /* > - * We ask for writable memory if any access flags other than > - * "remote read" are set. "Local write" and "remote write" > + * We ask for writable memory if any of the following > + * access flags are set. "Local write" and "remote write" > * obviously require write access. "Remote atomic" can do > * things like fetch and add, which will modify memory, and > * "MW bind" can change permissions by binding a window. > */ > - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); > + umem->writable = !!(access & > + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | > + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); > > /* We assume the memory is from hugetlb until proved otherwise */ > umem->hugetlb = 1; > diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c > index c7a43624c96b..f9326ccda4b5 100644 > --- a/drivers/infiniband/core/uverbs_cmd.c > +++ b/drivers/infiniband/core/uverbs_cmd.c > @@ -953,6 +953,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, > goto err_free; > } > > + if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { > + struct ib_device_attr attr; > + > + ret = ib_query_device(pd->device, &attr); > + if (ret || !(attr.device_cap_flags & > + IB_DEVICE_ON_DEMAND_PAGING)) { > + pr_debug("ODP support not available\n"); > + ret = -EINVAL; > + goto err_put; > + } > + } > + > mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, > cmd.access_flags, &udata); > if (IS_ERR(mr)) { > @@ -3289,6 +3301,19 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, > 
copy_query_dev_fields(file, &resp.base, &attr); > resp.comp_mask = 0; > > +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING > + if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) { > + resp.odp_caps.general_caps = attr.odp_caps.general_caps; > + resp.odp_caps.per_transport_caps.rc_odp_caps = > + attr.odp_caps.per_transport_caps.rc_odp_caps; > + resp.odp_caps.per_transport_caps.uc_odp_caps = > + attr.odp_caps.per_transport_caps.uc_odp_caps; > + resp.odp_caps.per_transport_caps.ud_odp_caps = > + attr.odp_caps.per_transport_caps.ud_odp_caps; > + resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP; > + } You need to clear the tail of the response; otherwise, the kernel will leak stack contents to userspace: + #else /* !CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + resp.odp_caps.general_caps = 0; + resp.odp_caps.per_transport_caps.rc_odp_caps = 0; + resp.odp_caps.per_transport_caps.uc_odp_caps = 0; + resp.odp_caps.per_transport_caps.ud_odp_caps = 0; > +#endif > + + resp.odp_caps.reserved = 0; > err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); > if (err) > return err; > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h > index 97a999f9e4d8..a41bc5a39ebf 100644 > --- a/include/rdma/ib_verbs.h > +++ b/include/rdma/ib_verbs.h > @@ -123,7 +123,8 @@ enum ib_device_cap_flags { > IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), > IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), > IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), > - IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) > + IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), > + IB_DEVICE_ON_DEMAND_PAGING = (1<<31), > }; > > enum ib_signature_prot_cap { > @@ -143,6 +144,27 @@ enum ib_atomic_cap { > IB_ATOMIC_GLOB > }; > > +enum ib_odp_general_cap_bits { > + IB_ODP_SUPPORT = 1 << 0, > +}; > + > +enum ib_odp_transport_cap_bits { > + IB_ODP_SUPPORT_SEND = 1 << 0, > + IB_ODP_SUPPORT_RECV = 1 << 1, > + IB_ODP_SUPPORT_WRITE = 1 << 2, > + IB_ODP_SUPPORT_READ = 1 << 3, > + IB_ODP_SUPPORT_ATOMIC = 1 << 4, > +}; > + > +struct ib_odp_caps { > + uint64_t general_caps; > + 
struct { > + uint32_t rc_odp_caps; > + uint32_t uc_odp_caps; > + uint32_t ud_odp_caps; > + } per_transport_caps; > +}; > + > struct ib_device_attr { > u64 fw_ver; > __be64 sys_image_guid; > @@ -186,6 +208,7 @@ struct ib_device_attr { > u8 local_ca_ack_delay; > int sig_prot_cap; > int sig_guard_cap; > + struct ib_odp_caps odp_caps; > }; > > enum ib_mtu { > @@ -1073,7 +1096,8 @@ enum ib_access_flags { > IB_ACCESS_REMOTE_READ = (1<<2), > IB_ACCESS_REMOTE_ATOMIC = (1<<3), > IB_ACCESS_MW_BIND = (1<<4), > - IB_ZERO_BASED = (1<<5) > + IB_ZERO_BASED = (1<<5), > + IB_ACCESS_ON_DEMAND = (1<<6), > }; > > struct ib_phys_buf { > diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h > index e8a96071e352..4275b961bf60 100644 > --- a/include/uapi/rdma/ib_user_verbs.h > +++ b/include/uapi/rdma/ib_user_verbs.h > @@ -202,15 +202,30 @@ struct ib_uverbs_query_device_resp { > __u8 reserved[4]; > }; > > +enum { > + IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0, > +}; > + > struct ib_uverbs_ex_query_device { > __u32 comp_mask; > __u32 reserved; > }; > > +struct ib_uverbs_odp_caps { > + __u64 general_caps; > + struct { > + __u32 rc_odp_caps; > + __u32 uc_odp_caps; > + __u32 ud_odp_caps; > + } per_transport_caps; > + __u32 reserved; > +}; > + > struct ib_uverbs_ex_query_device_resp { > struct ib_uverbs_query_device_resp base; > __u32 comp_mask; > __u32 reserved; > + struct ib_uverbs_odp_caps odp_caps; > }; Hopefully, no kernel was released with ib_uverbs_ex_query_device_resp without odp_caps (e.g. in between '[PATCH v3 06/17] IB/core: Add support for extended query device caps' and this one); otherwise, ib_uverbs_ex_query_device() should have been modified to handle a shorter ib_uverbs_ex_query_device_resp to accommodate the ABI variations. > > struct ib_uverbs_query_port { Regards. 
-- Yann Droneaud -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html