Jonathan Cameron wrote:
> On Fri, 16 Aug 2024 09:44:26 -0500
> ira.weiny@xxxxxxxxx wrote:
>
> > From: Navneet Singh <navneet.singh@xxxxxxxxx>
> >

[snip]

> > +static int match_contains(struct device *dev, void *data)
> > +{
> > +	struct region_extent *region_extent = to_region_extent(dev);
> > +	struct match_data *md = data;
> > +	struct cxled_extent *entry;
> > +	unsigned long index;
> > +
> > +	if (!region_extent)
> > +		return 0;
> > +
> > +	xa_for_each(&region_extent->decoder_extents, index, entry) {
> > +		if (md->cxled == entry->cxled &&
> > +		    range_contains(&entry->dpa_range, md->new_range))
> > +			return true;
>
> As below, this returns int, so shouldn't be true or false.

Yep. Thanks.

>
> > +	}
> > +	return false;
> > +}
>
> > +static int match_overlaps(struct device *dev, void *data)
> > +{
> > +	struct region_extent *region_extent = to_region_extent(dev);
> > +	struct match_data *md = data;
> > +	struct cxled_extent *entry;
> > +	unsigned long index;
> > +
> > +	if (!region_extent)
> > +		return 0;
> > +
> > +	xa_for_each(&region_extent->decoder_extents, index, entry) {
> > +		if (md->cxled == entry->cxled &&
> > +		    range_overlaps(&entry->dpa_range, md->new_range))
> > +			return true;
>
> returns int, so returning true or false is odd.

Yep.

>
> > +	}
> > +
> > +	return false;
> > +}
> >
> > +int cxl_rm_extent(struct cxl_memdev_state *mds, struct cxl_extent *extent)
> > +{
> > +	u64 start_dpa = le64_to_cpu(extent->start_dpa);
> > +	struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
> > +	struct cxl_endpoint_decoder *cxled;
> > +	struct range hpa_range, dpa_range;
> > +	struct cxl_region *cxlr;
> > +
> > +	dpa_range = (struct range) {
> > +		.start = start_dpa,
> > +		.end = start_dpa + le64_to_cpu(extent->length) - 1,
> > +	};
> > +
> > +	guard(rwsem_read)(&cxl_region_rwsem);
> > +	cxlr = cxl_dpa_to_region(cxlmd, start_dpa, &cxled);
> > +	if (!cxlr) {
> > +		memdev_release_extent(mds, &dpa_range);
>
> How does this condition happen?  Perhaps a comment needed.

Fair enough. Proposed comment:

	/*
	 * Having no region here can happen for a few reasons:
	 *
	 * 1) Extents were accepted and the host crashed/rebooted
	 *    leaving them in an accepted state.  On reboot the host
	 *    has not yet created a region to own them.
	 *
	 * 2) Region destruction won the race with the device releasing
	 *    all the extents.  Here the release will be a duplicate of
	 *    the one sent via region destruction.
	 *
	 * 3) The device is confused and releasing extents for which no
	 *    region ever existed.
	 *
	 * In all these cases make sure the device knows we are not
	 * using this extent.
	 */

Item 2 is AFAICS ok with the spec.

>
> > +		return -ENXIO;
> > +	}
> > +
> > +	calc_hpa_range(cxled, cxlr->cxlr_dax, &dpa_range, &hpa_range);
> > +
> > +	/* Remove region extents which overlap */
> > +	return device_for_each_child(&cxlr->cxlr_dax->dev, &hpa_range,
> > +				     cxlr_rm_extent);
> > +}
> > +
> > +static int cxlr_add_extent(struct cxl_dax_region *cxlr_dax,
> > +			   struct cxl_endpoint_decoder *cxled,
> > +			   struct cxled_extent *ed_extent)
> > +{
> > +	struct region_extent *region_extent;
> > +	struct range hpa_range;
> > +	int rc;
> > +
> > +	calc_hpa_range(cxled, cxlr_dax, &ed_extent->dpa_range, &hpa_range);
> > +
> > +	region_extent = alloc_region_extent(cxlr_dax, &hpa_range, ed_extent->tag);
> > +	if (IS_ERR(region_extent))
> > +		return PTR_ERR(region_extent);
> > +
> > +	rc = xa_insert(&region_extent->decoder_extents, (unsigned long)ed_extent, ed_extent,
>
> I'd wrap that earlier to keep the line a bit shorter.

Done.
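
Wrapped it roughly like this (untested, same call just re-broken earlier):

	rc = xa_insert(&region_extent->decoder_extents,
		       (unsigned long)ed_extent, ed_extent,
		       GFP_KERNEL);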
>
> > +		       GFP_KERNEL);
> > +	if (rc) {
> > +		free_region_extent(region_extent);
> > +		return rc;
> > +	}
> > +
> > +	/* device model handles freeing region_extent */
> > +	return online_region_extent(region_extent);
> > +}
> > +
> > +/* Callers are expected to ensure cxled has been attached to a region */
> > +int cxl_add_extent(struct cxl_memdev_state *mds, struct cxl_extent *extent)
> > +{
> > +	u64 start_dpa = le64_to_cpu(extent->start_dpa);
> > +	struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
> > +	struct cxl_endpoint_decoder *cxled;
> > +	struct range ed_range, ext_range;
> > +	struct cxl_dax_region *cxlr_dax;
> > +	struct cxled_extent *ed_extent;
> > +	struct cxl_region *cxlr;
> > +	struct device *dev;
> > +
> > +	ext_range = (struct range) {
> > +		.start = start_dpa,
> > +		.end = start_dpa + le64_to_cpu(extent->length) - 1,
> > +	};
> > +
> > +	guard(rwsem_read)(&cxl_region_rwsem);
> > +	cxlr = cxl_dpa_to_region(cxlmd, start_dpa, &cxled);
> > +	if (!cxlr)
> > +		return -ENXIO;
> > +
> > +	cxlr_dax = cxled->cxld.region->cxlr_dax;
> > +	dev = &cxled->cxld.dev;
> > +	ed_range = (struct range) {
> > +		.start = cxled->dpa_res->start,
> > +		.end = cxled->dpa_res->end,
> > +	};
> > +
> > +	dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent %par\n",
> > +		cxled->dpa_res, &ext_range);
> > +
> > +	if (!range_contains(&ed_range, &ext_range)) {
> > +		dev_err_ratelimited(dev,
> > +				    "DC extent DPA %par (%*phC) is not fully in ED %par\n",
> > +				    &ext_range.start, CXL_EXTENT_TAG_LEN,
> > +				    extent->tag, &ed_range);
> > +		return -ENXIO;
> > +	}
> > +
> > +	if (extents_contain(cxlr_dax, cxled, &ext_range))
>
> This case confuses me.  If the extents are already there I think we should
> error out or at least print something as that's very wrong.

I thought we discussed this in one of the community meetings and decided it
would be ok to accept these. We could certainly print a warning here.

In all honesty I'm wondering if these restrictions are really needed
anymore. But at the same time I really, really, really don't think anyone
has a good use case which requires supporting these cases. So I'm keeping
the code simple for now.

>
> > +		return 0;
> > +
> > +	if (extents_overlap(cxlr_dax, cxled, &ext_range))
> > +		return -ENXIO;
> > +
> > +	ed_extent = kzalloc(sizeof(*ed_extent), GFP_KERNEL);
> > +	if (!ed_extent)
> > +		return -ENOMEM;
> > +
> > +	ed_extent->cxled = cxled;
> > +	ed_extent->dpa_range = ext_range;
> > +	memcpy(ed_extent->tag, extent->tag, CXL_EXTENT_TAG_LEN);
> > +
> > +	dev_dbg(dev, "Add extent %par (%*phC)\n", &ed_extent->dpa_range,
> > +		CXL_EXTENT_TAG_LEN, ed_extent->tag);
> > +
> > +	return cxlr_add_extent(cxlr_dax, cxled, ed_extent);
> > +}
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 01a447aaa1b1..f629ad7488ac 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -882,6 +882,48 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
> >
> > +static int cxl_validate_extent(struct cxl_memdev_state *mds,
> > +			       struct cxl_extent *extent)
> > +{
> > +	u64 start = le64_to_cpu(extent->start_dpa);
> > +	u64 length = le64_to_cpu(extent->length);
> > +	struct device *dev = mds->cxlds.dev;
> > +
> > +	struct range ext_range = (struct range){
> > +		.start = start,
> > +		.end = start + length - 1,
> > +	};
> > +
> > +	if (le16_to_cpu(extent->shared_extn_seq) != 0) {
>
> That's not the 'main' way to tell if an extent is shared because
> we could have a single extent (so seq == 0).
> Should verify it's not in a DCD region that
> is shareable to make this decision.

Ah... :-/

>
> I've lost track on the region handling so maybe you already do
> this by not including those regions at all?

I don't think so. I'll add the region check.

I see now why I glossed over this though. The shared nature of a DCD
partition is defined in the DSMAS. Is that correct? Or am I missing
something in the spec?

>
> > +		dev_err_ratelimited(dev,
> > +				    "DC extent DPA %par (%*phC) can not be shared\n",
> > +				    &ext_range.start, CXL_EXTENT_TAG_LEN,
> > +				    extent->tag);
> > +		return -ENXIO;
> > +	}
> > +
> > +	/* Extents must not cross DC region boundary's */
> > +	for (int i = 0; i < mds->nr_dc_region; i++) {
> > +		struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +		struct range region_range = (struct range) {
> > +			.start = dcr->base,
> > +			.end = dcr->base + dcr->decode_len - 1,
> > +		};
> > +
> > +		if (range_contains(&region_range, &ext_range)) {
> > +			dev_dbg(dev, "DC extent DPA %par (DCR:%d:%#llx)(%*phC)\n",
> > +				&ext_range, i, start - dcr->base,
> > +				CXL_EXTENT_TAG_LEN, extent->tag);
> > +			return 0;
> > +		}
> > +	}
> > +
> > +	dev_err_ratelimited(dev,
> > +			    "DC extent DPA %par (%*phC) is not in any DC region\n",
> > +			    &ext_range, CXL_EXTENT_TAG_LEN, extent->tag);
> > +	return -ENXIO;
> > +}
> > +
> >  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> >  			    enum cxl_event_log_type type,
> >  			    enum cxl_event_type event_type,
> > @@ -1009,6 +1051,207 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
> >  	return rc;
> >  }
> >
> > +static int cxl_send_dc_response(struct cxl_memdev_state *mds, int opcode,
> > +				struct xarray *extent_array, int cnt)
> > +{
> > +	struct cxl_mbox_dc_response *p;
> > +	struct cxl_mbox_cmd mbox_cmd;
> > +	struct cxl_extent *extent;
> > +	unsigned long index;
> > +	u32 pl_index;
> > +	int rc = 0;
> > +
> > +	size_t pl_size = struct_size(p, extent_list, cnt);
> > +	u32 max_extents = cnt;
> > +
>
> What if cnt is zero?  All extents rejected so none in the
> extent_array.  Need to send a zero extent response to reject
> them all IIRC.

Yes, I missed that. Thanks.

>
> > +	/* May have to use more bit on response. */
> > +	if (pl_size > mds->payload_size) {
> > +		max_extents = (mds->payload_size - sizeof(*p)) /
> > +			      sizeof(struct updated_extent_list);
> > +		pl_size = struct_size(p, extent_list, max_extents);
> > +	}
> > +
> > +	struct cxl_mbox_dc_response *response __free(kfree) =
> > +		kzalloc(pl_size, GFP_KERNEL);
> > +	if (!response)
> > +		return -ENOMEM;
> > +
> > +	pl_index = 0;
> > +	xa_for_each(extent_array, index, extent) {
> > +
> > +		response->extent_list[pl_index].dpa_start = extent->start_dpa;
> > +		response->extent_list[pl_index].length = extent->length;
> > +		pl_index++;
> > +		response->extent_list_size = cpu_to_le32(pl_index);
> > +
> > +		if (pl_index == max_extents) {
> > +			mbox_cmd = (struct cxl_mbox_cmd) {
> > +				.opcode = opcode,
> > +				.size_in = struct_size(response, extent_list,
> > +						       pl_index),
> > +				.payload_in = response,
> > +			};
> > +
> > +			response->flags = 0;
> > +			if (pl_index < cnt)
> > +				response->flags &= CXL_DCD_EVENT_MORE;
> > +
> > +			rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> > +			if (rc)
> > +				return rc;
> > +			pl_index = 0;
> > +		}
> > +	}
> > +
> > +	if (pl_index) {
>
> 	|| !cnt
>
> I think so we send a nothing accepted message.

Yep.
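
With cnt == 0 nothing is copied into the response, so the final send just
needs to also fire for the empty case; something like this (untested, only
the condition changes):

	/* cnt == 0: all extents rejected; send an empty response */
	if (pl_index || !cnt) {

The kzalloc()'ed response already carries extent_list_size == 0, which is
the "nothing accepted" payload.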
>
> > +		mbox_cmd = (struct cxl_mbox_cmd) {
> > +			.opcode = opcode,
> > +			.size_in = struct_size(response, extent_list,
> > +					       pl_index),
> > +			.payload_in = response,
> > +		};
> > +
> > +		response->flags = 0;
> > +		rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> 		if (rc)
> 			return rc;
> > +	}
> > +
>
> 	return 0;
>
> So that reader doesn't have to check what rc was in !pl_index
> case and avoids assigning rc right at the top.

Ah thanks. That might have been left over from something previous.

> >
> > +	return rc;
> > +}
> >
> > +static int cxl_add_pending(struct cxl_memdev_state *mds)
> > +{
> > +	struct device *dev = mds->cxlds.dev;
> > +	struct cxl_extent *extent;
> > +	unsigned long index;
> > +	unsigned long cnt = 0;
> > +	int rc;
> > +
> > +	xa_for_each(&mds->pending_extents, index, extent) {
> > +		if (validate_add_extent(mds, extent)) {
>
> Add a comment here that not accepting an extent but
> accepting some or none means this one was rejected (I'd forgotten how
> that bit worked)

Ok yeah that may not be clear without reading the spec closely:

	/*
	 * Any extents which are to be rejected are omitted from
	 * the response.  An empty response means all are
	 * rejected.
	 */

>
> > +			dev_dbg(dev, "unconsumed DC extent DPA:%#llx LEN:%#llx\n",
> > +				le64_to_cpu(extent->start_dpa),
> > +				le64_to_cpu(extent->length));
> > +			xa_erase(&mds->pending_extents, index);
> > +			kfree(extent);
> > +			continue;
> > +		}
> > +		cnt++;
> > +	}
> > +	rc = cxl_send_dc_response(mds, CXL_MBOX_OP_ADD_DC_RESPONSE,
> > +				  &mds->pending_extents, cnt);
> > +	xa_for_each(&mds->pending_extents, index, extent) {
> > +		xa_erase(&mds->pending_extents, index);
> > +		kfree(extent);
> > +	}
> > +	return rc;
> > +}
> > +
> > +static int handle_add_event(struct cxl_memdev_state *mds,
> > +			    struct cxl_event_dcd *event)
> > +{
> > +	struct cxl_extent *tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
> > +	struct device *dev = mds->cxlds.dev;
> > +
> > +	if (!tmp)
> > +		return -ENOMEM;
> > +
> > +	memcpy(tmp, &event->extent, sizeof(*tmp));
>
> kmemdup?

Yep.

>
> > +	if (xa_insert(&mds->pending_extents, (unsigned long)tmp, tmp,
> > +		      GFP_KERNEL)) {
> > +		kfree(tmp);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	if (event->flags & CXL_DCD_EVENT_MORE) {
> > +		dev_dbg(dev, "more bit set; delay the surfacing of extent\n");
> > +		return 0;
> > +	}
> > +
> > +	/* extents are removed and free'ed in cxl_add_pending() */
> > +	return cxl_add_pending(mds);
> > +}
>
> >  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
> >  				    enum cxl_event_log_type type)
> >  {
> > @@ -1044,9 +1287,17 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
> >  		if (!nr_rec)
> >  			break;
> >
> > -		for (i = 0; i < nr_rec; i++)
> > +		for (i = 0; i < nr_rec; i++) {
> >  			__cxl_event_trace_record(cxlmd, type,
> >  						 &payload->records[i]);
> > +			if (type == CXL_EVENT_TYPE_DCD) {
>
> Bit of a deep indent so maybe flip logic?
>
> Logic wise it's a bit dubious as we might want to match other
> types in future though so up to you.

I was thinking more along these lines. But the rc is unneeded. That
print can be in the handle function.
Something like this:

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 88b823afe482..e86a483d80eb 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1231,16 +1231,17 @@ static char *cxl_dcd_evt_type_str(u8 type)
 	return "<unknown>";
 }
 
-static int cxl_handle_dcd_event_records(struct cxl_memdev_state *mds,
+static void cxl_handle_dcd_event_records(struct cxl_memdev_state *mds,
 					struct cxl_event_record_raw *raw_rec)
 {
 	struct cxl_event_dcd *event = &raw_rec->event.dcd;
 	struct cxl_extent *extent = &event->extent;
 	struct device *dev = mds->cxlds.dev;
 	uuid_t *id = &raw_rec->id;
+	int rc;
 
 	if (!uuid_equal(id, &CXL_EVENT_DC_EVENT_UUID))
-		return -EINVAL;
+		return;
 
 	dev_dbg(dev, "DCD event %s : DPA:%#llx LEN:%#llx\n",
 		cxl_dcd_evt_type_str(event->event_type),
@@ -1248,15 +1249,22 @@ static int cxl_handle_dcd_event_records(struct cxl_memdev_state *mds,
 
 	switch (event->event_type) {
 	case DCD_ADD_CAPACITY:
-		return handle_add_event(mds, event);
+		rc = handle_add_event(mds, event);
+		break;
 	case DCD_RELEASE_CAPACITY:
-		return cxl_rm_extent(mds, &event->extent);
+		rc = cxl_rm_extent(mds, &event->extent);
+		break;
 	case DCD_FORCED_CAPACITY_RELEASE:
 		dev_err_ratelimited(dev, "Forced release event ignored.\n");
-		return 0;
+		rc = 0;
+		break;
 	default:
-		return -EINVAL;
+		rc = -EINVAL;
+		break;
 	}
+
+	if (rc)
+		dev_err_ratelimited(dev, "dcd event failed: %d\n", rc);
 }
 
 static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
@@ -1297,13 +1305,9 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
 		for (i = 0; i < nr_rec; i++) {
 			__cxl_event_trace_record(cxlmd, type,
 						 &payload->records[i]);
-			if (type == CXL_EVENT_TYPE_DCD) {
-				rc = cxl_handle_dcd_event_records(mds,
-								  &payload->records[i]);
-				if (rc)
-					dev_err_ratelimited(dev, "dcd event failed: %d\n",
-							    rc);
-			}
+			if (type == CXL_EVENT_TYPE_DCD)
+				cxl_handle_dcd_event_records(mds,
+							     &payload->records[i]);
 		}
 
 		if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)

<end diff>

>
> 	if (type != CXL_EVENT_TYPE_DCD)
> 		continue;
>
> 	rc =
>
> > +			rc = cxl_handle_dcd_event_records(mds,
> > +							  &payload->records[i]);
> > +			if (rc)
> > +				dev_err_ratelimited(dev, "dcd event failed: %d\n",
> > +						    rc);
> > +		}
> > +	}
> >
> >
> >  struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
> >  {
> >  	struct cxl_memdev_state *mds;
> > @@ -1628,6 +1892,8 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
> >  	mds->cxlds.type = CXL_DEVTYPE_CLASSMEM;
> >  	mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
> >  	mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
> > +	xa_init(&mds->pending_extents);
> > +	devm_add_action_or_reset(dev, clear_pending_extents, mds);
>
> Why don't you need to check if this failed?  Definitely seems unlikely
> to leave things in a good state.  Unlikely to fail of course, but you
> never know.

Yea, good catch.
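
Something along these lines perhaps (untested sketch, assuming an rc local
in cxl_memdev_state_create):

	rc = devm_add_action_or_reset(dev, clear_pending_extents, mds);
	if (rc)
		return ERR_PTR(rc);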
>
> >
> >  	return mds;
> >  }
>
> > @@ -3090,6 +3091,8 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
> >
> >  	dev = &cxlr_dax->dev;
> >  	cxlr_dax->cxlr = cxlr;
> > +	cxlr->cxlr_dax = cxlr_dax;
> > +	ida_init(&cxlr_dax->extent_ida);
> >  	device_initialize(dev);
> >  	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
> >  	device_set_pm_not_required(dev);
> > @@ -3190,7 +3193,10 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
> >  static void cxlr_dax_unregister(void *_cxlr_dax)
> >  {
> >  	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> > +	struct cxl_region *cxlr = cxlr_dax->cxlr;
> >
> > +	cxlr->cxlr_dax = NULL;
> > +	cxlr_dax->cxlr = NULL;
>
> cxlr_dax->cxlr was assigned before this patch.
>
> I'm not seeing any new checks on these being non null so why
> are they needed?  If there is a good reason for this then
> a comment would be useful.

I'm not sure anymore either. Perhaps this was left over from an earlier
version, or it was something I thought I would need that ended up getting
removed.

I'll test without this hunk and remove it if I can.

Thanks for the review,
Ira

[snip]