Re: [PATCH v4 08/11] cxl/region: Calculate performance data for a region

Dave Jiang <dave.jiang@xxxxxxxxx> · Wed, 31 Jan 2024 08:56:41 -0700

On 1/30/24 19:22, Wonjae Lee wrote:
> On Fri, Jan 19, 2024 at 10:23:52AM -0700, Dave Jiang wrote:
>> Calculate and store the performance data for a CXL region. Find the worst
>> read and write latency for all the included ranges from each of the devices
>> that attributes to the region and designate that as the latency data. Sum
>> all the read and write bandwidth data for each of the device region and
>> that is the total bandwidth for the region.
>>
>> The perf list is expected to be constructed before the endpoint decoders
>> are registered and thus there should be no early reading of the entries
>> from the region assemble action. The calling of the region qos calculate
>> function is under the protection of cxl_dpa_rwsem and will ensure that
>> all DPA associated work has completed.
>>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>
>> Signed-off-by: Dave Jiang <dave.jiang@xxxxxxxxx>
>> ---
>> v4:
>> - Calculate access classes 0 and 1 by retrieving host bridge coords
>> - Add lockdep assert for cxl_dpa_rwsem (Dan)
>> - Clarify that HMAT code is HMEM_REPORTING code. (Dan)
>> ---
>>   drivers/cxl/core/cdat.c    74 +++++++++++++++++++++++++++++++++++++++++++++
>>   drivers/cxl/core/region.c     2 +
>>   drivers/cxl/cxl.h            4 ++
>>   3 files changed, 80 insertions(+)
>>
>> diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
>> index 6e3998723aaa..7acb5837afad 100644
>> --- a/drivers/cxl/core/cdat.c
>> +++ b/drivers/cxl/core/cdat.c
>> @@ -8,6 +8,7 @@
>>   #include "cxlpci.h"
>>   #include "cxlmem.h"
>>   #include "cxl.h"
>> +#include "core.h"
>>
>>   struct dsmas_entry {
>>   struct range dpa_range;
>> @@ -546,3 +547,76 @@ void cxl_coordinates_combine(struct access_coordinate *out,
>>   EXPORT_SYMBOL_NS_GPL(cxl_coordinates_combine, CXL);
>>
>>   MODULE_IMPORT_NS(CXL);
>> +
>> +void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>> +                struct cxl_endpoint_decoder *cxled)
>> +{
>> + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
>> + struct cxl_port *port = cxlmd->endpoint;
>> + struct cxl_dev_state *cxlds = cxlmd->cxlds;
>> + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
>> + struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX];
>> + struct access_coordinate coord;
>> + struct range dpa = {
>> +        .start = cxled->dpa_res->start,
>> +        .end = cxled->dpa_res->end,
>> + };
>> + struct list_head *perf_list;
>> + struct cxl_dpa_perf *perf;
>> + bool found = false;
>> + int rc;
>> +
>> + switch (cxlr->mode) {
>> + case CXL_DECODER_RAM:
>> +    perf_list = &mds->ram_perf_list;
>> +    break;
>> + case CXL_DECODER_PMEM:
>> +    perf_list = &mds->pmem_perf_list;
>> +    break;
>> + default:
>> +    return;
>> + }
>> +
>> + lockdep_assert_held(&cxl_dpa_rwsem);
>> +
>> + list_for_each_entry(perf, perf_list, list) {
>> +    if (range_contains(&perf->dpa_range, &dpa)) {
>> +        found = true;
>> +        break;
>> +    }
>> + }
>> +
>> + if (!found)
>> +    return;
>> +
>> + rc = cxl_hb_get_perf_coordinates(port, hb_coord);
>> + if (rc)  {
>> +    dev_dbg(&port->dev, "Failed to retrieve hb perf coordinates.\n");
>> +    return;
>> + }
>> +
>> + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
>> +    /* Pickup the host bridge coords */
>> +    cxl_coordinates_combine(&coord, &hb_coord[i], &perf->coord);
>> +
>> +    /* Get total bandwidth and the worst latency for the cxl region */
>> +    cxlr->coord[i].read_latency = max_t(unsigned int,
>> +                        cxlr->coord[i].read_latency,
>> +                        coord.read_latency);
>> +    cxlr->coord[i].write_latency = max_t(unsigned int,
>> +                          cxlr->coord[i].write_latency,
>> +                          coord.write_latency);
>> +    cxlr->coord[i].read_bandwidth += coord.read_bandwidth;
>> +    cxlr->coord[i].write_bandwidth += coord.write_bandwidth;
>> +
>> +    /*
>> +      * Convert latency to nanosec from picosec to be consistent
>> +      * with the resulting latency coordinates computed by the
>> +      * HMAT_REPORTING code.
>> +      */
>> +    cxlr->coord[i].read_latency =
>> +        DIV_ROUND_UP(cxlr->coord[i].read_latency, 1000);
>> +    cxlr->coord[i].write_latency =
>> +        DIV_ROUND_UP(cxlr->coord[i].write_latency, 1000);
> 
> Hello,
> 
> I ran into a bit of confusion and have a question while validating CDAT
> behaviour with physical CXL devices. (I'm not sure if this is the right
> thread to ask this question, sorry if it isn't.)
> 
> IIUC, the raw data of latency is in picosec, but the comments on the
> struct access_coordinate say that the latency units are in nanosec:
>  * @read_latency:   Read latency in nanoseconds
>  * @write_latency:  Write latency in nanoseconds
> 
> This was a bit confusing at first, as the raw data of latency are in
> ps, and the structure that stores the latency expects units of ns.

Right. The numbers stored with the HMAT_REPORTING code and eventually NUMA nodes are normalized to nanoseconds, even though the raw data is in picoseconds. For CXL, I left the CDAT and computed numbers as raw numbers (picoseconds) until the final step when I calculate the latency for the entire region. And then it gets converted to nanoseconds in order to write back to the memory_target for HMAT_REPORTING. The numbers we retrieve from HMAT_REPORTING for the generic target is already in nanoseconds.

> 
> I saw that you have already had a discussion with Brice about the
> pico/nanosecond unit conversion. My question is, are there any plans to
> store latency number of cxl port in nanoseconds or change the comments
> of coords structure?

The numbers for the coords structure will remain in nanoseconds as it always have been. 

> 
> Thanks,
> Wonjae
> 
>> + }
>> +}
>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>> index 57a5901d5a60..7f19b533c5ae 100644
>> --- a/drivers/cxl/core/region.c
>> +++ b/drivers/cxl/core/region.c
>> @@ -1722,6 +1722,8 @@ static int cxl_region_attach(struct cxl_region *cxlr,
>>       return -EINVAL;
>>   }
>>
>> + cxl_region_perf_data_calculate(cxlr, cxled);
>> +
>>   if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
>>       int i;
>>
>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index 80e6bd294e18..f6637fa33113 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -519,6 +519,7 @@ struct cxl_region_params {
>>   * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
>>   * @flags: Region state flags
>>   * @params: active + config params for the region
>> + * @coord: QoS access coordinates for the region
>>   */
>>   struct cxl_region {
>>   struct device dev;
>> @@ -529,6 +530,7 @@ struct cxl_region {
>>   struct cxl_pmem_region *cxlr_pmem;
>>   unsigned long flags;
>>   struct cxl_region_params params;
>> + struct access_coordinate coord[ACCESS_COORDINATE_MAX];
>>   };
>>
>>   struct cxl_nvdimm_bridge {
>> @@ -880,6 +882,8 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port,
>>                     struct access_coordinate *coord);
>>   int cxl_hb_get_perf_coordinates(struct cxl_port *port,
>>               struct access_coordinate *coord);
>> +void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
>> +                struct cxl_endpoint_decoder *cxled);
>>
>>   void cxl_coordinates_combine(struct access_coordinate *out,
>>                 struct access_coordinate *c1,
>>
>>
>>