On 2021-04-28 5:00 a.m., Christian
König wrote:
Am 28.04.21 um 09:53 schrieb Felix Kuehling:
Am 2021-04-28 um 2:54 a.m. schrieb Christian König:
Am 27.04.21 um 20:21 schrieb Felix Kuehling:I find this a mix of multiplication, division and bit-shift more
On 2021-04-27 10:51 a.m., Philip Yang wrote:Maybe changing this so that we have "addr * ((1 << 16) /
Add interface to remove address from fault filter ring by resettingThis comment is misleading. The PASID is not 4-bit. It's 16-bit. But
fault ring entry key, then future vm fault on the address will be
processed to recover.
Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
to protect fault ring access by interrupt handler and interrupt
deferred
work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
with 8-bit fault->next, it is enough for 48bit IH timestamp.
Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48
++++++++++++++++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 ++--
2 files changed, 48 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index c39ed9eb0987..a2e81e913abb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
amdgpu_device *adev, struct amdgpu_gmc *mc)
mc->agp_size >> 20, mc->agp_start, mc->agp_end);
}
+/**
+ * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
+ *
+ * @addr: 48bit physical address
+ * @pasid: 4 bit
shifting the address by 4 bit is sufficient because the address is
page-aligned, which means the low 12 bits are 0. So this would be
more accurate:
@addr: 48 bit physical address, page aligned (36 significant bits)
@pasid: 16 bit process address space identifier
With that fixed, the patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
+ */
+static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t
pasid)
+{
+ return addr << 4 | pasid;
+}
AMDGPU_PAGE_SIZE) | pasid" would help to better document that?
confusing. How about "addr << (16 - AMDGPU_GPU_PAGE_SHIFT) | pasid"?
Yeah, that is even better.
I have pushed the patch yesterday, didn't pick this, should wait longer before push in future.
Thanks,
Philip
Regards,
Christian.
Regards,
Felix
Christian.
+_______________________________________________
/**
* amdgpu_gmc_filter_faults - filter VM faults
*
@@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
uint16_t pasid, uint64_t timestamp)
{
struct amdgpu_gmc *gmc = &adev->gmc;
-
- uint64_t stamp, key = addr << 4 | pasid;
+ uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
struct amdgpu_gmc_fault *fault;
uint32_t hash;
@@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
while (fault->timestamp >= stamp) {
uint64_t tmp;
- if (fault->key == key)
+ if (atomic64_read(&fault->key) == key)
return true;
tmp = fault->timestamp;
@@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
/* Add the fault to the ring */
fault = &gmc->fault_ring[gmc->last_fault];
- fault->key = key;
+ atomic64_set(&fault->key, key);
fault->timestamp = timestamp;
/* And update the hash */
@@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
return false;
}
+/**
+ * amdgpu_gmc_filter_faults_remove - remove address from VM faults
filter
+ *
+ * @adev: amdgpu device structure
+ * @addr: address of the VM fault
+ * @pasid: PASID of the process causing the fault
+ *
+ * Remove the address from fault filter, then future vm fault on
this address
+ * will pass to retry fault handler to recover.
+ */
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
uint64_t addr,
+ uint16_t pasid)
+{
+ struct amdgpu_gmc *gmc = &adev->gmc;
+ uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
+ struct amdgpu_gmc_fault *fault;
+ uint32_t hash;
+ uint64_t tmp;
+
+ hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
+ fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
+ do {
+ if (atomic64_cmpxchg(&fault->key, key, 0) == key)
+ break;
+
+ tmp = fault->timestamp;
+ fault = &gmc->fault_ring[fault->next];
+ } while (fault->timestamp < tmp);
+}
+
int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
{
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 9d11c02a3938..6aa1d52d3aee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -66,9 +66,9 @@ struct firmware;
* GMC page fault information
*/
struct amdgpu_gmc_fault {
- uint64_t timestamp;
+ uint64_t timestamp:48;
uint64_t next:AMDGPU_GMC_FAULT_RING_ORDER;
- uint64_t key:52;
+ atomic64_t key;
};
/*
@@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct
amdgpu_device *adev,
struct amdgpu_gmc *mc);
bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t
addr,
uint16_t pasid, uint64_t timestamp);
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
uint64_t addr,
+ uint16_t pasid);
int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://nam11.safelinks.protection.outlook.com/?url="">
_______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx