On 2021-10-06 4:55 p.m., Felix Kuehling
wrote:
Am 2021-10-06 um 10:32 a.m. schrieb Philip Yang:No function change, use pr_debug_ratelimited to avoid per page debug message overflowing dmesg buf and console log. use dev_err to show error message from unexpected situation, to provide clue to help debug without enabling dynamic debug log.AFAIK, dev_err does not print function and line-number information. So you probably need to provide a little more context in these messages. I think this could be done with a #define pr_fmt(fmt) "kfd_migrate: " fmt in kfd_migrate.c. I'll make a few more specific comments inline.
I will add below to output function name in error message after include head files, because dev_err uses dev_fmt and amdgpu.h overwrite dev_fmt. pr_debug can use dynamic debug control +pfl to output function name and line number, don't need define pr_fmt
#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_migrate: %s: " fmt, __func__
agree, change to "fail %d to alloc vram\n"Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 34 ++++++++++++------------ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 12 ++++----- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f53e17a94ad8..069422337cf7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -151,14 +151,14 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys, gart_d = svm_migrate_direct_mapping_addr(adev, *vram); } if (r) { - pr_debug("failed %d to create gart mapping\n", r); + dev_err(adev->dev, "fail %d create gart mapping\n", r); goto out_unlock; } r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE, NULL, &next, false, true, false); if (r) { - pr_debug("failed %d to copy memory\n", r); + dev_err(adev->dev, "fail %d to copy memory\n", r); goto out_unlock; } @@ -285,7 +285,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, r = svm_range_vram_node_new(adev, prange, true); if (r) { - pr_debug("failed %d get 0x%llx pages from vram\n", r, npages); + dev_err(adev->dev, "fail %d get %lld vram pages\n", r, npages);This message is misleading. svm_range_vram_node_new doesn't care about npages. It allocates memory for the whole range or reuses an existing allocation. So I'd drop the npages from the message.
done, keep pr_debuggoto out; } @@ -305,7 +305,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, DMA_TO_DEVICE); r = dma_mapping_error(dev, src[i]); if (r) { - pr_debug("failed %d dma_map_page\n", r); + dev_err(adev->dev, "fail %d dma_map_page\n", r); goto out_free_vram_pages; } } else { @@ -325,8 +325,8 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, continue; } - pr_debug("dma mapping src to 0x%llx, page_to_pfn 0x%lx\n", - src[i] >> PAGE_SHIFT, page_to_pfn(spage)); + pr_debug_ratelimited("dma mapping src to 0x%llx, pfn 0x%lx\n", + src[i] >> PAGE_SHIFT, page_to_pfn(spage)); if (j >= (cursor.size >> PAGE_SHIFT) - 1 && i < npages - 1) { r = svm_migrate_copy_memory_gart(adev, src + i - j, @@ -347,7 +347,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, out_free_vram_pages: if (r) { - pr_debug("failed %d to copy memory to vram\n", r); + dev_err(adev->dev, "fail %d to copy memory to vram\n", r);I think you only get here if svm_migrate_copy_memory_gart failed. That function already prints its own error messages, so this probably doesn't need to be a dev_err.
agree, keep pr_debugwhile (i--) { svm_migrate_put_vram_page(adev, dst[i]); migrate->dst[i] = 0; @@ -405,8 +405,8 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, r = migrate_vma_setup(&migrate); if (r) { - pr_debug("failed %d prepare migrate svms 0x%p [0x%lx 0x%lx]\n", - r, prange->svms, prange->start, prange->last); + dev_err(adev->dev, "fail %d vma setup 0x%p [0x%lx 0x%lx]\n", + r, prange->svms, prange->start, prange->last); goto out_free; } if (migrate.cpages != npages) { @@ -506,7 +506,7 @@ static void svm_migrate_page_free(struct page *page) struct svm_range_bo *svm_bo = page->zone_device_data; if (svm_bo) { - pr_debug("svm_bo ref left: %d\n", kref_read(&svm_bo->kref)); + pr_debug_ratelimited("ref: %d\n", kref_read(&svm_bo->kref)); svm_range_bo_unref(svm_bo); } } @@ -563,8 +563,8 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, dpage = svm_migrate_get_sys_page(migrate->vma, addr); if (!dpage) { - pr_debug("failed get page svms 0x%p [0x%lx 0x%lx]\n", - prange->svms, prange->start, prange->last); + dev_err(adev->dev, "fail get page 0x%p [0x%lx 0x%lx]\n", + prange->svms, prange->start, prange->last);The prange->svms pointer (or its hash) is pretty meaningless in an error message. It's OK in a debug message to correlate with other messages. But in an error message that's always enabled, I'd prefer a more readable ID. I think it basically stands for the process because svms is part of the kfd_process structure. prange->start/end are also not really meaningful for this failure. The page allocation failure doesn't depend on the prange start and end addresses. It's basically just an OOM error. I think Linux will be pretty noisy about OOM errors, so we probably don't need to add more messages about that here.
r = -ENOMEM; goto out_oom; } @@ -572,12 +572,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_FROM_DEVICE); r = dma_mapping_error(dev, dst[i]); if (r) { - pr_debug("failed %d dma_map_page\n", r); + dev_err(adev->dev, "fail %d dma_map_page\n", r); goto out_oom; } - pr_debug("dma mapping dst to 0x%llx, page_to_pfn 0x%lx\n", - dst[i] >> PAGE_SHIFT, page_to_pfn(dpage)); + pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n", + dst[i] >> PAGE_SHIFT, page_to_pfn(dpage)); migrate->dst[i] = migrate_pfn(page_to_pfn(dpage)); migrate->dst[i] |= MIGRATE_PFN_LOCKED; @@ -631,8 +631,8 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, r = migrate_vma_setup(&migrate); if (r) { - pr_debug("failed %d prepare migrate svms 0x%p [0x%lx 0x%lx]\n", - r, prange->svms, prange->start, prange->last); + dev_err(adev->dev, "fail %d vma setup 0x%p [0x%lx 0x%lx]\n", + r, prange->svms, prange->start, prange->last); goto out_free; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 7c42a0d4e0de..7f0743fcfcc3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -158,17 +158,17 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, bo_adev->vm_manager.vram_base_offset - bo_adev->kfd.dev->pgmap.range.start; addr[i] |= SVM_RANGE_VRAM_DOMAIN; - pr_debug("vram address detected: 0x%llx\n", addr[i]); + pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]); continue; } addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); r = dma_mapping_error(dev, addr[i]); if (r) { - pr_debug("failed %d dma_map_page\n", r); + pr_err("failed %d dma_map_page\n", r);Why not dev_err?
Change to dev_err in v2 patch.
Thanks,
Philip
Regards, Felixreturn r; } - pr_debug("dma mapping 0x%llx for page addr 0x%lx\n", - addr[i] >> PAGE_SHIFT, page_to_pfn(page)); + pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n", + addr[i] >> PAGE_SHIFT, page_to_pfn(page)); } return 0; } @@ -217,7 +217,7 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, for (i = offset; i < offset + npages; i++) { if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) continue; - pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); + pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); dma_addr[i] = 0; } @@ -1459,7 +1459,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, /* This should never happen. actual_loc gets set by * svm_migrate_ram_to_vram after allocating a BO. */ - WARN(1, "VRAM BO missing during validation\n"); + WARN_ONCE(1, "VRAM BO missing during validation\n"); return -EINVAL; }