Re: [PATCH] drm/amdgpu: fix return ramdom value when multiple threads read registers via mes.

Christian König <christian.koenig@xxxxxxx> · Mon, 4 Nov 2024 11:22:09 +0100



    Am 04.11.24 um 07:43 schrieb Li, Chong(Alan):

    
        [AMD Official Use Only - AMD Internal Distribution Only]

      
          The currect code use the address
            "adev->mes.read_val_ptr" to store the value read from
            register via mes.
          So when multiple threads read
            register,
          multiple threads have to share the one
            address, and overwrite the value each other.
        
      
    Good catch.

    
          Assign an address by
            "amdgpu_device_wb_get" to store register value.
          each thread will has an address to
            store register value.
           
          Signed-off-by: Chong Li <chongli2@xxxxxxx>
          ---
          drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
            | 30 +++++++++++-------------- 
            drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 ---
          2 files changed, 13 insertions(+), 20
            deletions(-)
           
          diff --git
            a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
          index 83d0f731fb65..d74e3507e155
            100644
          ---
            a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
          +++
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
          @@ -189,17 +189,6 @@ int
            amdgpu_mes_init(struct amdgpu_device *adev)
                                            
            (uint64_t
            *)&adev->wb.wb[adev->mes.query_status_fence_offs[i]];
                     }
          
          -           r =
            amdgpu_device_wb_get(adev, &adev->mes.read_val_offs);
          -           if (r) {
          -                      
            dev_err(adev->dev,
          -                                  
            "(%d) read_val_offs alloc failed\n", r);
          -                       goto error;
          -           }
          -          
            adev->mes.read_val_gpu_addr =
          -                      
            adev->wb.gpu_addr + (adev->mes.read_val_offs * 4);
          -           adev->mes.read_val_ptr
            =
          -                       (uint32_t
            *)&adev->wb.wb[adev->mes.read_val_offs];
          -
                     r =
            amdgpu_mes_doorbell_init(adev);
                     if (r)
                                 goto error;
          @@ -220,8 +209,6 @@ int
            amdgpu_mes_init(struct amdgpu_device *adev)
                                            
            amdgpu_device_wb_free(adev,
                                                        
                  adev->mes.query_status_fence_offs[i]);
                     }
          -           if
            (adev->mes.read_val_ptr)
          -                      
            amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
          
                     
            idr_destroy(&adev->mes.pasid_idr);
                    
            idr_destroy(&adev->mes.gang_id_idr);
          @@ -246,8 +233,6 @@ void
            amdgpu_mes_fini(struct amdgpu_device *adev)
                                            
            amdgpu_device_wb_free(adev,
                                                        
                  adev->mes.query_status_fence_offs[i]);
                     }
          -           if
            (adev->mes.read_val_ptr)
          -                      
            amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
          
                     
            amdgpu_mes_doorbell_free(adev);
          
          @@ -918,10 +903,19 @@ uint32_t
            amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)  {
                     struct mes_misc_op_input
            op_input;
                     int r, val = 0;
          +          uint32_t addr_offset = 0;
          +          uint64_t read_val_gpu_addr
            = 0;
          +          uint32_t *read_val_ptr =
            NULL;
          
          +          if
            (amdgpu_device_wb_get(adev, &addr_offset)) {
          +                     
            DRM_ERROR("critical bug! too many mes readers\n");
          +                      goto error;
          +          }
          +          read_val_gpu_addr =
            adev->wb.gpu_addr + (addr_offset * 4);
          +          read_val_ptr = (uint32_t
            *)&adev->wb.wb[addr_offset];
        
      
    Please run checkpatch.pl on the patch since this code here clearly
    has style issues.

    
    Apart from that looks good to me, the only potential concern I can
    see is if we have enough writeback memory to cover all concurrent
    threads.

    
    Regards,

    Christian.

    
                     op_input.op =
            MES_MISC_OP_READ_REG;
                    
            op_input.read_reg.reg_offset = reg;
          -          
            op_input.read_reg.buffer_addr =
            adev->mes.read_val_gpu_addr;
          +         
            op_input.read_reg.buffer_addr = read_val_gpu_addr;
          
                      if
            (!adev->mes.funcs->misc_op) {
                                 DRM_ERROR("mes
            rreg is not supported!\n"); @@ -932,9 +926,11 @@ uint32_t
            amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
                     if (r)
                                
            DRM_ERROR("failed to read reg (0x%x)\n", reg);
                     else
          -                       val =
            *(adev->mes.read_val_ptr);
          +                      val =
            *(read_val_ptr);
          
           error:
          +          if (addr_offset)
          +                     
            amdgpu_device_wb_free(adev, addr_offset);
                     return val;
          }
          
          diff --git
            a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
          index 45e3508f0f8e..83f45bb48427
            100644
          ---
            a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
          +++
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
          @@ -119,9 +119,6 @@ struct amdgpu_mes
            {
                    
            uint32_t                                  
            query_status_fence_offs[AMDGPU_MAX_MES_PIPES];
                    
            uint64_t                                  
            query_status_fence_gpu_addr[AMDGPU_MAX_MES_PIPES];
                    
            uint64_t                                  
            *query_status_fence_ptr[AMDGPU_MAX_MES_PIPES];
          -          
            uint32_t                        read_val_offs;
          -          
            uint64_t                                  
            read_val_gpu_addr;
          -          
            uint32_t                                   *read_val_ptr;
          
                     
            uint32_t                                   saved_flags;
          
          --
          2.34.1
           
           
              From:
                  Koenig, Christian <Christian.Koenig@xxxxxxx>
                  

                  Sent: Thursday, October 31, 2024 6:04 PM

                  To: Li, Chong(Alan) <Chong.Li@xxxxxxx>

                  Cc: cao, lin <lin.cao@xxxxxxx>; Yin,
                  ZhenGuo (Chris) <ZhenGuo.Yin@xxxxxxx>; Zhang,
                  Tiantian (Celine) <Tiantian.Zhang@xxxxxxx>;
                  Andjelkovic, Dejan <Dejan.Andjelkovic@xxxxxxx>;
                  amd-gfx@xxxxxxxxxxxxxxxxxxxxx

                  Subject: Re: [PATCH] drm/amd/amdgpu: change the
                  flush gpu tlb mode to sync mode.
            
          
          Hi Chong,

            
            Am 31.10.24 um 10:54 schrieb Li, Chong(Alan):

            
            [AMD
                Official Use Only - AMD Internal Distribution Only]
             
            
              Hi,
                  Christian.
               
               
              Share
                  the process of the page fault issue in rocblas
                  benchmark.
            
          
            finally some progress here. Thanks for the update.

            
              Find
                  when there are multithreads read register
                  “regIH_VMID_0_LUT” to get pasid,
                
              This
                  register will return error pasid value randomly,
                  sometimes is 0, sometimes is 32768, (the real value is
                  32770).
              After
                  check the invalid pasid, code will “continue” and not
                  flush the gpu tlb.
            
          
            That is really disturbing, concurrent register access is
            mandatory to work correctly.

            
            Not only the TLB flush but many other operations depend on
            stuff like that as well.

            
              That’s
                  why the page fault accours.
               
              After
                  add the lock, the register not return invalid value,
                  and the rocblas benchmark passed.
               
              You
                  have submit a patch "implement TLB flush fence", in
                  this patch you create a kernel thread to flush gpu
                  tlb.
              And in
                  main thread the function “svm_range_map_to_gpus” will
                  call function “kfd_flush_tlb” and then flush gpu tlb
                  as well.
              Means
                  that both the two threads will call function
                  “gmc_v11_0_flush_gpu_tlb_pasid”.
              So
                  after you merge your patch, the page fault issue
                  accours.
               
              My
                  first patch change flush gpu tlb to sync mode,
                
              means
                  the one thread flush the gpu tlb twice, so my first
                  patch passed the rocblas benchmark.
            
          
            I will have to reject such patches, you need to find the
            underlying problem and not mitigate the symptoms.

            
              I
                  already submit an email to firmware team to ask why
                  the register will return wrong value.
               
              But if
                  the firmware team not able to solve this issue, or
                  need a long time to solve this issue,
              I will
                  submit the patch like below to do the workaround.
            
          
            Well that basically means a complete stop for any
            deliverable.

            
            The driver stack simply won't work correctly when register
            reads return random values like that.

            
            Regards,

            Christian.

            
              Thanks,
              Chong.
               
               
                  From: Li,
                      Chong(Alan)
                      

                      Sent: Friday, October 25, 2024 2:46 PM

                      To: Koenig, Christian <Christian.Koenig@xxxxxxx>;
                      Andjelkovic, Dejan
                      <Dejan.Andjelkovic@xxxxxxx>

                      Cc: cao, lin <lin.cao@xxxxxxx>;
                      Yin, ZhenGuo (Chris)
                      <ZhenGuo.Yin@xxxxxxx>;
                      Zhang, Tiantian (Celine)
                      <Tiantian.Zhang@xxxxxxx>;
                      
                        amd-gfx@xxxxxxxxxxxxxxxxxxxxx

                      Subject: RE: [PATCH] drm/amd/amdgpu: change
                      the flush gpu tlb mode to sync mode.
                
              
              Hi,
                  Christian.
               
              The
                  size of log file so large, can’t paste in the Email.
               
              I copy
                  the log file in directory “\\ark\incoming\chong\log”, the
                  log file name is “kern.log”.
               
              Can
                  you access this directory ?
               
               
              Thanks,
              Chong.
               
               
                  From:
                      Koenig, Christian <Christian.Koenig@xxxxxxx>
                      

                      Sent: Thursday, October 24, 2024 7:22 PM

                      To: Li, Chong(Alan) <Chong.Li@xxxxxxx>;
                      Andjelkovic, Dejan <Dejan.Andjelkovic@xxxxxxx>

                      Cc: cao, lin <lin.cao@xxxxxxx>;
                      Yin, ZhenGuo (Chris) <ZhenGuo.Yin@xxxxxxx>;
                      Zhang, Tiantian (Celine) <Tiantian.Zhang@xxxxxxx>;
                      Raina, Yera <Yera.Raina@xxxxxxx>

                      Subject: Re: [PATCH] drm/amd/amdgpu: change
                      the flush gpu tlb mode to sync mode.
                
              
              Do you
                have the full log as text file? As image it's pretty
                much useless.

                
                Regards,

                Christian.
              
                Am 24.10.24 um 09:41 schrieb Li,
                  Chong(Alan):
              
              
                [AMD
                    Official Use Only - AMD Internal Distribution Only]
                 
                
                  Hi,
                    Christian.
                   
                  We can
                      see the dmesg log,
                  After
                      address “7ef90be00” already update the ptes, page
                      fault still happen.
                   
                   
                  Thanks,
                  Chong.
                   
                  
                      From:
                          Koenig, Christian
                        <Christian.Koenig@xxxxxxx>
                          

                          Sent: Wednesday, October 23, 2024 5:26
                          PM

                          To: Li, Chong(Alan) <Chong.Li@xxxxxxx>;
                          Andjelkovic, Dejan
                        <Dejan.Andjelkovic@xxxxxxx>

                          Cc: cao, lin <lin.cao@xxxxxxx>;
                          Yin, ZhenGuo (Chris)
                        <ZhenGuo.Yin@xxxxxxx>;
                          Zhang, Tiantian (Celine)
                        <Tiantian.Zhang@xxxxxxx>;
                          Raina, Yera
                        <Yera.Raina@xxxxxxx>

                          Subject: Re: [PATCH] drm/amd/amdgpu:
                          change the flush gpu tlb mode to sync mode.
                    
                  
                  Hi
                    Chong,

                    
                    oh that could indeed be.

                    
                    I suggest to add a trace point for the page fault so
                    that we can guarantee that we use the same time
                    basis for both events.

                    
                    That should make it trivial to compare them.

                    
                    Regards,

                    Christian.
                  
                    Am 23.10.24 um 10:17 schrieb
                      Li, Chong(Alan):
                  
                  
                    [AMD
                        Official Use Only - AMD Internal Distribution
                        Only]
                     
                    
                      Hi,
                          Christian.
                       
                      I add a log in kernel,
                            and prove the timestamp in tracing log is
                            slower than dmesg log,
                          
                      so we can’t give a
                            conclusion that the issue in rocm.
                       
                       
                      ------------------------
                          the information I sync with Andjelkovic, Dejan
                          ----------------------------------------
                      dmesg
                          shows that the page fault happens address
                          “0x000072e5f4401000” at time “6587.772178”,
                       
                      
                      tracing
                          log shows that the function
                          “amdgpu_vm_update_ptes” be called at time
                          “6587.790869”,
                      
                      ------------------------
                          the information I sync with Andjelkovic, Dejan
                          ----------------------------------------
                       
                       
                      From
                          the log time stamp, you give a conclusion that
                          “The test
                          tries to access memory before it is probably
                          mapped and that is provable by looking into
                          the tracelogs.”.
                       
                      But
                          after I review the code, the function
                          “amdgpu_vm_ptes_update” be called in function
                          “svm_range_set_attr”,
                        
                       
                      So,
                          after this log in above dmesg print “[
                          6587.772136] amdgpu: pasid 0x8002 svms
                          0x000000008b03ff39 [0x72e5f4400 0x72e5fc3ff]
                          done, r=0”,
                        
                      the
                          function “svm_range_set_attr” will leave, in
                          that time “amdgpu_vm_ptes_update” is already
                          be called, the timestamp is not reasonable.
                       
                      I
                          think maybe the timestamp in tracing log has
                          some delay, and I add a line of log in kernel
                          to verify my guess,
                       
                      
                      The
                          below is the result:
                      tracing
                          log shows the address “ffffffc00” at time
                          “227.298607”,
                      dmesg
                          log print the address “ffffffc00” at time
                          “226.756137”.
                       
                       
                      traing
                          log:
                      
                       
                      dmesg
                          log:
                      
                       
                      Thanks,
                      Chong.
                       
                      
                          From: Li,
                              Chong(Alan)
                              

                              Sent: Monday, October 21, 2024 6:38
                              PM

                              To: Koenig, Christian <Christian.Koenig@xxxxxxx>;
                              Raina, Yera
                            <Yera.Raina@xxxxxxx>;
                              Andjelkovic, Dejan
                            <Dejan.Andjelkovic@xxxxxxx>

                              Cc: cao, lin <lin.cao@xxxxxxx>;
                              Yin, ZhenGuo (Chris)
                            <ZhenGuo.Yin@xxxxxxx>;
                              Zhang, Tiantian (Celine)
                            <Tiantian.Zhang@xxxxxxx>

                              Subject: RE: [PATCH]
                              drm/amd/amdgpu: change the flush gpu tlb
                              mode to sync mode.
                        
                      
                      Hi,
                        Christian.
                      Thanks for your reply,
                      And do you have any advice 
                          about this issue?
                       
                       
                      Hi,
                          Raina, Year.

                          Share I assign this ticket SWDEV-459983 to
                          rocm team?
                       
                       
                      Thanks,
                      Chong.
                       
                      
                          From:
                              Koenig, Christian <Christian.Koenig@xxxxxxx>
                              

                              Sent: Monday, October 21, 2024 6:08
                              PM

                              To: Li, Chong(Alan) <Chong.Li@xxxxxxx>;
                              Raina, Yera <Yera.Raina@xxxxxxx>

                              Cc: cao, lin <lin.cao@xxxxxxx>;
                            amd-gfx@xxxxxxxxxxxxxxxxxxxxx

                              Subject: Re: [PATCH]
                              drm/amd/amdgpu: change the flush gpu tlb
                              mode to sync mode.
                        
                      
                      Hi
                        Chong,

                        
                        Andjelkovic just shared a bunch of traces from
                        rocm on teams with me which I analyzed.

                        
                        When you know what you look for it's actually
                        pretty obvious what's going on. Just look at the
                        timestamp of the fault and compare that with the
                        timestamp of the operation mapping something at
                        the given address.

                        
                        When mapping an address happens only after
                        accessing an address then there is clearly
                        something wrong in the code which coordinates
                        this and that is the ROCm stress test tool in
                        this case.

                        
                        Regards,

                        Christian.
                      
                        Am 21.10.24 um 11:02
                          schrieb Li, Chong(Alan):
                      
                      
                        [AMD
                            Official Use Only - AMD Internal
                            Distribution Only]
                         
                        
                          Hi,
                              Christian, 
                               Raina,
                              Yera.
                           
                          If
                              this issue in rocm, I need assign my
                              ticket
                            SWDEV-459983 to
                              rocm team.
                           
                          Is
                              there anything to share with the rocm pm?
                          Such
                              as the Email or chat history or the ticket
                              you talk with
                            Andjelkovic.
                           
                          Thanks,
                          Chong.
                           
                          
                              From:
                                  Koenig, Christian
                                <Christian.Koenig@xxxxxxx>
                                  

                                  Sent: Monday, October 21, 2024
                                  4:00 PM

                                  To: Li, Chong(Alan) <Chong.Li@xxxxxxx>;
                                amd-gfx@xxxxxxxxxxxxxxxxxxxxx

                                  Cc: cao, lin <lin.cao@xxxxxxx>

                                  Subject: Re: [PATCH]
                                  drm/amd/amdgpu: change the flush gpu
                                  tlb mode to sync mode.
                            
                          
                          Am 21.10.24 um
                            07:56 schrieb Chong Li:

                            
                            change the gpu tlb flush mode to sync mode to
                            solve the issue in the rocm stress test.
                          
                          
                            And again complete NAK to this.

                            
                            I've already proven together with Andjelkovic that the
                              problem is that the rocm stress test is
                              broken.

                            
                            The test tries to
                              access memory before it is probably mapped
                              and that is provable by looking into the
                              tracelogs.

                            
                            Regards,

                            Christian. 

                            
                            Signed-off-by: Chong Li <chongli2@xxxxxxx>
                            ---
                             drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c | 4 ++--
                             1 file changed, 2 insertions(+), 2 deletions(-)
                             
                            diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
                            index 51cddfa3f1e8..4d9ff7b31618 100644
                            --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
                            +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
                            @@ -98,7 +98,6 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm
                              f->adev = adev;
                              f->dependency = *fence;
                              f->pasid = vm->pasid;
                            - INIT_WORK(&f->work, amdgpu_tlb_fence_work);
                              spin_lock_init(&f->lock);
                             
                              dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
                            @@ -106,7 +105,8 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm
                             
                              /* TODO: We probably need a separate wq here */
                              dma_fence_get(&f->base);
                            - schedule_work(&f->work);
                             
                              *fence = &f->base;
                            +
                            + amdgpu_tlb_fence_work(&f->work);
                             }