During stress testing we found that, with some Vulkan applications, the fence information displayed in the recently added fdinfo was not properly calculated. Two issues were discovered: (1) A missing dma_fence_put in the loop that calculates the usage ratios when the fence is being ignored. (2) The approximation for the ratio calculation is not accurate when accounting for non-active contexts. The fix is to ignore those contexts if they have activity ratios lower than 0.01%. Also attached is a script demonstrating how the fdinfo can be used to monitor GPU usage of running processes. #!/usr/bin/env python3 # # Copyright (C) 2021 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining # a copy of # this software and associated documentation files (the "Software"), to # deal in # the Software without restriction, including without limitation the # rights to # use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of # the Software, and to permit persons to whom the Software is furnished # to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be # included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, # WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR # IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE.
#
from tokenize import tokenize
import sys
import os
import pwd

# Per-GPU accumulators, keyed by the "pdev" string read from fdinfo.
# process_pid() adds every displayed process to them; main() prints them.
total_mem = dict()
total_usage = dict()

# Engine key prefixes recognised in an fdinfo file (e.g. "gfx0:", "compute1:").
_ENGINES = ("gfx", "dma", "dec", "enc", "compute")
# Number of per-engine ring slots kept for each client.
_RINGS_PER_ENGINE = 8

# Inner width of each table column:
# pid, name, user, gpu bdf, fb usage, ring usage.
_COLUMN_WIDTHS = (5, 16, 10, 12, 11, 39)
# Order (and header labels) of the usage figures inside the last column.
_RING_COLUMNS = (
    ("gfx", "gfx"),
    ("compute", "comp"),
    ("dma", "dma"),
    ("enc", "enc"),
    ("dec", "dec"),
)


def _border(fill):
    """Return a horizontal table rule drawn with the character *fill*."""
    return "+" + "+".join(fill * (w + 2) for w in _COLUMN_WIDTHS) + "+"


def _format_cells(cells):
    """Return one table row with every cell padded to its column width."""
    padded = (str(c).ljust(w) for c, w in zip(cells, _COLUMN_WIDTHS))
    return "| " + " | ".join(padded) + " |"


def _usage_cells(pdev, mem, usage):
    """Return the gpu / memory / ring-usage cells shared by all data rows."""
    rings = " ".join(
        "{0:7.2f}".format(calc_perc(usage, key)) for key, _ in _RING_COLUMNS
    )
    return (pdev, "{0:7d} KiB".format(mem), rings)


def can_access(path):
    """Return True if the ``fdinfo`` directory below *path* can be entered."""
    return os.access(os.path.join(path, "fdinfo"), os.X_OK)


def calc_perc(entry, metric):
    """Return the usage recorded for *metric* in the mapping *entry*.

    A list value (one number per ring) is summed, any other value is
    returned unchanged, and a missing metric counts as 0.0.
    """
    value = entry.get(metric, 0.0)
    if isinstance(value, list):
        return sum(value)
    return value


def _parse_fdinfo(lines):
    """Parse the lines of one fdinfo file into a dict.

    Recognised keys are ``pdev:``, ``pasid:``, ``vram`` and the engine
    prefixes in ``_ENGINES`` followed by a ring index.  Blank lines, unknown
    keys and lines whose numbers cannot be parsed (or whose ring index is
    out of range) are skipped instead of aborting the whole scan.
    """
    entry = {}
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        key = fields[0]
        try:
            if key == "pdev:":
                entry["pdev"] = fields[1]
            elif key == "pasid:":
                entry["pasid"] = fields[1]
            elif key == "vram":
                # "vram mem: <n> kB" -> the number is the third field.
                entry["mem"] = int(fields[2])
            else:
                for engine in _ENGINES:
                    if key.startswith(engine):
                        ring = int(key[len(engine):].rstrip(":"))
                        usage = float(fields[1].rstrip("%"))
                        slots = entry.setdefault(
                            engine, [0] * _RINGS_PER_ENGINE
                        )
                        slots[ring] = usage
                        break
        except (IndexError, ValueError):
            continue
    return entry


def _collect_process_stats(file):
    """Return ``{pdev: {"mem": int, <engine>: [per-ring usage]}}`` for *file*.

    Every fd of the process is inspected; fds without both ``pdev`` and
    ``pasid`` are ignored and each (pdev, pasid) pair is counted once, since
    several fds can report the same pair.
    """
    stat = {}
    seen = set()
    with os.scandir(os.path.join(file.path, "fdinfo")) as fds:
        for fd in fds:
            try:
                with open(fd.path) as f:
                    entry = _parse_fdinfo(f)
            except OSError:
                # The fd was closed while we were scanning; skip it.
                continue
            if "pdev" not in entry or "pasid" not in entry:
                continue
            ident = (entry["pdev"], entry["pasid"])
            if ident in seen:
                continue
            seen.add(ident)

            per_gpu = stat.setdefault(entry["pdev"], {})
            if "mem" in entry:
                per_gpu["mem"] = per_gpu.get("mem", 0) + entry["mem"]
            for engine in _ENGINES:
                if engine not in entry:
                    continue
                if engine in per_gpu:
                    per_gpu[engine] = [
                        a + b for a, b in zip(per_gpu[engine], entry[engine])
                    ]
                else:
                    per_gpu[engine] = entry[engine]
    return stat


def _lookup_username(uid):
    """Return the login name for *uid*, or the uid as text if it is unknown."""
    try:
        return pwd.getpwuid(uid)[0]
    except KeyError:
        # No passwd entry for this uid (common inside containers).
        return str(uid)


def process_pid(file):
    """Print one table row per GPU used by a process and update the totals.

    *file* is an ``os.DirEntry``-like object for a ``/proc/<pid>`` directory:
    ``file.path`` is the directory and ``file.name`` the pid shown in the
    table.  The per-GPU figures are added to the module-level ``total_mem``
    and ``total_usage`` dicts.  Nothing is printed for processes without GPU
    clients, and a process that exits (or becomes unreadable) during the scan
    is silently skipped.
    """
    try:
        stat = _collect_process_stats(file)
        if not stat:
            return
        with open(os.path.join(file.path, "comm")) as f:
            name = f.readline().strip()
        # The /proc/PID directory is owned by the process creator.
        uid = os.stat(file.path).st_uid
    except OSError:
        return
    username = _lookup_username(uid)

    for pdev, usage in stat.items():
        # A client may report no "vram" line at all; count that as 0.
        mem = int(usage.get("mem", 0))
        total_mem[pdev] = total_mem.get(pdev, 0) + mem

        gpu_totals = total_usage.setdefault(pdev, {})
        for key in usage:
            if key == "mem":
                continue
            gpu_totals[key] = calc_perc(usage, key) + gpu_totals.get(key, 0)

        print(_format_cells(
            (file.name, name, username) + _usage_cells(pdev, mem, usage)
        ))
        print(_border("-"))


path = "/proc/"


def main():
    """Scan every numeric directory under ``path`` and print the usage table."""
    ring_labels = " ".join(label.rjust(7) for _, label in _RING_COLUMNS)
    print(_border("="))
    print(_format_cells(
        ("pid", "name", "user", "gpu bdf", "fb usage", "ring usage (%)")
    ))
    print(_format_cells(("", "", "", "", "", ring_labels)))
    print(_border("="))

    with os.scandir(path) as procs:
        for file in procs:
            if (file.is_dir() and file.name.isnumeric()
                    and can_access(file.path)):
                process_pid(file)

    for gpu in total_mem:
        print(_format_cells(
            ("TOTAL", "", "")
            + _usage_cells(gpu, total_mem[gpu], total_usage[gpu])
        ))
    print(_border("="))


if __name__ == "__main__":
    main()

# _______________________________________________
# amd-gfx mailing list
# amd-gfx@xxxxxxxxxxxxxxxxxxxxx
# https://lists.freedesktop.org/mailman/listinfo/amd-gfx