Add a (rough) python script that can parse the output of:

# trace-cmd record -e 'xfs_buf_*lock*' <other tracepoints>

to identify xfs_buf deadlocks between XFS threads.

Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
---
 tools/xfsbuflock.py |  205 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100755 tools/xfsbuflock.py

diff --git a/tools/xfsbuflock.py b/tools/xfsbuflock.py
new file mode 100755
index 0000000..f307461
--- /dev/null
+++ b/tools/xfsbuflock.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+
+# Read ftrace input, looking for XFS buffer deadlocks.
+#
+# Copyright (C) 2016 Oracle. All Rights Reserved.
+#
+# Author: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+# Rough guide to using this script:
+# Collect ftrace data from a deadlock:
+#
+# # trace-cmd record -e 'xfs_buf_*lock*' <other traces> &
+# <run command, hang system>^Z
+# # killall -INT trace-cmd
+# <wait for trace-cmd to spit out trace.dat>
+#
+# Now analyze the captured trace data:
+#
+# # trace-cmd report | xfsbuflock.py
+# === fsx-14956 ===
+# <trace data>
+# 3732.005575: xfs_buf_trylock_fail: dev 8:16 bno 0x1 nblks 0x1 hold 4 \
+#     pincount 1 lock 0 flags DONE|KMEM caller 0xc009af36s
+# Locked buffers:
+# dev 8:16 bno 0x64c371 nblks 0x1 lock 1 owner fsx-14956@3732.005567
+#  waiting: fsx-14954
+# dev 8:16 bno 0x64c380 nblks 0x8 lock 1 owner fsx-14956@3732.005571
+# dev 8:16 bno 0x64c378 nblks 0x8 lock 1 owner fsx-14956@3732.005570
+# === fsx-14954 ===
+# <trace data>
+# 3732.005592: xfs_buf_trylock_fail: dev 8:16 bno 0x64c371 nblks 0x1 hold 4 \
+#     pincount 1 lock 0 flags ASYNC|DONE|KMEM caller 0xc009af36s
+# Locked buffers:
+# dev 8:16 bno 0x8 nblks 0x8 lock 1 owner fsx-14954@3732.005583
+# dev 8:16 bno 0x1 nblks 0x1 lock 1 owner fsx-14954@3732.005574
+#  waiting: fsx-14956
+#  waiting: fsx-14957
+#  waiting: fsx-14958
+# dev 8:16 bno 0x10 nblks 0x8 lock 1 owner fsx-14954@3732.005585
+#
+# As you can see, fsx-14956 is locking AGFs in violation of the locking
+# order rules.
+
+import sys
+import fileinput
+from collections import namedtuple
+
+NR_BACKTRACE = 50
+
+class Process:
+    def __init__(self, pid):
+        self.pid = pid
+        self.bufs = set()
+        self.locked_bufs = set()
+        self.backtrace = []
+
+    def dump(self):
+        print('=== %s ===' % self.pid)
+        for bt in self.backtrace:
+            print('%f: %s' % (bt.time, bt.descr))
+        print('Locked buffers:')
+        for buf in self.locked_bufs:
+            buf.dump()
+
+class Buffer:
+    def __init__(self, dev, bno, blen):
+        self.dev = dev
+        self.bno = int(bno, 0)
+        self.blen = int(blen, 0)
+        self.locked = False
+        self.locktime = None
+        self.owner = None
+        self.waiters = set()
+
+    def trylock(self, process, time):
+        self.lockdone(process, time)
+
+    def lockdone(self, process, time):
+        if self.locked:
+            print('Buffer already locked on line %d?!' % nr)
+            # process.dump()
+            # self.dump()
+            # assert False
+        if process in self.waiters:
+            self.waiters.remove(process)
+        self.locked = True
+        self.owner = process
+        self.locktime = time
+        process.locked_bufs.add(self)
+        process.bufs.add(self)
+        locked_buffers.add(self)
+
+    def waitlock(self, process):
+        self.waiters.add(process)
+
+    def unlock(self):
+        self.locked = False
+        if self in locked_buffers:
+            locked_buffers.remove(self)
+        if self.owner is not None and \
+           self in self.owner.locked_bufs:
+            self.owner.locked_bufs.remove(self)
+
+    def dump(self):
+        if self.owner is not None:
+            pid = '%s@%f' % (self.owner.pid, self.locktime)
+        else:
+            pid = ''
+        print('dev %s bno 0x%x nblks 0x%x lock %d owner %s' % \
+            (self.dev, self.bno, self.blen, self.locked, \
+             pid))
+        for proc in self.waiters:
+            print(' waiting: %s' % proc.pid)
+
+Event = namedtuple('Event', 'time descr')
+
+# Read ftrace input, looking for events and for buffer lock info
+processes = {}
+buffers = {}
+locked_buffers = set()
+
+def getbuf(toks):
+    if int(toks[7], 0) == 18446744073709551615:
+        return None
+    bufkey = ' '.join(toks[4:10])
+    if bufkey in buffers:
+        return buffers[bufkey]
+    buf = Buffer(toks[5], toks[7], toks[9])
+    buffers[bufkey] = buf
+    return buf
+
+nr = 0
+for line in fileinput.input():
+    nr += 1
+    toks = line.split()
+    if len(toks) < 4:
+        continue
+    pid = toks[0]
+    time = float(toks[2][:-1])
+    fn = toks[3][:-1]
+
+    if pid in processes:
+        proc = processes[pid]
+    else:
+        proc = Process(pid)
+        processes[pid] = proc
+
+    if fn == 'xfs_buf_unlock' or fn == 'xfs_buf_item_unlock_stale':
+        buf = getbuf(toks)
+        if buf is not None:
+            buf.unlock()
+    elif fn == 'xfs_buf_lock_done':
+        buf = getbuf(toks)
+        if buf is not None:
+            buf.lockdone(proc, time)
+    elif fn == 'xfs_buf_lock':
+        buf = getbuf(toks)
+        if buf is not None:
+            buf.waitlock(proc)
+    elif fn == 'xfs_buf_trylock':
+        buf = getbuf(toks)
+        if buf is not None:
+            buf.trylock(proc, time)
+    elif fn == 'xfs_buf_item_unlock':
+        pass
+    else:
+        e = Event(time, ' '.join(toks[3:]))
+        proc.backtrace.append(e)
+        if len(proc.backtrace) > NR_BACKTRACE:
+            proc.backtrace.pop(0)
+
+deadlocked = set()
+for buf in locked_buffers:
+    deadlocked.add(buf.owner)
+
+for proc in deadlocked:
+    proc.dump()
+
+sys.exit(0)
+
+for key in buffers:
+    buf = buffers[key]
+    if buf.locked:
+        print('dev %s bno 0x%x len 0x%x owner %s' % (buf.dev, buf.bno, buf.blen, buf.owner.pid))
+    else:
+        print('dev %s bno 0x%x len 0x%x' % (buf.dev, buf.bno, buf.blen))
+
+sys.exit(0)
+
+for pid in processes:
+    proc = processes[pid]
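
For reference, a quick sketch (not part of the patch) of the line layout the parser assumes: trace-cmd report prints the task name/pid, a CPU column, the timestamp, the event name, and then the event payload, which is why the script takes the timestamp from toks[2], the event name from toks[3], and builds the buffer key from toks[4:10]. The sample line below reuses the trylock_fail event from the example output above; the '[002]' CPU field is an assumed placeholder implied by that indexing, not something shown in the patch.

    line = ('fsx-14956 [002] 3732.005575: xfs_buf_trylock_fail: '
            'dev 8:16 bno 0x1 nblks 0x1 hold 4 pincount 1 lock 0 '
            'flags DONE|KMEM caller 0xc009af36')
    toks = line.split()
    pid = toks[0]                  # task-pid, e.g. 'fsx-14956'
    time = float(toks[2][:-1])     # strip trailing ':' -> 3732.005575
    fn = toks[3][:-1]              # strip trailing ':' -> 'xfs_buf_trylock_fail'
    bufkey = ' '.join(toks[4:10])  # 'dev 8:16 bno 0x1 nblks 0x1'
    print(pid, time, fn, bufkey)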