From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> This (bcc) patch modifies bcc so that we can override some function return values. We then create a new python script containing custom logic to decide where a file's data goes (rtdev or datadev) and inject the compiled eBPF code into the kernel. Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- src/cc/compat/linux/bpf.h | 7 ++ src/cc/compat/linux/virtual_bpf.h | 3 + src/cc/export/helpers.h | 2 + tools/xfs_rt.py | 130 +++++++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 2 deletions(-) create mode 100755 tools/xfs_rt.py diff --git a/src/cc/compat/linux/bpf.h b/src/cc/compat/linux/bpf.h index f896897..5a3ec0b 100644 --- a/src/cc/compat/linux/bpf.h +++ b/src/cc/compat/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index a2bcf07..7fbc365 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -735,7 +735,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 2b64ee8..571191e 100644 --- a/src/cc/export/helpers.h +++ 
b/src/cc/export/helpers.h @@ -204,6 +204,8 @@ static int (*bpf_probe_read)(void *dst, u64 size, const void *unsafe_ptr) = (void *) BPF_FUNC_probe_read; static u64 (*bpf_ktime_get_ns)(void) = (void *) BPF_FUNC_ktime_get_ns; +static int (*bpf_override_return)(void *ctx, u64 rc) = + (void *) BPF_FUNC_override_return; static u32 (*bpf_get_prandom_u32)(void) = (void *) BPF_FUNC_get_prandom_u32; static int (*bpf_trace_printk_)(const char *fmt, u64 fmt_size, ...) = diff --git a/tools/xfs_rt.py b/tools/xfs_rt.py new file mode 100755 index 0000000..b44fa14 --- /dev/null +++ b/tools/xfs_rt.py @@ -0,0 +1,130 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# xfs_rt Decide on file data block device placement via custom algorithm. +# Uses XFS hacks to inject... stuff. +# +# Copyright 2017 Oracle, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") + +from __future__ import print_function +from bcc import BPF +import argparse +from time import sleep, strftime +import ctypes as ct + +# arguments +examples = """examples: + ./xfs_rt +""" +parser = argparse.ArgumentParser( + description="Custom placement of data file blocks on XFS", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +args = parser.parse_args() +debug = 0 + +# define BPF program +bpf_text = """ +#include <uapi/linux/ptrace.h> +#include <linux/fs.h> + +struct xfs_fsop_geom { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting 
fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +}; + +/* Output for XFS_FS_COUNTS */ +struct xfs_fsop_counts { + __u64 freedata; /* free data section blocks */ + __u64 freertx; /* free rt extents */ + __u64 freeino; /* free inodes */ + __u64 allocino; /* total allocated inodes */ +}; + +typedef unsigned long long xfs_ino_t; + +int +xfs_hack_filter_iflags_begin( + struct pt_regs *ctx, + struct xfs_fsop_geom *geo, + struct xfs_fsop_counts *stats, + xfs_ino_t ino, + loff_t offset, + loff_t length, + uint xflags) +{ + bool use_rt = false; + +#if 0 + bpf_trace_printk("B: off=%llu len=%llu xflags=0x%x\\n", offset, length, xflags); + bpf_trace_printk("B: dblocks=%llu rblocks=%llu\\n", geo->datablocks, geo->rtblocks); + bpf_trace_printk("B: dfree=%llu rfree=%llu\\n", stats->freedata, stats->freertx); +#endif + + /* + * If the first allocation request is for 64k or more then we assume this + * is a "large" file and push it to the rt device. + */ + if (length >= 65536) + use_rt = true; + + /* + * Redirect files to the 'other' device if the chosen one is more + * than 80% full (freertx is in rt extents, so compare to rtextents). 
+ */ + if (use_rt && stats->freertx < geo->rtextents / 5) + use_rt = false; + else if (!use_rt && stats->freedata < geo->datablocks / 5) + use_rt = true; + + if (use_rt) + xflags |= FS_XFLAG_REALTIME; + else + xflags &= ~FS_XFLAG_REALTIME; + + bpf_override_return(ctx, xflags); + return 0; +} + +""" +if debug: + print(bpf_text) + +# initialize BPF +b = BPF(text=bpf_text) + +# common file functions +b.attach_kprobe(event="xfs_hack_filter_iflags", fn_name="xfs_hack_filter_iflags_begin") + +print("BPF HACKING XFS... Hit Ctrl-C to end.") + +# output +exiting = 0 +while (1): + try: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + if exiting: + exit() -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html