v2: Add a comment explaining the dangers of directly accessing the DFT register (Daniel) Signed-off-by: Ben Widawsky <ben@xxxxxxxxxxxx> --- tools/Makefile.am | 6 ++- tools/intel_l3_parity.c | 46 ++++++++++++++++-- tools/intel_l3_parity.h | 31 ++++++++++++ tools/intel_l3_udev_listener.c | 108 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 5 deletions(-) create mode 100644 tools/intel_l3_parity.h create mode 100644 tools/intel_l3_udev_listener.c diff --git a/tools/Makefile.am b/tools/Makefile.am index 47bd5b3..19810cf 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -39,7 +39,7 @@ dist_bin_SCRIPTS = intel_gpu_abrt AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) -LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) +LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUDEV_LIBS) intel_dump_decode_SOURCES = \ intel_dump_decode.c @@ -50,3 +50,7 @@ intel_error_decode_SOURCES = \ intel_bios_reader_SOURCES = \ intel_bios_reader.c \ intel_bios.h + +intel_l3_parity_SOURCES = \ + intel_l3_parity.c \ + intel_l3_udev_listener.c diff --git a/tools/intel_l3_parity.c b/tools/intel_l3_parity.c index d2ad3c9..ead8fb5 100644 --- a/tools/intel_l3_parity.c +++ b/tools/intel_l3_parity.c @@ -37,6 +37,14 @@ #include "intel_chipset.h" #include "intel_gpu_tools.h" #include "drmtest.h" +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#if HAVE_UDEV +#include <libudev.h> +#include <syslog.h> +#endif +#include "intel_l3_parity.h" static unsigned int devid; /* L3 size is always a function of banks. 
The number of banks cannot be @@ -157,7 +165,8 @@ static void usage(const char *name) " -r, --row=[row] The row to act upon (default 0)\n" " -b, --bank=[bank] The bank to act upon (default 0)\n" " -s, --subbank=[subbank] The subbank to act upon (default 0)\n" - " -w, --slice=[slice] Which slice to act on (default: -1 [all])" + " -w, --slice=[slice] Which slice to act on (default: -1 [all])\n" + " , --daemon Run the listener (-L) as a daemon\n" " ACTIONS (only 1 may be specified at a time):\n" " -h, --help Display this help\n" " -H, --hw-info Display the current L3 properties\n" @@ -166,7 +175,8 @@ static void usage(const char *name) " -e, --enable Enable row, bank, subbank (undo -d)\n" " -d, --disable=<row,bank,subbank> Disable row, bank, subbank (inline arguments are deprecated. Please use -r, -b, -s instead\n" " -i, --inject [HSW only] Cause hardware to inject a row errors\n" - " -u, --uninject [HSW only] Turn off hardware error injectection (undo -i)\n", + " -u, --uninject [HSW only] Turn off hardware error injectection (undo -i)\n" + " -L, --listen Listen for uevent errors\n", name); } @@ -179,6 +189,7 @@ int main(int argc, char *argv[]) int fd[REAL_MAX_SLICES] = {0}, ret, i; int action = '0'; int drm_fd = drm_open_any(); + int daemonize = 0; devid = intel_get_drm_devid(drm_fd); if (intel_gen(devid) < 7 || IS_VALLEYVIEW(devid)) @@ -202,11 +213,18 @@ int main(int argc, char *argv[]) assert(lseek(fd[i], 0, SEEK_SET) == 0); } + /* NB: It is potentially unsafe to read this register if the kernel is + * actively using this register range, or we're running multiple + * instances of this tool. Since neither of those cases should occur + * (and the tool should be root only) we can safely ignore this for + * now. Just be aware of this if for some reason a hang is reported + * when using this tool. 
+ */ dft = intel_register_read(0xb038); while (1) { int c, option_index = 0; - static struct option long_options[] = { + struct option long_options[] = { { "help", no_argument, 0, 'h' }, { "list", no_argument, 0, 'l' }, { "clear-all", no_argument, 0, 'a' }, @@ -215,18 +233,23 @@ int main(int argc, char *argv[]) { "inject", no_argument, 0, 'i' }, { "uninject", no_argument, 0, 'u' }, { "hw-info", no_argument, 0, 'H' }, + { "listen", no_argument, 0, 'L' }, { "row", required_argument, 0, 'r' }, { "bank", required_argument, 0, 'b' }, { "subbank", required_argument, 0, 's' }, { "slice", required_argument, 0, 'w' }, + { "daemon", no_argument, &daemonize, 1 }, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "hHr:b:s:w:aled::iu", long_options, + c = getopt_long(argc, argv, "hHr:b:s:w:aled::iuL", long_options, &option_index); if (c == -1) break; + if (c == 0) + continue; + switch (c) { case '?': case 'h': @@ -274,6 +297,7 @@ int main(int argc, char *argv[]) case 'a': case 'l': case 'e': + case 'L': if (action != '0') { fprintf(stderr, "Only one action may be specified\n"); exit(EXIT_FAILURE); @@ -299,6 +323,20 @@ int main(int argc, char *argv[]) printf("warning: overwriting existing injections. 
This is very dangerous.\n"); } + /* Daemon doesn't work like the other commands */ + if (action == 'L') { + struct l3_parity par; + struct l3_location loc; + if (daemonize) { + assert(daemon(0, 0) == 0); + openlog(argv[0], LOG_CONS | LOG_PID, LOG_USER); + } + memset(&par, 0, sizeof(par)); + assert(l3_uevent_setup(&par) == 0); + assert(l3_listen(&par, daemonize == 1, &loc) == 0); + exit(EXIT_SUCCESS); + } + if (action == 'l') decode_dft(dft); diff --git a/tools/intel_l3_parity.h b/tools/intel_l3_parity.h new file mode 100644 index 0000000..65697c4 --- /dev/null +++ b/tools/intel_l3_parity.h @@ -0,0 +1,31 @@ +#ifndef INTEL_L3_PARITY_H_ +#define INTEL_L3_PARITY_H_ + +#include <stdint.h> +#include <stdbool.h> + +struct l3_parity { + struct udev *udev; + struct udev_monitor *uevent_monitor; + int fd; + fd_set fdset; +}; + +struct l3_location { + uint8_t slice; + uint16_t row; + uint8_t bank; + uint8_t subbank; +}; + +#if HAVE_UDEV +int l3_uevent_setup(struct l3_parity *par); +/* Listens (blocks) for an l3 parity event. Returns the location of the error. 
*/
+int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc);
+#define l3_uevent_teardown(par) {}
+#else /* no libudev: stubs take the same arguments as the real functions */
+#define l3_uevent_setup(par) (-1)
+#define l3_listen(par, daemon, loc) (-1)
+#endif
+
+#endif
diff --git a/tools/intel_l3_udev_listener.c b/tools/intel_l3_udev_listener.c
new file mode 100644
index 0000000..c50820c
--- /dev/null
+++ b/tools/intel_l3_udev_listener.c
@@ -0,0 +1,144 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if HAVE_UDEV
+/* Feature-test macros only take effect ahead of the first system header. */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <libudev.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <syslog.h>
+#include <sys/select.h>
+#include "i915_drm.h"
+#include "intel_l3_parity.h"
+
+#ifndef I915_L3_PARITY_UEVENT
+#define I915_L3_PARITY_UEVENT "L3_PARITY_ERROR"
+#endif
+
+/*
+ * Create a udev context and a netlink monitor filtered to drm/drm_minor
+ * events, enable it, and record the handles and the monitor fd in @par.
+ *
+ * Returns 0 on success. On failure returns a negative value, releases
+ * everything acquired so far and leaves @par untouched.
+ */
+int l3_uevent_setup(struct l3_parity *par)
+{
+	struct udev *udev;
+	struct udev_monitor *uevent_monitor;
+	int ret = -1;
+
+	udev = udev_new();
+	if (!udev)
+		return -1;
+
+	uevent_monitor = udev_monitor_new_from_netlink(udev, "udev");
+	if (!uevent_monitor)
+		goto err_udev;
+
+	ret = udev_monitor_filter_add_match_subsystem_devtype(uevent_monitor,
+			"drm", "drm_minor");
+	if (ret < 0)
+		goto err_monitor;
+
+	ret = udev_monitor_enable_receiving(uevent_monitor);
+	if (ret < 0)
+		goto err_monitor;
+
+	par->udev = udev;
+	par->uevent_monitor = uevent_monitor;
+	par->fd = udev_monitor_get_fd(uevent_monitor);
+	FD_ZERO(&par->fdset);
+	FD_SET(par->fd, &par->fdset);
+	return 0;
+
+err_monitor:
+	udev_monitor_unref(uevent_monitor);
+err_udev:
+	udev_unref(udev);
+	return ret;
+}
+
+/*
+ * Read an integer uevent property. A property that is not attached to the
+ * event yields 0 instead of handing a NULL pointer to the parser.
+ */
+static int l3_uevent_int(struct udev_device *dev, const char *key)
+{
+	const char *value = udev_device_get_property_value(dev, key);
+
+	return value ? atoi(value) : 0;
+}
+
+/*
+ * Block until the kernel reports an L3 parity error, store its location in
+ * @loc and report it.
+ *
+ * With @daemon set the report goes to syslog and listening continues, so the
+ * function only returns on failure. Otherwise the report is printed to stderr
+ * and 0 is returned after the first error.
+ *
+ * Returns select()'s result when that is <= 0, and -1 when the event cannot
+ * be received or the report does not fit the message buffer.
+ */
+int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc)
+{
+	for (;;) {
+		struct udev_device *udev_dev;
+		const char *parity_status;
+		char err_msg[256];
+		fd_set readfds;
+		int ret;
+
+		/* select() overwrites the set, so wait on a fresh copy. */
+		readfds = par->fdset;
+		ret = select(par->fd + 1, &readfds, NULL, NULL, NULL);
+		/* Number of bits set is returned, must be >= 1 */
+		if (ret <= 0)
+			return ret;
+
+		assert(FD_ISSET(par->fd, &readfds));
+
+		udev_dev = udev_monitor_receive_device(par->uevent_monitor);
+		if (!udev_dev)
+			return -1;
+
+		/* NULL means the property is absent: not a parity event. */
+		parity_status = udev_device_get_property_value(udev_dev,
+				I915_L3_PARITY_UEVENT);
+		if (!parity_status || strncmp(parity_status, "1", 1)) {
+			udev_device_unref(udev_dev);
+			continue;
+		}
+
+		loc->slice = l3_uevent_int(udev_dev, "SLICE");
+		loc->row = l3_uevent_int(udev_dev, "ROW");
+		loc->bank = l3_uevent_int(udev_dev, "BANK");
+		loc->subbank = l3_uevent_int(udev_dev, "SUBBANK");
+
+		udev_device_unref(udev_dev);
+
+		ret = snprintf(err_msg, sizeof(err_msg),
+				"Parity error detected on: %d,%d,%d,%d. "
+				"Try to run intel_l3_parity -r %d -b %d -s %d -w %d -d",
+				loc->slice, loc->row, loc->bank, loc->subbank,
+				loc->row, loc->bank, loc->subbank, loc->slice);
+		if (ret < 0 || (size_t)ret >= sizeof(err_msg))
+			return -1;
+
+		if (!daemon) {
+			fprintf(stderr, "%s\n", err_msg);
+			return 0;
+		}
+
+		syslog(LOG_INFO, "%s\n", err_msg);
+	}
+}
+#endif
-- 
1.8.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx