Re: Interactivity regression since v3.11 in mm/vmscan.c

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu 05-06-14 09:00:10, Felipe Contreras wrote:
> On Thu, Jun 5, 2014 at 8:37 AM, Michal Hocko <mhocko@xxxxxxx> wrote:
> > On Thu 05-06-14 06:33:40, Felipe Contreras wrote:
> 
> >> For a while I've noticed that my machine bogs down in certain
> >> situations, usually while doing heavy I/O operations, it is not just the
> >> I/O operations, but everything, including the graphical interface, even
> >> the mouse pointer.
> >>
> >> As far as I can recall this did not happen in the past.
> >>
> >> I noticed this specially on certain operations, for example updating a
> >> a game on Steam (to an exteranl USB 3.0 device), or copying TV episodes
> >> to a USB memory stick (probably flash-based).
> >
> > We had a similar report for opensuse. The common part was that there was
> > an IO to a slow USB device going on.
> 
> Well, it's a USB 3.0 device, I can write at 250 MB/s, so it's not
> really that slow.
> 
> And in fact, when I read and write to and from the same USB 3.0
> device, I don't see the issue.
> 
> >> Then I went back to the latest stable version (v3.14.5), and commented
> >> out the line I think is causing the slow down:
> >>
> >>   if (nr_unqueued_dirty == nr_taken || nr_immediate)
> >>         congestion_wait(BLK_RW_ASYNC, HZ/10);
> >
> > Yes, I came to the same check. I didn't have any confirmation yet so
> > thanks for your confirmation. I've suggested to reduce this
> > congestion_wait only to kswapd:
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 32c661d66a45..ef6a1c0e788c 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1566,7 +1566,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> >                  * implies that pages are cycling through the LRU faster than
> >                  * they are written so also forcibly stall.
> >                  */
> > -               if (nr_unqueued_dirty == nr_taken || nr_immediate)
> > +               if ((nr_unqueued_dirty == nr_taken || nr_immediate) && current_is_kswapd())
> >                         congestion_wait(BLK_RW_ASYNC, HZ/10);
> >         }
> 
> Unfortunately that doesn't fix the issue for me.

That is really interesting. So removing the test completely helps but
reducing it to kswapd doesn't. I would expect stalls coming from direct
reclaimers not the kswapd.

Mel has a nice systemtap script (attached) to watch for stalls. Maybe
you can give it a try?

-- 
Michal Hocko
SUSE Labs
#!/usr/bin/perl
# This script is a combined perl and systemtap script to collect information
# on a system stalling in writeback. Ordinarily, one would expect that all
# information be collected in a STAP script. Unfortunately, in practice the
# stack unwinder in systemtap may not work with a current kernel version,
# have trouble collecting all the data necessary or some other oddities.
# Hence this hack. A systemtap script is run and as it records interesting
# events, the remaining information is collected from the script. This means
# that the data is *never* exact but can be better than nothing and easier
# than a fully manual check
#
# Copyright Mel Gorman <mgorman@xxxxxxx> 2011

use File::Temp qw/mkstemp/;
use File::Find;
use FindBin qw($Bin);
use Getopt::Long;
use strict;

my @trace_functions = (
	# "get_request_wait" is now special cased unfortunately
	"wait_for_completion",
	"wait_on_page_bit",
	"wait_on_page_bit_killable",
	"try_to_free_pages"
	);

my @completion_functions=(
	"handle_mm_fault",
	"sys_select",
	"__wake_up",
	"wake_up_bit",
	"__alloc_pages_nodemask",
	"balance_pgdat",
	"kmem_cache_alloc");

my @trace_conditional = (
	"sync_page",
	"sync_buffer",
	"sleep_on_buffer",
	"try_to_compact_pages",
	"balance_dirty_pages_ratelimited_nr",
	"balance_dirty_pages",
	);

# Information on each stall is gathered and stored in a hash table for
# dumping later. Define some constants for the table lookup to avoid
# blinding headaches
use constant VMSTAT_AT_STALL       => 0;
use constant VMSTAT_AT_COMPLETE    => 1;
use constant BLOCKSTAT_AT_STALL    => 2;
use constant BLOCKSTAT_AT_COMPLETE => 3;
use constant PROCNAME              => 4;
use constant STACKTRACE            => 5;
use constant STALLFUNCTION         => 6;

use constant NR_WRITEBACK => 0;
use constant NR_DIRTY     => 2;
use constant VMSCAN_WRITE => 1;

sub usage() {
	print("In general, this script is not supported and that includes help.\n");
	exit(0);
}

# Option variables
my $opt_help;
my $opt_output;
my $opt_stapout;
my $opt_accurate_stall = 1;
my $opt_accurate_stack = 0;
GetOptions(
	'help|h'		=> \$opt_help,
	'output|o=s'		=> \$opt_output,
	'stapout|s=s'		=> \$opt_stapout,
	'accurate-stack|a'	=> \$opt_accurate_stack,
	'accurate-stall|a'	=> \$opt_accurate_stall,
);

usage() if $opt_help;
if ($opt_output) {
	open(OUTPUT, ">$opt_output") || die("Failed to open $opt_output for writing");
}
if ($opt_stapout) {
	open(OUTPUT, ">$opt_stapout") || die("Failed to open $opt_stapout for writing");
}

if ($opt_accurate_stack) {
	$opt_accurate_stall = 0;
}
if ($opt_accurate_stall) {
	$opt_accurate_stack = 0;
}

# Handle cleanup of temp files
my $stappid;
my ($handle, $stapscript) = mkstemp("/tmp/stapdXXXXX");
sub cleanup {
	if (defined($stappid)) {
		kill INT => $stappid;
	}
	if (defined($opt_output)) {
		close(OUTPUT);
	}
	unlink($stapscript);
}
sub sigint_handler {
	close(STAP);
	cleanup();
	exit(0);
}
$SIG{INT} = "sigint_handler";

# Build a list of stat files to read. Obviously this is not great if device
# hotplug occurs but that is not expected for the moment and this is lighter
# than running find every time
my @block_iostat_files;
sub d {
	my $file = $File::Find::name;
	return if $file eq "/sys/block";
	push(@block_iostat_files, "$file/stat");
}
find(\&d, ("/sys/block/"));

##
# Read the current stack of a given pid
sub read_stacktrace($) {
	open(STACK, "/proc/$_[0]/stack") || return "Stack unavailable";
	my $stack = do {
		local $/;
		<STACK>;
	};
	close(STACK);
	return $stack;
}

##
# Read information of relevant from /proc/vmstat
sub read_vmstat {
	if (!open(VMSTAT, "/proc/vmstat")) {
		cleanup();
		die("Failed to read /proc/vmstat");
	}

	my $vmstat;
	my ($key, $value);
	my @values;
	while (!eof(VMSTAT)) {
		$vmstat = <VMSTAT>;
		($key, $value) = split(/\s+/, $vmstat);
		chomp($value);

		if ($key eq "nr_writeback") {
			$values[NR_WRITEBACK] = $value;
		}
		if ($key eq "nr_dirty") {
			$values[NR_DIRTY] = $value;
		}
		if ($key eq "nr_vmscan_write") {
			$values[VMSCAN_WRITE] = $value;
		}
	}

	return \@values;
}

##
# Read information from all /sys/block stat files
sub read_blockstat($) {
	my $prefix = $_[0];
	my $stat;
	my $ret;
	
	foreach $stat (@block_iostat_files) {
		if (open(STAT, $stat)) {
			$ret .= sprintf "%s%20s %s", $prefix, $stat, <STAT>;
			close(STAT);
		}
	}
	return $ret;
}

##
# Record a line of output
sub log_output {
	if (defined($opt_output)) {
		print OUTPUT @_;
	}
	print @_;
}

sub log_printf {
	if (defined($opt_output)) {
		printf OUTPUT @_;
	}
	printf @_;
}

sub log_stap {
	if (defined($opt_stapout)) {
		print OUTPUT @_;
	}
}

# Crude as hell, do not really care
my %found_alts;
sub search_kallsyms {
	my @search_symbols = @_;

	# Read kernel symbols and add conditional trace functions if they exist
	open(KALLSYMS, "/proc/kallsyms") || die("Failed to open /proc/kallsyms");
	while (<KALLSYMS>) {
		my ($dummy, $dummy, $symbol) = split(/\s+/, $_);
		my $conditional;
		if ($symbol eq "get_request_wait" || $symbol eq "shrink_zone") {
			push(@trace_functions, $symbol);
			$found_alts{$symbol} = 1;
			next;
		}
		foreach $conditional (@search_symbols) {
			if ($symbol eq $conditional) {
				push(@trace_functions, $symbol);
				last;
			}
		}
	}
	close(KALLSYMS);
}
search_kallsyms(@trace_conditional);
if ($found_alts{"get_request_wait"} != 1) {
	push(@trace_functions, "get_request");
}

if ($found_alts{"shrink_zone"} != 1) {
	my @alt_shrinks = ("shrink_zones",
			"kswapd_shrink_zone",
			"__zone_reclaim",
			"balance_pgdat");
	search_kallsyms(@alt_shrinks);
}

# Extract the framework script and fill in the rest
open(SELF, "$0") || die("Failed to open running script");
while (<SELF>) {
	chomp($_);
	if ($_ ne "__END__") {
		next;
	}
	while (<SELF>) {
		print $handle $_;
	}
}
foreach(@trace_functions) {
	print $handle "probe kprobe.function(\"$_\")
{ 
	t=tid()
	name[t]=execname()
	stalled_at[t]=time()
	where[t]=\"$_\"
	delete stalled[t]
}";
}

if ($opt_accurate_stall) {
	# In an ideal world, we would always use a retprobe to catch exactly when
	# the function exited and get a stall time from it. Unfortunately, it mangles
	# the stack trace so we have the option of either accurately tracking stalls
	# or accurately tracking stacks
	foreach(@trace_functions) {
		print $handle "probe kprobe.function(\"$_\").return
{
	t=tid()

	if ([t] in stalled) {
		stall_time = time() - stalled_at[t]
		printf(\"C %d (%s) %d %s %s\\n\", t, name[t], stall_time, time_units, where[t])
	}

	delete stalled[t]
	delete name[t]
	delete stalled_at[t]
	delete where[t]
}"
	}
} else {
	# Alternatively, we try to catch when a stall completes by probing
	# commonly used functions and guessing that when they are called
	# that the operation completed
	foreach(@completion_functions) {
		print $handle "probe kprobe.function(\"$_\").return
{
	t=tid()

	if ([t] in stalled) {
		stall_time = time() - stalled_at[t]
		printf(\"C %d (%s) %d %s %s\\n\", t, name[t], stall_time, time_units, where[t])
	}

	delete stalled[t]
	delete name[t]
	delete stalled_at[t]
	delete where[t]
}";
	}
}

close($handle);

# Contact
$stappid = open(STAP, "stap $stapscript|");
if (!defined($stappid)) {
	die("Failed to execute stap script");
}

# Collect information until interrupted
my %stalled;
while (1) {
	if (eof(STAP)) {
		cleanup();
		die("Unexpected exit of STAP script");
	}

	my $input = <STAP>;
	log_stap($input);
	if ($input !~ /([CS]) ([0-9]*) \((.*)\) ([0-9]*) ms (.*)/) {
		cleanup();
		die("Failed to parse input from stap script\n");
	}

	my $event    = $1;
	my $pid      = $2;
	my $name     = $3;
	my $stalled  = $4;
	my $where    = $5;
	my $recursed = 0;
	
	# Check if we have recursively stalled. This is "impossible" but unless
	# we are using kretprobes, we cannot reliable catch when stalls complete
	if (defined($stalled{$pid}->{NAME}) && $event eq "S") {
		$recursed = 1;
		if ($opt_accurate_stall) {
			cleanup();
			print("Apparently recursing, missing kretprobes.\n");
			print("Process:  $pid ($name)\n");
			print("Stalled:  " . $stalled{$pid}->{STALLFUNCTION} . "\n");
			print($stalled{$pid}->{STACKTRACE});
			print("Stalling: $where\n");
			exit(-1);
		}
	}

	# Record information related to stalls.
	if ($event eq "C" || $recursed) {
		if ($name ne $stalled{$pid}->{NAME}) {
			cleanup();
			die("Processes are changing their identity.");
		}
		if ($where ne $stalled{$pid}->{STALLFUNCTION}) {
			$recursed = 1;
			if ($opt_accurate_stall) {
				cleanup();
				die("The stalling function teleported.");
			}
		}

		# Do not event pretend the stall time is accurate
		if ($recursed) {
			$stalled = -1;
		}

		$stalled{$pid}->{VMSTAT_AT_COMPLETE} = read_vmstat();
		$stalled{$pid}->{BLOCKSTAT_AT_COMPLETE} = read_blockstat("+");
		my $delta_writeback    = $stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_WRITEBACK] - $stalled{$pid}->{VMSTAT_AT_STALL}[NR_WRITEBACK];
		my $delta_dirty        = $stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_DIRTY]     - $stalled{$pid}->{VMSTAT_AT_STALL}[NR_DIRTY];
		my $delta_vmscan_write = $stalled{$pid}->{VMSTAT_AT_COMPLETE}[VMSCAN_WRITE] - $stalled{$pid}->{VMSTAT_AT_STALL}[VMSCAN_WRITE];

		# Blind stab in the dark as to what is going on
		my $status;
		if ($where eq "balance_dirty_pages") {
			$status = "DirtyThrottled";
		} else {
			$status = "IO";
		}
		if ($delta_writeback < 0) {
			$status = "${status}_WritebackInProgress";
		}
		if ($delta_writeback > 0) {
			$status = "${status}_WritebackSlow";
		}

		log_output("time " . time() . ": $pid ($name) Stalled: $stalled ms: $where\n");
		log_output("Guessing: $status\n");
		log_printf("-%-15s %12d\n", "nr_dirty",        $stalled{$pid}->{VMSTAT_AT_STALL}[NR_DIRTY]);
		log_printf("-%-15s %12d\n", "nr_writeback",    $stalled{$pid}->{VMSTAT_AT_STALL}[NR_WRITEBACK]);
		log_printf("-%-15s %12d\n", "nr_vmscan_write", $stalled{$pid}->{VMSTAT_AT_STALL}[VMSCAN_WRITE]);
		log_printf("%s", $stalled{$pid}->{BLOCKSTAT_AT_STALL});
		log_printf("+%-15s %12d %12d\n", "nr_dirty",
			$stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_DIRTY], $delta_dirty);
		log_printf("+%-15s %12d %12d\n", "nr_writeback",
			$stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_WRITEBACK], $delta_writeback);
		log_printf("+%-15s %12d %12d\n", "nr_vmscan_write",
			$stalled{$pid}->{VMSTAT_AT_COMPLETE}[VMSCAN_WRITE],
			$delta_vmscan_write);
		log_printf("%s", $stalled{$pid}->{BLOCKSTAT_AT_COMPLETE});
		log_output($stalled{$pid}->{STACKTRACE});

		delete($stalled{$pid});
	}

	if ($event eq "S") {
		$stalled{$pid}->{NAME} = $name;
		$stalled{$pid}->{STACKTRACE} = read_stacktrace($pid);
		$stalled{$pid}->{VMSTAT_AT_STALL} = read_vmstat();
		$stalled{$pid}->{BLOCKSTAT_AT_STALL} = read_blockstat("-");
		$stalled{$pid}->{STALLFUNCTION} = $where;
	}
}

cleanup();
exit(0);
__END__
function time () { return gettimeofday_ms() }
global stall_threshold = 1000
global time_units = "ms"
global name, stalled_at, stalled, where

probe timer.profile {
	foreach (tid+ in stalled_at) {
		if ([tid] in stalled) continue

		stall_time = time() - stalled_at[tid]
		if (stall_time >= stall_threshold) {
			printf ("S %d (%s) %d %s %s\n", tid, name[tid], stall_time, time_units, where[tid])
			stalled[tid] = 1 # defer further reports to wakeup
		}
	}
}


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]