Re: kernel oops on mmotm-2015-10-15-15-20

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Oct 22, 2015 at 10:21:36AM +0900, Minchan Kim wrote:
> Hello Hugh,
> 
> On Wed, Oct 21, 2015 at 05:59:59PM -0700, Hugh Dickins wrote:
> > On Thu, 22 Oct 2015, Minchan Kim wrote:
> > > 
> > > I added the code to check it and queued it again but I had another oops
> > > in this time but symptom is related to anon_vma, too.
> > > (kernel is based on recent mmotm + unconditional mkdirty for bug fix)
> > > It seems page_get_anon_vma returns NULL since the page was not page_mapped
> > > at that time but second check of page_mapped right before try_to_unmap seems
> > > to be true.
> > > 
> > > Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
> > > Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
> > > page:ffffea0001cfbfc0 count:3 mapcount:1 mapping:ffff88007f1b5f51 index:0x600000aff
> > > flags: 0x4000000000048019(locked|uptodate|dirty|swapcache|swapbacked)
> > > page dumped because: VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma)
> > 
> > That's interesting, that's one I added in my page migration series.
> > Let me think on it, but it could well relate to the one you got before.
> 
> I will roll back to mm/madv_free-v4.3-rc5-mmotm-2015-10-15-15-20
> instead of next-20151021 to remove noise from your migration cleanup
> series and will test it again.
> If it is fixed, I will test again with your migration patchset, then.

I tested mmotm-2015-10-15-15-20 for a long time with the test program I attach.
That tree therefore contains none of Hugh's migration patches.
At Kirill's request, I also added the debug code below to all test kernels.

diff --git a/mm/rmap.c b/mm/rmap.c
index ddfb9be72366..1c23b70b1f57 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -513,6 +513,13 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
 
        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        root_anon_vma = READ_ONCE(anon_vma->root);
+
+       if (root_anon_vma == NULL) {
+               printk("anon_vma %p refcount %d\n", anon_vma,
+                       atomic_read(&anon_vma->refcount));
+               VM_BUG_ON_PAGE(1, page);
+       }
+
        if (down_read_trylock(&root_anon_vma->rwsem)) {
                /*
                 * If the page is still mapped, then this anon_vma is still


1. mmotm-2015-10-15-15-20 + kirill's pte_mkdirty

1st trial:
Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
BUG: Bad rss-counter state mm:ffff88007f1ed780 idx:1 val:488
BUG: Bad rss-counter state mm:ffff88007f1ed780 idx:2 val:24

2nd trial:

Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
BUG: Bad rss-counter state mm:ffff8800a5cca680 idx:1 val:512
Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS

2. mmotm-2015-10-15-15-20-no-madvise_free, IOW it means git head for
54bad5da4834 arm64: add pmd_[dirty|mkclean] for THP.

1st trial:
Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
BUG: Bad rss-counter state mm:ffff88007f4c2d80 idx:1 val:511
BUG: Bad rss-counter state mm:ffff88007f4c2d80 idx:2 val:1

2nd trial:
Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
Adding 4191228k swap on /dev/vda5.  Priority:-1 extents:1 across:4191228k FS
anon_vma ffff880000089aa0 refcount 0
page:ffffea0001a2ea40 count:3 mapcount:1 mapping:ffff880000089aa1 index:0x6000047a9

I tested it with KVM; the guest system has 12 cores and 3G of memory.
For mmotm-2015-10-15-15-20-no-madvise_free, I tweaked the test program to do
madvise_dontneed instead of madvise_free via the patch below.

For the testing,

        gcc -o oops oops.c
        ./memcg_test.sh

I will be away from now on, so please excuse any late responses,
but I hope my test program will reproduce it on your machine.

diff --git a/oops.c b/oops.c
index e50330a..c8298f8 100644
--- a/oops.c
+++ b/oops.c
@@ -8,7 +8,7 @@
 #include <errno.h>
 #include <signal.h>
 
-#define MADV_FREE 5
+#define MADV_FREE 4
 
 int pid;

Attachment: memcg_move_task.sh
Description: Bourne shell script

Attachment: memcg_test.sh
Description: Bourne shell script

#include <sys/types.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>

/*
 * Advice value that madvise_free_bufs() passes to madvise().
 *
 * NOTE(review): per the accompanying mail, this was changed from 5 to 4 so
 * that the program issues madvise_dontneed instead of madvise_free when run
 * on the kernel without the madvise_free patches. Confirm the right value
 * for the kernel under test before reusing this program.
 */
#define MADV_FREE 4

/* Process id of this test instance; set in main() and used in log output. */
int pid;

/*
 * Handler registered by main() for SIGINT and SIGHUP: print which signal
 * this process received, then terminate with exit status 1.
 *
 * NOTE(review): printf() and exit() are not async-signal-safe. That is
 * tolerable in a throw-away reproducer, but not a pattern to copy.
 */
void sig_handler(int signo)
{
	const int status = 1;

	printf("pid %d sig received %d\n", pid, signo);
	exit(status);
}

/*
 * Unmap every buffer recorded in bufs[] and clear its slot.
 *
 * bufs:      array of buf_count mapping addresses; NULL slots are skipped.
 * buf_count: number of slots in bufs.
 * buf_size:  length in bytes handed to munmap() for each mapping.
 *
 * A munmap() failure is reported with perror() but does not stop the loop;
 * the slot is cleared either way so it can never be unmapped twice.
 */
void free_bufs(void **bufs, unsigned long buf_count, unsigned long buf_size)
{
	/* Same type as buf_count, so the loop bound is compared without
	 * a signed/unsigned conversion. */
	unsigned long i;

	for (i = 0; i < buf_count; i++) {
		if (bufs[i] == NULL)
			continue;

		if (munmap(bufs[i], buf_size) != 0)
			perror("munmap");
		bufs[i] = NULL;
	}
}

/*
 * Map buf_count anonymous, private, read-write buffers of buf_size bytes
 * each, laid out back to back starting at the fixed address 0x600000000000,
 * and store each mapping address in bufs[].
 *
 * bufs:      output array with room for buf_count pointers.
 * buf_count: number of buffers to map.
 * buf_size:  size of each buffer in bytes.
 *
 * The mappings are requested with MAP_FIXED. If any mmap() fails, the
 * cause is reported, /proc/<pid>/maps is dumped through system(), and the
 * program exits with status 1.
 */
void alloc_bufs(void **bufs, unsigned long buf_count, unsigned long buf_size)
{
	unsigned long i;
	/* char * so stepping by buf_size is standard pointer arithmetic */
	char *addr = (char *)0x600000000000;

	for (i = 0; i < buf_count; i++) {
		/* fd -1 is the portable value for an anonymous mapping */
		void *ptr = mmap(addr, buf_size, PROT_READ|PROT_WRITE,
				MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0);

		if (ptr == MAP_FAILED) {
			char cmd[64];

			/* report errno first, before other calls can change it */
			perror("mmap");
			printf("error to allocate %p\n", (void *)addr);

			snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", pid);
			system(cmd);
			exit(1);
		}

		addr += buf_size;
		bufs[i] = ptr;
	}
}

/*
 * Fill every buffer with a per-buffer byte pattern: each byte of buffer i
 * is set to 'a' + i, reduced to an unsigned char.
 *
 * bufs:      array of buf_count writable buffers.
 * buf_count: number of buffers.
 * buf_size:  number of bytes to write into each buffer.
 *
 * check_madvise_bufs() later compares buffer contents against this same
 * pattern.
 */
void fill_bufs(void **bufs, unsigned long buf_count, unsigned long buf_size)
{
	unsigned long i;

	for (i = 0; i < buf_count; i++)
		memset(bufs[i], (unsigned char)('a' + i), buf_size);
}

/*
 * Apply madvise(advise) to each of the buf_count buffers.
 *
 * bufs:      array of buf_count mapping addresses.
 * buf_count: number of buffers.
 * buf_size:  length in bytes of each buffer.
 * advise:    advice value passed straight through to madvise().
 *
 * madvise() reports failure by returning -1 and setting errno, so the
 * retry decision is made on errno: EAGAIN sleeps one second and retries
 * the same buffer; any other error terminates the program with status 1.
 */
void madvise_bufs(void **bufs, unsigned long buf_count,
			unsigned long buf_size, int advise)
{
	unsigned long i;

	for (i = 0; i < buf_count; i++) {
		while (madvise(bufs[i], buf_size, advise) != 0) {
			/* perror() may change errno, so capture it first */
			int err = errno;

			perror("fail to madvise");
			if (err != EAGAIN)
				exit(1);

			sleep(1);
		}
	}
}

/*
 * Issue madvise(MADV_FREE) on each of the buf_count buffers.
 *
 * A failing call is only logged (pid, buffer index, buffer address); the
 * walk continues with the next buffer and nothing is returned.
 */
void madvise_free_bufs(void **bufs, unsigned long buf_count,
			unsigned long buf_size)
{
	int idx = 0;

	while (idx < buf_count) {
		void *buf = bufs[idx];

		if (madvise(buf, buf_size, MADV_FREE) != 0)
			printf("[%d] bufs[%d] %p madvise_free fail\n",
				pid, idx, buf);
		idx++;
	}
}

/*
 * Verify the contents of every buffer byte by byte.
 *
 * bufs:      array of buf_count readable buffers.
 * buf_count: number of buffers.
 * buf_size:  number of bytes to check in each buffer.
 * freeable:  non-zero if a zero byte (page purged) is also acceptable.
 *
 * Each byte of buffer i must hold the pattern written by fill_bufs(),
 * i.e. 'a' + i reduced to a single byte, or 0 when freeable is set.
 * On the first mismatch a diagnostic is printed and the program exits
 * with status 1.
 */
void check_madvise_bufs(void **bufs, unsigned long buf_count,
			unsigned long buf_size, int freeable)
{
	unsigned long i, j;

	for (i = 0; i < buf_count; i++) {
		const char *buf = bufs[i];
		/*
		 * Reduce the pattern to one byte before comparing: a plain
		 * "tmp == 'a' + i" compares char against int and reports a
		 * false mismatch once 'a' + i no longer fits in a char.
		 */
		const char expected = (char)(unsigned char)('a' + i);

		for (j = 0; j < buf_size; j++) {
			char tmp = buf[j];

			/* The page was not purged */
			if (tmp == expected)
				continue;

			/* The page was purged */
			if (freeable && tmp == 0)
				continue;

			/* Something wrong happens */
			printf("pid %d bufaddr %p ofs %lu freeable %d expected %c but %c\n",
					pid, bufs[i], j, freeable, expected, tmp);
			exit(1);
		}
	}
}

/*
 * Parse all of s as a decimal long into *out.
 * Returns 0 on success, -1 if s holds no digits, has trailing characters,
 * or is out of range for long.
 */
static int parse_long(const char *s, long *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(s, &end, 10);
	if (errno != 0 || end == s || *end != '\0')
		return -1;

	*out = val;
	return 0;
}

/*
 * Usage: <prog> <buf_size> <buf_count> <advise>
 *
 *   buf_size   size of each buffer in bytes; must be a non-zero multiple
 *              of 2M
 *   buf_count  number of buffers; must be non-zero
 *   advise     advice value handed to madvise() in the first pass
 *
 * After argument checking and installing the SIGINT/SIGHUP handler, loops
 * forever: map and fill the buffers, verify them, apply the caller's
 * advice, apply MADV_FREE, verify again (zeroed bytes now allowed), unmap.
 * Returns 1 on bad arguments or setup failure; the loop itself only ends
 * through exit() in a helper or the signal handler.
 */
int main(int argc, char *argv[])
{
	long size_arg, count_arg, advise_arg;
	unsigned long buf_size, buf_count;
	int advise;
	void **bufs;

	pid = getpid();

	if (argc != 4) {
		printf("check your argument\n");
		return 1;
	}

	/* Reject non-numeric, zero, or negative sizes and counts up front */
	if (parse_long(argv[1], &size_arg) || size_arg <= 0 ||
	    parse_long(argv[2], &count_arg) || count_arg <= 0 ||
	    parse_long(argv[3], &advise_arg) || advise_arg < 0 ||
	    advise_arg != (long)(int)advise_arg) {
		printf("check your argument\n");
		return 1;
	}

	buf_size = (unsigned long)size_arg;
	buf_count = (unsigned long)count_arg;
	advise = (int)advise_arg;

	/* 2M is 1 << 21; a mask built from 2 << 20 would demand 4M alignment */
	if (buf_size & ((1UL << 21) - 1)) {
		printf("buf_size should be 2M aligned\n");
		return 1;
	}

	printf("[%d] buf size %lu buf_count %lu advise %d\n",
			pid, buf_size, buf_count, advise);

	if (signal(SIGINT, sig_handler) == SIG_ERR) {
		printf("Fail to register signal handler\n");
		return 1;
	}

	if (signal(SIGHUP, sig_handler) == SIG_ERR) {
		printf("Fail to register signal handler\n");
		return 1;
	}

	/* calloc zeroes the slots and checks count * size for overflow */
	bufs = calloc(buf_count, sizeof(*bufs));
	if (!bufs)
		return 1;

	while (1) {
		alloc_bufs(bufs, buf_count, buf_size);

		fill_bufs(bufs, buf_count, buf_size);

		/* We touched buffers so MADV_FREE cannot free pages */
		check_madvise_bufs(bufs, buf_count, buf_size, 0);

		madvise_bufs(bufs, buf_count, buf_size, advise);

		sleep(1);

		/* syscall MADV_FREE */
		madvise_free_bufs(bufs, buf_count, buf_size);

		sleep(1);

		check_madvise_bufs(bufs, buf_count, buf_size, 1);
		free_bufs(bufs, buf_count, buf_size);
	}

	return 0;
}

Attachment: oops.sh
Description: Bourne shell script

Attachment: setup_memcg.sh
Description: Bourne shell script


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]