Re: [PATCH] mm,hwpoison: non-current task should be checked early_kill for force_early

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 18 Jan 2021 08:57:47 +0000
HORIGUCHI NAOYA(堀口 直也) <naoya.horiguchi@xxxxxxx> wrote:

> On Mon, Jan 18, 2021 at 04:15:12PM +0800, Aili Yao wrote:
> > On Mon, 18 Jan 2021 06:50:54 +0000
> > HORIGUCHI NAOYA(堀口 直也) <naoya.horiguchi@xxxxxxx> wrote:
> >   
> > > 
> > > For action optional cases, one error event kills *only one* process. If an
> > > error page are shared by multiple processes, these processes will be killed
> > > by separate error events, each of which is triggered when each process tries
> > > to access the error memory.  So these processes would be killed immediately
> > > when accessing the error, but you don't have to kill all at the same time
> > > (or actually you might not even have to kill it at all if the process exits
> > > finally without accessing the error later).
> > > 
> > > Maybe the function variable "force_early" is named confusingly (it sounds
> > > that it's related to PF_MCE_KILL_EARLY flag, but that's incorrect).
> > > I'll submit a fix later.  (I'll add your "Reported-by" because you made me
> > > find it, thank you.)
> > >   
> > I think we should do more for non current process error case, we should mark it AO for processes to be signaled
> > or we may take wrong action.  
> 
> I'm not sure what you mean by "non current process error case" and "we
> should mark it AO", so could you explain more specifically about your error
> scenario?  Especially I'd like to know about who triggers hard offline on
> what hardware events and what "wrong action" could happen.  Maybe just
> "calling memory_failure() with MF_ACTION_REQUIRED" is not enough, because
> it's not enough for us to see that your scenario is possible. Current
> implementation implicitly assumes some hardware behavior, and does not work
> for the case which never happens under the assumption.
> 
> Do you have some test cases to reproduce any specific issue (like data lost)
> on your system? (If yes, please share it.) Or your concern is from code review?

Hope this draft test code will be helpful, Thanks


-- 
Best Regards!

Aili Yao
#define _GNU_SOURCE
#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <hugetlbfs.h>
#include <semaphore.h>
#include <dirent.h>
#include <sys/mman.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <setjmp.h>
#include <signal.h>
#include <sched.h>


char empty[1024*1024*2] = {1};
sigjmp_buf recover;

/*
 * Set early/late kill mode for hwpoison memory corruption.
 * This influences when the process gets killed on a memory corruption.
 */
#define PR_MCE_KILL     33
# define PR_MCE_KILL_CLEAR   0
# define PR_MCE_KILL_SET     1

# define PR_MCE_KILL_LATE    0
# define PR_MCE_KILL_EARLY   1
# define PR_MCE_KILL_DEFAULT 2

#define PR_MCE_KILL_GET 34

sem_t sem_id; 
sem_t sem_id_triggrer; 
int global_fd = -1;
int * main_addr = NULL;


#define PAGE_SHIFT 21
#define PAGEMAP_LENGTH 8

#define handle_error_en(en, msg) \
       do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)

#define handle_error(msg) \
       do { perror(msg); exit(EXIT_FAILURE); } while (0)

struct thread_info {    /* Used as argument to thread_start() */
   pthread_t thread_id;        /* ID returned by pthread_create() */
   int       thread_num;       /* Application-defined thread # */
   char     *argv_string;      /* From command-line argument */
};

unsigned long get_page_frame_number_of_address(void *addr) {
   // Open the pagemap file for the current process
   FILE *pagemap = fopen("/proc/self/pagemap", "rb");

   // Seek to the page that the buffer is on
   unsigned long offset = (unsigned long)addr / getpagesize() * PAGEMAP_LENGTH;
   if(fseek(pagemap, (unsigned long)offset, SEEK_SET) != 0) {
      fprintf(stderr, "Failed to seek pagemap to proper location\n");
      exit(1);
   }

   // The page frame number is in bits 0-54 so read the first 7 bytes and clear the 55th bit
   unsigned long page_frame_number = 0;
   fread(&page_frame_number, 1, PAGEMAP_LENGTH-1, pagemap);

   page_frame_number &= 0x7FFFFFFFFFFFFF;

   fclose(pagemap);

   return page_frame_number;
}

void sighandler(int sig, siginfo_t *si, void *arg)
{
	printf("signal %d code %d addr %p\n", sig, si->si_code, si->si_addr);
	siglongjmp(recover, 1);
}

void *alloc_filebacked_page(char *filepath, int private, int *fd)
{
	int i, sum,mapflag = MAP_SHARED;
	int *addr;

	if(*fd == -1){
		if ((*fd = open(filepath, O_RDWR|O_CREAT, 0644)) < 0) {
			perror("open");
			return NULL;
		}
	}

	if(private)
		write(*fd, empty, sizeof(empty));

	if ((addr = mmap(0, getpagesize(),PROT_READ|PROT_WRITE, mapflag, *fd, 0)) == MAP_FAILED) {
		perror("mmap");
		return NULL;
	}
	for (i = 0; i < getpagesize()/4; i = i + 4)
	{
		sum = addr[i];
	}
	return addr;
}

void *alloc_anony_page(void)
{
	int i;
	void *addr;

	if ((addr = mmap(0, getpagesize(),PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_LOCKED, 0, 0)) == MAP_FAILED) {
		perror("mmap");
		//unlink(filepath);
		return NULL;
	}
	memcpy(addr, empty, 4096);
	return addr;
}

struct sigaction sa = {
	.sa_sigaction = sighandler,
	.sa_flags = SA_SIGINFO
};

static void * thread_start(void *arg)
{
	struct thread_info *tinfo = arg;
	char *uargv;
	char filepath[32] = {0};
	int i,sum,fd = -1;
	int *addr = main_addr;
	unsigned int page_frame_number;
	pid_t pid = getpid();
	pid_t ppid = getppid();
	char file_name[20] ={0};	
	
	printf("Thread %d %d; argv_string=%s\n",pid, ppid , tinfo->argv_string);
	uargv = strdup(tinfo->argv_string);
	if (uargv == NULL)
		handle_error("strdup");
	
	if(!strcmp(uargv,"test"))
	{
		printf("test : Page frame: 0x%x \n",addr);
		while(1)
		{
			for (i = 0; i < getpagesize()/4; i+=4)
			{
				sum = ((volatile int *)addr)[i];
				((volatile int *)addr)[i] = i;
			}
			usleep(5000);
		}
	}else{
		printf("%s Page frame: 0x%x \n", uargv, addr);
		sigaction(SIGBUS, &sa, NULL);
		if(sigsetjmp(recover,1)){
			exit;
		}
		while(1)
			sleep(1);
	}
	return uargv;
}

int main(int argc, char *argv[])
{
	int i,s, opt, num_threads;
	size_t stack_size;
	void *res;
	int main_fd = -1;
	unsigned int main_page_frame_number;
	pid_t pid,ppid;
    cpu_set_t mask;
    CPU_ZERO(&mask);

	/* The "-s" option specifies a stack size for our threads */

	stack_size = -1;
	while ((opt = getopt(argc, argv, "s:")) != -1) {
		switch (opt) {
		case 's':
			if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0, 0) < 0)
				err("PR_MCE_KILL_SET early");
		break;
		default:
		fprintf(stderr, "Usage: %s [-s stack-size] arg...\n", argv[0]);
		exit(EXIT_FAILURE);
		}
	}

	num_threads = argc - optind;

	/* Allocate memory for pthread_create() arguments */
	struct thread_info *tinfo = calloc(num_threads, sizeof(*tinfo));
	if (tinfo == NULL)
		handle_error("calloc");
	
	main_addr = alloc_filebacked_page("./main_test", 1, &global_fd);
	//main_addr = create_buffer();
	pid = getpid();
	ppid = getppid();
	
#if 1

#endif


	main_page_frame_number = get_page_frame_number_of_address((void *)(&(main_addr[0])));
	printf("pid:%d ppid:%d  Page frame: 0x%x 0x%x \n", pid,ppid, main_page_frame_number, (void *)(&(main_addr[0])));

	for (int tnum = 0; tnum < num_threads; tnum++) {
		tinfo[tnum].thread_num = tnum + 1;
		tinfo[tnum].argv_string = argv[optind + tnum];
		pid = fork();  
		
		if(pid<0)	  
			printf("error in fork!\n"); 	
		
		else if(pid == 0)	  
		{	  
			prctl(PR_SET_NAME, tinfo[tnum].argv_string);
			CPU_SET(tinfo[tnum].thread_num, &mask);
			if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
		    {
		        return -1;
		    }
			thread_start(&tinfo[tnum]);
		}	  
		else   
		{	  
			//printf("I am the parent process,ID is %d\n",getpid());   
		}	  

	}

	while(1)
		sleep(1);
	
	free(tinfo);
	exit(EXIT_SUCCESS);
}

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux