Re: Ext4 mballoc behavior with mb_optimize_scan=1

Jan Kara <jack@xxxxxxx> · Thu, 28 Jul 2022 17:18:27 +0200

Hello,

attached is the C program I use for reproducing the mballoc performance issue.
I run it as:

stress-unlink -s -c 100 -f 22528 16 0 /mnt

								Honza

On Wed 27-07-22 12:51:23, Jan Kara wrote:
> Hello,
> 
> before going on vacation I was tracking down why reaim benchmark regresses
> (10-20%) with larger number of processes with the new mb_optimize_scan
> strategy of mballoc. After a while I have reproduced the regression with a
> simple benchmark that just creates, fsyncs, and deletes lots of small files
> (22k) from 16 processes, each process has its own directory. The immediate
> reason for the slow down is that with mb_optimize_scan=1 the file blocks
> are spread among more block groups and thus we have more bitmaps to update
> in each transaction. 
> 
> So the question is why mballoc with mb_optimize_scan=1 spreads allocations
> more among block groups. The situation is somewhat obscured by group
> preallocation feature of mballoc where each *CPU* holds a preallocation and
> small (below 64k) allocations on that CPU are allocated from this
> preallocation. If I trace creating of these group preallocations I can see
> that the block groups they are taken from look like:
> 
> mb_optimize_scan=0:
> 49 81 113 97 17 33 113 49 81 33 97 113 81 1 17 33 33 81 1 113 97 17 113 113
> 33 33 97 81 49 81 17 49
> 
> mb_optimize_scan=1:
> 127 126 126 125 126 127 125 126 127 124 123 124 122 122 121 120 119 118 117
> 116 115 116 114 113 111 110 109 108 107 106 105 104 104
> 
> So we can see that while with mb_optimize_scan=0 the preallocation is
> always take from one of a few groups (among which we jump mostly randomly)
> which mb_optimize_scan=1 we consistently drift from higher block groups to
> lower block groups.
> 
> The mb_optimize_scan=0 behavior is given by the fact that search for free
> space always starts in the same block group where the inode is allocated
> and the inode is always allocated in the same block group as its parent
> directory. So the create-delete benchmark generally keeps all inodes for
> one process in the same block group and thus allocations are always
> starting in that block group. Because files are small, we always succeed in
> finding free space in the starting block group and thus allocations are
> generally restricted to the several block groups where parent directories
> were originally allocated.
> 
> With mb_optimize_scan=1 the block group to allocate from is selected by
> ext4_mb_choose_next_group_cr0() so in this mode we completely ignore the
> "pack inode with data in the same group" rule. The reason why we keep
> drifting among block groups is that whenever free space in a block group is
> updated (blocks allocated / freed) we recalculate largest free order (see
> mb_mark_used() and mb_free_blocks()) and as a side effect that removes
> group from the bb_largest_free_order_node list and reinserts the group at
> the tail.
> 
> I have two questions about the mb_optimize_scan=1 strategy:
> 
> 1) Shouldn't we respect the initial goal group and try to allocate from it
> in ext4_mb_regular_allocator() before calling ext4_mb_choose_next_group()?
> 
> 2) The rotation of groups in mb_set_largest_free_order() seems a bit
> undesirable to me. In particular it seems pointless if the largest free
> order does not change. Was there some rationale behind it?
> 
> 								Honza
> -- 
> Jan Kara <jack@xxxxxxxx>
> SUSE Labs, CR
-- 
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>

#define COUNT 5000
#define SIZE 16384
#define MAX_PROCS 1024

char shm_name[64];
char *wbuf;
int call_sync, use_falloc;
int count = COUNT;
int size = SIZE;

void prepare(char *base, int num)
{
	char dir[128];

	sprintf(dir, "%s/dir-%d", base, num);
	if (mkdir(dir, 0700) < 0 && errno != EEXIST) {
		perror("mkdir");
		exit(1);
	}
}

void teardown(char *base, int num)
{
	char dir[128];

	sprintf(dir, "%s/dir-%d", base, num);
	rmdir(dir);
}

void run_test(char *base, int num)
{
	char name[128];
	int shmfd;
	int i;
	int *shm;
	int fd;

	sprintf(name, "%s/dir-%d/file", base, num);
	shmfd = shm_open(shm_name, O_RDWR, 0);
	if (shmfd < 0) {
		perror("shm_open");
		exit(1);
	}
	shm = mmap(NULL, sizeof(int) * MAX_PROCS, PROT_READ | PROT_WRITE,
		   MAP_SHARED, shmfd, 0);
	if (shm == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	shm[num + 1] = 1;
	while (shm[0] == 0)
		usleep(1);

	for (i = 0; i < count; i++) {
		fd = open(name, O_CREAT | O_WRONLY, 0644);
		if (fd < 0) {
			perror("open");
			exit(1);
		}
		if (use_falloc) {
			if (fallocate(fd, 0, 0, size) < 0) {
				perror("fallocate");
				exit(1);
			}
		} else {
			if (write(fd, wbuf, size) != size) {
				perror("write");
				exit(1);
			}
		}
		if (call_sync)
			fsync(fd);
		close(fd);
		unlink(name);
	}
}

pid_t start_flusher(char *base, int num)
{
	int fd;
	char name[128];
	pid_t pid;

	pid = fork();
	if (pid < 0) {
		perror("fork");
		return -1;
	}
	if (pid > 0)
		return pid;
	sprintf(name, "%s/sync-file-%d", base, num);
	fd = open(name, O_CREAT | O_TRUNC | O_WRONLY, 0644);
	if (fd < 0) {
		perror("open");
		exit(1);
	}
	while (1) {
		if (pwrite(fd, wbuf, size, 0) != size) {
			perror("pwrite");
			exit(1);
		}
		fsync(fd);
	}
}

int main(int argc, char **argv)
{
	int procs, sync_procs, i, j;
	pid_t pids[MAX_PROCS];
	pid_t sync_pids[MAX_PROCS];
	int shmfd;
	int *shm;
	struct timeval start, end;
	long long ms;
	int opt;
	char *path;

	while ((opt = getopt(argc, argv, "sc:f:a")) != -1) {
		switch (opt) {
		case 's':
			call_sync = 1;
			break;
		case 'a':
			use_falloc = 1;
			break;
		case 'c':
			count = atoi(optarg);
			break;
		case 'f':
			size = atoi(optarg);
			break;
		default:
usage:
			fprintf(stderr, "Usage: stress-unlink [-s] [-c <count>] [-f <filesize>] <processes> <flushers> <dir>\n");
			return 1;
		}
	}
	if (argc - optind != 3)
		goto usage;

	procs = strtol(argv[optind++], NULL, 10);
	if (procs > MAX_PROCS) {
		fprintf(stderr, "Too many processes!\n");
		return 1;
	}
	sync_procs = strtol(argv[optind++], NULL, 10);
	if (sync_procs > MAX_PROCS) {
		fprintf(stderr, "Too many flusher processes!\n");
		return 1;
	}
	path = argv[optind];

	wbuf = malloc(size);
	memset(wbuf, 0xcc, size);
	sprintf(shm_name, "/stress-unlink-%u", getpid());
	shmfd = shm_open(shm_name, O_CREAT | O_RDWR, 0600);
	if (shmfd < 0) {
		perror("shm_open");
		return 1;
	}
	if (ftruncate(shmfd, sizeof(int) * MAX_PROCS) < 0) {
		perror("ftruncate shm");
		return 1;
	}
	shm = mmap(NULL, sizeof(int) * MAX_PROCS, PROT_READ | PROT_WRITE,
		   MAP_SHARED, shmfd, 0);
	if (shm == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	shm[0] = 0;
	for (i = 0; i < procs; i++) {
		shm[i+1] = 0;
		prepare(path, i);
	}

	for (i = 0; i < procs; i++) {
		pids[i] = fork();
		if (pids[i] < 0) {
			perror("fork");
			for (j = 0; j < i; j++)
				kill(pids[j], SIGKILL);
			return 1;
		}
		if (pids[i] == 0) {
			run_test(path, i);
			exit(0);
		}
	}

	for (i = 0; i < sync_procs; i++) {
		sync_pids[i] = start_flusher(path, i);
		if (sync_pids[i] < 0) {
			for (j = 0; j < procs; j++)
				kill(pids[j], SIGKILL);
			for (j = 0; j < i; i++)
				kill(sync_pids[i], SIGKILL);
			exit(1);
		}
	}

	do {
		for (i = 0; i < procs && shm[i + 1]; i++);
	} while (i != procs);
	gettimeofday(&start, NULL);
	shm[0] = 1;
	fprintf(stderr, "Processes started.\n");

	for (i = 0; i < procs; i++)
		waitpid(pids[i], NULL, 0);
	gettimeofday(&end, NULL);
	for (i = 0; i < procs; i++)
		teardown(path, i);
	shm_unlink(shm_name);
	for (i = 0; i < sync_procs; i++)
		kill(sync_pids[i], SIGKILL);
	for (i = 0; i < sync_procs; i++)
		waitpid(sync_pids[i], NULL, 0);
	ms = (((long long)(end.tv_sec - start.tv_sec) * 1000000) +
		(end.tv_usec - start.tv_usec)) / 1000;
	printf("%lld.%03lld\n", ms/1000, ms%1000);

	return 0;
}