Re: [RFC 1/2]swap: add a simple buddy allocator

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Shaohua,

Your idea does make sense to me.
Below just a few nitpick.

On Mon, Oct 22, 2012 at 10:30:51AM +0800, Shaohua Li wrote:
> I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30%
> CPU time (when cluster is hard to find), which becomes a bottleneck.
> scan_swap_map() scans a byte array to search a 256 page cluster, which is very
> slow.
> 
> Here I introduced a simple buddy allocator. Since we only care about 256 pages
> cluster, we can just use a counter to implement the buddy allocator. Every 256
> pages use one int to store the counter, so searching cluster is very efficient.
> With this, scap_swap_map() overhead disappears.
> 
> This might help low end SD card swap too. Because if the cluster is aligned, SD
> firmware can do flash erase more efficiently.

Indeed.

> 
> The downside is the cluster must be aligned to 256 pages, which will reduce the
> chance to find a cluster.

It would be not good for roration device. Can't we make it for only discardable
device?

> 
> Signed-off-by: Shaohua Li <shli@xxxxxxxxxxxx>
> ---
>  include/linux/swap.h |    1 
>  mm/swapfile.c        |   74 +++++++++++++++++++++++++++++++++++++++------------
>  2 files changed, 58 insertions(+), 17 deletions(-)
> 
> Index: linux/include/linux/swap.h
> ===================================================================
> --- linux.orig/include/linux/swap.h	2012-10-22 09:20:40.802165198 +0800
> +++ linux/include/linux/swap.h	2012-10-22 09:20:50.462043746 +0800
> @@ -185,6 +185,7 @@ struct swap_info_struct {
>  	signed char	next;		/* next type on the swap list */
>  	unsigned int	max;		/* extent of the swap_map */
>  	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
> +	unsigned int *swap_cluster_count; /* cluster counter */

swap_cluster_count is rather awkward.

How about this?
        unsigned int *inuse_pages_in_cluster; /* used pages in a cluster */

>  	unsigned int lowest_bit;	/* index of first free in swap_map */
>  	unsigned int highest_bit;	/* index of last free in swap_map */
>  	unsigned int pages;		/* total of usable pages of swap */
> Index: linux/mm/swapfile.c
> ===================================================================
> --- linux.orig/mm/swapfile.c	2012-10-22 09:20:40.794165269 +0800
> +++ linux/mm/swapfile.c	2012-10-22 09:21:34.317493506 +0800
> @@ -182,6 +182,22 @@ static int wait_for_discard(void *word)
>  #define SWAPFILE_CLUSTER	256
>  #define LATENCY_LIMIT		256
>  
> +static inline void inc_swap_cluster_count(unsigned int *swap_cluster_count,
> +					unsigned long page_nr)
> +{
> +	swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
> +	VM_BUG_ON(*swap_cluster_count >= SWAPFILE_CLUSTER);
> +	(*swap_cluster_count)++;
> +}

How about this?

static inline void inc_swap_map(struct swap_info_struct *si, unsigned long offset)
{
        int cluster_idx = offset / SWAPFILE_CLUSTER;
        si->inuse_pages_in_cluster[cluster_idx]++;
}

> +
> +static inline void dec_swap_cluster_count(unsigned int *swap_cluster_count,
> +					unsigned long page_nr)
> +{
> +	swap_cluster_count += page_nr/SWAPFILE_CLUSTER;
> +	VM_BUG_ON(*swap_cluster_count == 0);
> +	(*swap_cluster_count)--;
> +}

static inline void dec_swap_map(struct swap_info_struct *si, unsigned long offset)
{
        int cluster_idx = offset / SWAPFILE_CLUSTER;
        si->inuse_pages_in_cluster[cluster_idx]--;
}

> +
>  static unsigned long scan_swap_map(struct swap_info_struct *si,
>  				   unsigned char usage)
>  {
> @@ -206,6 +222,8 @@ static unsigned long scan_swap_map(struc
>  	scan_base = offset = si->cluster_next;
>  
>  	if (unlikely(!si->cluster_nr--)) {
> +		unsigned long base;
> +
>  		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
>  			si->cluster_nr = SWAPFILE_CLUSTER - 1;
>  			goto checks;
> @@ -235,15 +253,14 @@ static unsigned long scan_swap_map(struc
>  		 */
>  		if (!(si->flags & SWP_SOLIDSTATE))
>  			scan_base = offset = si->lowest_bit;
> -		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>  
> -		/* Locate the first empty (unaligned) cluster */
> -		for (; last_in_cluster <= si->highest_bit; offset++) {
> -			if (si->swap_map[offset])
> -				last_in_cluster = offset + SWAPFILE_CLUSTER;
> -			else if (offset == last_in_cluster) {
> +		/* Locate the first empty (aligned) cluster */
> +		for (base = round_up(offset, SWAPFILE_CLUSTER);
> +		     base <= si->highest_bit; base += SWAPFILE_CLUSTER) {
> +			if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
> +				offset = base;
> +				last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>  				spin_lock(&swap_lock);
> -				offset -= SWAPFILE_CLUSTER - 1;
>  				si->cluster_next = offset;
>  				si->cluster_nr = SWAPFILE_CLUSTER - 1;
>  				found_free_cluster = 1;
> @@ -256,15 +273,14 @@ static unsigned long scan_swap_map(struc
>  		}
>  
>  		offset = si->lowest_bit;
> -		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>  
> -		/* Locate the first empty (unaligned) cluster */
> -		for (; last_in_cluster < scan_base; offset++) {
> -			if (si->swap_map[offset])
> -				last_in_cluster = offset + SWAPFILE_CLUSTER;
> -			else if (offset == last_in_cluster) {
> +		/* Locate the first empty (aligned) cluster */
> +		for (base = round_up(offset, SWAPFILE_CLUSTER);
> +		     base < scan_base; base += SWAPFILE_CLUSTER) {
> +			if (!si->swap_cluster_count[base/SWAPFILE_CLUSTER]) {
> +				offset = base;
> +				last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
>  				spin_lock(&swap_lock);
> -				offset -= SWAPFILE_CLUSTER - 1;
>  				si->cluster_next = offset;
>  				si->cluster_nr = SWAPFILE_CLUSTER - 1;
>  				found_free_cluster = 1;
> @@ -315,6 +331,7 @@ checks:
>  		si->highest_bit = 0;
>  	}
>  	si->swap_map[offset] = usage;
> +	inc_swap_cluster_count(si->swap_cluster_count, offset);
>  	si->cluster_next = offset + 1;
>  	si->flags -= SWP_SCANNING;
>  
> @@ -549,6 +566,7 @@ static unsigned char swap_entry_free(str
>  
>  	/* free if no reference */
>  	if (!usage) {
> +		dec_swap_cluster_count(p->swap_cluster_count, offset);
>  		if (offset < p->lowest_bit)
>  			p->lowest_bit = offset;
>  		if (offset > p->highest_bit)
> @@ -1445,6 +1463,7 @@ static int setup_swap_extents(struct swa
>  
>  static void enable_swap_info(struct swap_info_struct *p, int prio,
>  				unsigned char *swap_map,
> +				unsigned int *swap_cluster_count,
>  				unsigned long *frontswap_map)
>  {
>  	int i, prev;
> @@ -1455,6 +1474,7 @@ static void enable_swap_info(struct swap
>  	else
>  		p->prio = --least_priority;
>  	p->swap_map = swap_map;
> +	p->swap_cluster_count = swap_cluster_count;
>  	frontswap_map_set(p, frontswap_map);
>  	p->flags |= SWP_WRITEOK;
>  	nr_swap_pages += p->pages;
> @@ -1480,6 +1500,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
>  {
>  	struct swap_info_struct *p = NULL;
>  	unsigned char *swap_map;
> +	unsigned int *swap_cluster_count;
>  	struct file *swap_file, *victim;
>  	struct address_space *mapping;
>  	struct inode *inode;
> @@ -1556,7 +1577,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
>  		 * sys_swapoff for this swap_info_struct at this point.
>  		 */
>  		/* re-insert swap space back into swap_list */
> -		enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
> +		enable_swap_info(p, p->prio, p->swap_map, p->swap_cluster_count,
> +				frontswap_map_get(p));
>  		goto out_dput;
>  	}
>  
> @@ -1581,11 +1603,14 @@ SYSCALL_DEFINE1(swapoff, const char __us
>  	p->max = 0;
>  	swap_map = p->swap_map;
>  	p->swap_map = NULL;
> +	swap_cluster_count = p->swap_cluster_count;
> +	p->swap_cluster_count = NULL;
>  	p->flags = 0;
>  	frontswap_invalidate_area(type);
>  	spin_unlock(&swap_lock);
>  	mutex_unlock(&swapon_mutex);
>  	vfree(swap_map);
> +	vfree(swap_cluster_count);
>  	vfree(frontswap_map_get(p));
>  	/* Destroy swap account informatin */
>  	swap_cgroup_swapoff(type);
> @@ -1896,6 +1921,7 @@ static unsigned long read_swap_header(st
>  static int setup_swap_map_and_extents(struct swap_info_struct *p,
>  					union swap_header *swap_header,
>  					unsigned char *swap_map,
> +					unsigned int *swap_cluster_count,
>  					unsigned long maxpages,
>  					sector_t *span)
>  {
> @@ -1912,11 +1938,16 @@ static int setup_swap_map_and_extents(st
>  		if (page_nr < maxpages) {
>  			swap_map[page_nr] = SWAP_MAP_BAD;
>  			nr_good_pages--;
> +			inc_swap_cluster_count(swap_cluster_count, page_nr);
>  		}
>  	}
>  
> +	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
> +		inc_swap_cluster_count(swap_cluster_count, i);
> +
>  	if (nr_good_pages) {
>  		swap_map[0] = SWAP_MAP_BAD;
> +		inc_swap_cluster_count(swap_cluster_count, 0);
>  		p->max = maxpages;
>  		p->pages = nr_good_pages;
>  		nr_extents = setup_swap_extents(p, span);
> @@ -1946,6 +1977,7 @@ SYSCALL_DEFINE2(swapon, const char __use
>  	sector_t span;
>  	unsigned long maxpages;
>  	unsigned char *swap_map = NULL;
> +	unsigned int *swap_cluster_count = NULL;
>  	unsigned long *frontswap_map = NULL;
>  	struct page *page = NULL;
>  	struct inode *inode = NULL;
> @@ -2020,12 +2052,19 @@ SYSCALL_DEFINE2(swapon, const char __use
>  		goto bad_swap;
>  	}
>  
> +	swap_cluster_count = vzalloc(DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER) *
> +					sizeof(int));
> +	if (!swap_cluster_count) {
> +		error = -ENOMEM;
> +		goto bad_swap;
> +	}
> +
>  	error = swap_cgroup_swapon(p->type, maxpages);
>  	if (error)
>  		goto bad_swap;
>  
>  	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
> -		maxpages, &span);
> +		swap_cluster_count, maxpages, &span);
>  	if (unlikely(nr_extents < 0)) {
>  		error = nr_extents;
>  		goto bad_swap;
> @@ -2048,7 +2087,7 @@ SYSCALL_DEFINE2(swapon, const char __use
>  	if (swap_flags & SWAP_FLAG_PREFER)
>  		prio =
>  		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
> -	enable_swap_info(p, prio, swap_map, frontswap_map);
> +	enable_swap_info(p, prio, swap_map, swap_cluster_count, frontswap_map);
>  
>  	printk(KERN_INFO "Adding %uk swap on %s.  "
>  			"Priority:%d extents:%d across:%lluk %s%s%s\n",
> @@ -2078,6 +2117,7 @@ bad_swap:
>  	p->flags = 0;
>  	spin_unlock(&swap_lock);
>  	vfree(swap_map);
> +	vfree(swap_cluster_count);
>  	if (swap_file) {
>  		if (inode && S_ISREG(inode->i_mode)) {
>  			mutex_unlock(&inode->i_mutex);
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]