Re: [PATCH] libceph: use ceph_kvmalloc() for osdmap arrays

On Tue, 2019-09-10 at 21:41 +0200, Ilya Dryomov wrote:
> osdmap has a bunch of arrays that grow linearly with the number of
> OSDs.  osd_state, osd_weight and osd_primary_affinity take 4 bytes per
> OSD.  osd_addr takes 136 bytes per OSD because of sockaddr_storage.
> The CRUSH workspace area also grows linearly with the number of OSDs.
> 
> Normally these arrays are allocated at client startup.  The osdmap is
> usually updated in small incrementals, but once in a while a full map
> may need to be processed.  For a cluster with 10000 OSDs, this means
> a bunch of 40K allocations followed by a 1.3M allocation, all of which
> are currently required to be physically contiguous.  This results in
> sporadic ENOMEM errors, hanging the client.
> 
> Go back to manually (re)allocating arrays and use ceph_kvmalloc() to
> fall back to non-contiguous allocation when necessary.
> 
> Link: https://tracker.ceph.com/issues/40481
> Signed-off-by: Ilya Dryomov <idryomov@xxxxxxxxx>
> ---
>  net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++------------------
>  1 file changed, 43 insertions(+), 26 deletions(-)
> 
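For anyone skimming the diff: ceph_kvmalloc() is the usual "try kmalloc,
fall back to vmalloc" style helper, so these arrays no longer have to be
physically contiguous (and they get freed with kvfree(), which handles
either case, hence the kfree() -> kvfree() changes below).  A rough
sketch of that pattern, purely for illustration; this is not necessarily
the exact ceph_kvmalloc() body and the function name here is made up:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Illustration only, not the real ceph_kvmalloc(). */
static void *kvmalloc_fallback_sketch(size_t size, gfp_t flags)
{
	void *p = NULL;

	/*
	 * Try a physically contiguous allocation first, but only for
	 * non-costly sizes, and don't warn if it fails.
	 */
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
		p = kmalloc(size, flags | __GFP_NOWARN);

	/* Otherwise settle for virtually contiguous memory. */
	if (!p)
		p = __vmalloc(size, flags, PAGE_KERNEL);

	return p;
}
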
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 90437906b7bc..4e0de14f80bb 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
>  				 struct ceph_pg_pool_info, node);
>  		__remove_pg_pool(&map->pg_pools, pi);
>  	}
> -	kfree(map->osd_state);
> -	kfree(map->osd_weight);
> -	kfree(map->osd_addr);
> -	kfree(map->osd_primary_affinity);
> -	kfree(map->crush_workspace);
> +	kvfree(map->osd_state);
> +	kvfree(map->osd_weight);
> +	kvfree(map->osd_addr);
> +	kvfree(map->osd_primary_affinity);
> +	kvfree(map->crush_workspace);
>  	kfree(map);
>  }
>  
> @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
>   *
>   * The new elements are properly initialized.
>   */
> -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
> +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
>  {
>  	u32 *state;
>  	u32 *weight;
>  	struct ceph_entity_addr *addr;
> +	u32 to_copy;
>  	int i;
>  
> -	state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
> -	if (!state)
> -		return -ENOMEM;
> -	map->osd_state = state;
> +	dout("%s old %u new %u\n", __func__, map->max_osd, max);
> +	if (max == map->max_osd)
> +		return 0;
>  
> -	weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
> -	if (!weight)
> +	state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
> +	weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
> +	addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);

Is GFP_NOFS sufficient here, given that this may be called from rbd?
Should we be using NOIO instead (or maybe the PF_MEMALLOC_* equivalent)?
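
By the PF_MEMALLOC_* equivalent I mean the scope API in
<linux/sched/mm.h>: memalloc_noio_save()/memalloc_noio_restore() set
PF_MEMALLOC_NOIO on the task so that nested allocations behave as
GFP_NOIO, and memalloc_nofs_save()/memalloc_nofs_restore() do the same
for PF_MEMALLOC_NOFS.  A very rough sketch of what that could look like
around the decode path; the wrapper below is hypothetical, not something
in this patch:

#include <linux/sched/mm.h>
#include <linux/ceph/osdmap.h>

/* Hypothetical wrapper, for illustration only. */
static struct ceph_osdmap *osdmap_decode_noio_sketch(void **p, void *end)
{
	struct ceph_osdmap *map;
	unsigned int noio_flag;

	noio_flag = memalloc_noio_save();	/* sets PF_MEMALLOC_NOIO */
	map = ceph_osdmap_decode(p, end);	/* allocations act as GFP_NOIO */
	memalloc_noio_restore(noio_flag);

	return map;
}

That would let the allocations in osdmap_set_max_osd() stay GFP_KERNEL
and push the NOIO restriction out to the callers that actually sit in
the I/O path.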

> +	if (!state || !weight || !addr) {
> +		kvfree(state);
> +		kvfree(weight);
> +		kvfree(addr);
>  		return -ENOMEM;
> -	map->osd_weight = weight;
> +	}
>  
> -	addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
> -	if (!addr)
> -		return -ENOMEM;
> -	map->osd_addr = addr;
> +	to_copy = min(map->max_osd, max);
> +	if (map->osd_state) {
> +		memcpy(state, map->osd_state, to_copy * sizeof(*state));
> +		memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
> +		memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
> +		kvfree(map->osd_state);
> +		kvfree(map->osd_weight);
> +		kvfree(map->osd_addr);
> +	}
>  
> +	map->osd_state = state;
> +	map->osd_weight = weight;
> +	map->osd_addr = addr;
>  	for (i = map->max_osd; i < max; i++) {
>  		map->osd_state[i] = 0;
>  		map->osd_weight[i] = CEPH_OSD_OUT;
> @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
>  	if (map->osd_primary_affinity) {
>  		u32 *affinity;
>  
> -		affinity = krealloc(map->osd_primary_affinity,
> -				    max*sizeof(*affinity), GFP_NOFS);
> +		affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
> +					 GFP_NOFS);
>  		if (!affinity)
>  			return -ENOMEM;
> -		map->osd_primary_affinity = affinity;
>  
> +		memcpy(affinity, map->osd_primary_affinity,
> +		       to_copy * sizeof(*affinity));
> +		kvfree(map->osd_primary_affinity);
> +
> +		map->osd_primary_affinity = affinity;
>  		for (i = map->max_osd; i < max; i++)
>  			map->osd_primary_affinity[i] =
>  			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
> @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
>  
>  	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
>  	dout("%s work_size %zu bytes\n", __func__, work_size);
> -	workspace = kmalloc(work_size, GFP_NOIO);
> +	workspace = ceph_kvmalloc(work_size, GFP_NOIO);
>  	if (!workspace) {
>  		crush_destroy(crush);
>  		return -ENOMEM;
> @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
>  
>  	if (map->crush)
>  		crush_destroy(map->crush);
> -	kfree(map->crush_workspace);
> +	kvfree(map->crush_workspace);
>  	map->crush = crush;
>  	map->crush_workspace = workspace;
>  	return 0;
> @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
>  	if (!map->osd_primary_affinity) {
>  		int i;
>  
> -		map->osd_primary_affinity = kmalloc_array(map->max_osd,
> -							  sizeof(u32),
> -							  GFP_NOFS);
> +		map->osd_primary_affinity = ceph_kvmalloc(
> +		    array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
> +		    GFP_NOFS);
>  		if (!map->osd_primary_affinity)
>  			return -ENOMEM;
>  
> @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end,
>  
>  	ceph_decode_32_safe(p, end, len, e_inval);
>  	if (len == 0) {
> -		kfree(map->osd_primary_affinity);
> +		kvfree(map->osd_primary_affinity);
>  		map->osd_primary_affinity = NULL;
>  		return 0;
>  	}

-- 
Jeff Layton <jlayton@xxxxxxxxxx>