Re: [PATCH v2 2/5] crypto/pool: Add crypto_pool_reserve_scratch()

Jakub Kicinski <kuba@xxxxxxxxxx> · Fri, 6 Jan 2023 18:04:27 -0800

On Tue,  3 Jan 2023 18:42:54 +0000 Dmitry Safonov wrote:
> Instead of having build-time hardcoded constant, reallocate scratch
> area, if needed by user. Different algos, different users may need
> different size of temp per-CPU buffer. Only up-sizing supported for
> simplicity.

> -static int crypto_pool_scratch_alloc(void)
> +/* Slow-path */
> +/**
> + * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
> + * @size: request size for the scratch/temp buffer
> + */
> +int crypto_pool_reserve_scratch(unsigned long size)

Does this have to be a separate call? Can't we make it part of 
the pool allocation? AFAICT the scratch gets freed when last
pool is freed, so the user needs to know to allocate the pool
_first_ otherwise there's a potential race:

 CPU 1               CPU 2

 alloc pool
                    set scratch
 free pool
 [frees scratch]
                    alloc pool

>  {
> -	int cpu;
> -
> -	lockdep_assert_held(&cpool_mutex);
> +#define FREE_BATCH_SIZE		64
> +	void *free_batch[FREE_BATCH_SIZE];
> +	int cpu, err = 0;
> +	unsigned int i = 0;
>  
> +	mutex_lock(&cpool_mutex);
> +	if (size == scratch_size) {
> +		for_each_possible_cpu(cpu) {
> +			if (per_cpu(crypto_pool_scratch, cpu))
> +				continue;
> +			goto allocate_scratch;
> +		}
> +		mutex_unlock(&cpool_mutex);
> +		return 0;
> +	}
> +allocate_scratch:
> +	size = max(size, scratch_size);
> +	cpus_read_lock();
>  	for_each_possible_cpu(cpu) {
> -		void *scratch = per_cpu(crypto_pool_scratch, cpu);
> +		void *scratch, *old_scratch;
>  
> -		if (scratch)
> +		scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
> +		if (!scratch) {
> +			err = -ENOMEM;
> +			break;
> +		}
> +
> +		old_scratch = per_cpu(crypto_pool_scratch, cpu);
> +		/* Pairs with crypto_pool_get() */
> +		WRITE_ONCE(*per_cpu_ptr(&crypto_pool_scratch, cpu), scratch);

You're using RCU for protection here, please use rcu accessors.

> +		if (!cpu_online(cpu)) {
> +			kfree(old_scratch);
>  			continue;
> +		}
> +		free_batch[i++] = old_scratch;
> +		if (i == FREE_BATCH_SIZE) {
> +			cpus_read_unlock();
> +			synchronize_rcu();
> +			while (i > 0)
> +				kfree(free_batch[--i]);
> +			cpus_read_lock();
> +		}

This is a memory allocation routine, can we simplify this by
dynamically sizing "free_batch" and using call_rcu()?

struct humf_blah {
	struct rcu_head rcu;
	unsigned int cnt;
	void *data[];
};

cheezit = kmalloc(struct_size(blah, data, num_possible_cpus()));

for_each ..
	cheezit->data[cheezit->cnt++] = old_scratch;

call_rcu(&cheezit->rcu, my_free_them_scratches)

etc.

Feels like that'd be much less locking, unlocking and general
carefully'ing.

> +	}
> +	cpus_read_unlock();
> +	if (!err)
> +		scratch_size = size;