On Sat, Sep 25, 2010 at 12:27:35PM +0800, xiaohui.xin@xxxxxxxxx wrote:
> From: Xin Xiaohui <xiaohui.xin@xxxxxxxxx>
>
> The patch add two ioctls for mp device.
> One is for userspace to query how much memory locked to make mp device
> run smoothly. Another one is for userspace to set how much meory locked
> it really wants.
>
> ---
>  drivers/vhost/mpassthru.c | 103 +++++++++++++++++++++++----------------------
>  include/linux/mpassthru.h |   2 +
>  2 files changed, 54 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
> index d86d94c..e3a0199 100644
> --- a/drivers/vhost/mpassthru.c
> +++ b/drivers/vhost/mpassthru.c
> @@ -67,6 +67,8 @@ static int debug;
>  #define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
>  #define COPY_HDR_LEN (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
>
> +#define DEFAULT_NEED ((8192*2*2)*4096)
> +
>  struct frag {
>  	u16 offset;
>  	u16 size;
> @@ -110,7 +112,8 @@ struct page_ctor {
>  	int rq_len;
>  	spinlock_t read_lock;

Try documenting which fields are protected by which lock, btw.

>  	/* record the locked pages */
> -	int lock_pages;
> +	int locked_pages;
> +	int cur_pages;
>  	struct rlimit o_rlim;

Unused now?

>  	struct net_device *dev;
>  	struct mpassthru_port port;

This structure name should start with mp_ to avoid namespace pollution.
Also, ctor implies a constructor function (see pgtable_page_ctor), so it
is not a very good name for a structure.

> @@ -122,6 +125,7 @@ struct mp_struct {
>  	struct net_device *dev;
>  	struct page_ctor *ctor;
>  	struct socket socket;
> +	struct task_struct *user;
>
>  #ifdef MPASSTHRU_DEBUG
>  	int debug;
> @@ -231,7 +235,8 @@ static int page_ctor_attach(struct mp_struct *mp)
>  	ctor->port.ctor = page_ctor;
>  	ctor->port.sock = &mp->socket;
>  	ctor->port.hash = mp_lookup;
> -	ctor->lock_pages = 0;
> +	ctor->locked_pages = 0;
> +	ctor->cur_pages = 0;
>
>  	/* locked by mp_mutex */
>  	dev->mp_port = &ctor->port;
> @@ -264,37 +269,6 @@ struct page_info *info_dequeue(struct page_ctor *ctor)
>  	return info;
>  }
>
> -static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
> -		unsigned long cur, unsigned long max)
> -{
> -	struct rlimit new_rlim, *old_rlim;
> -	int retval;
> -
> -	if (resource != RLIMIT_MEMLOCK)
> -		return -EINVAL;
> -	new_rlim.rlim_cur = cur;
> -	new_rlim.rlim_max = max;
> -
> -	old_rlim = current->signal->rlim + resource;
> -
> -	/* remember the old rlimit value when backend enabled */
> -	ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
> -	ctor->o_rlim.rlim_max = old_rlim->rlim_max;
> -
> -	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
> -			!capable(CAP_SYS_RESOURCE))
> -		return -EPERM;
> -
> -	retval = security_task_setrlimit(resource, &new_rlim);
> -	if (retval)
> -		return retval;
> -
> -	task_lock(current->group_leader);
> -	*old_rlim = new_rlim;
> -	task_unlock(current->group_leader);
> -	return 0;
> -}
> -
>  static void relinquish_resource(struct page_ctor *ctor)
>  {
>  	if (!(ctor->dev->flags & IFF_UP) &&
> @@ -323,7 +297,7 @@ static void mp_ki_dtor(struct kiocb *iocb)
>  	} else
>  		info->ctor->wq_len--;
>  	/* Decrement the number of locked pages */
> -	info->ctor->lock_pages -= info->pnum;
> +	info->ctor->cur_pages -= info->pnum;
>  	kmem_cache_free(ext_page_info_cache, info);
>  	relinquish_resource(info->ctor);
>
> @@ -357,6 +331,7 @@ static int page_ctor_detach(struct mp_struct *mp)
>  {
>  	struct page_ctor *ctor;
>  	struct page_info *info;
> +	struct task_struct *tsk = mp->user;
>  	int i;
>
>  	/* locked by mp_mutex */
> @@ -375,9 +350,9 @@ static int page_ctor_detach(struct mp_struct *mp)
>
>  	relinquish_resource(ctor);
>
> -	set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
> -			   ctor->o_rlim.rlim_cur,
> -			   ctor->o_rlim.rlim_max);
> +	down_write(&tsk->mm->mmap_sem);
> +	tsk->mm->locked_vm -= ctor->locked_pages;
> +	up_write(&tsk->mm->mmap_sem);
>
>  	/* locked by mp_mutex */
>  	ctor->dev->mp_port = NULL;
> @@ -514,7 +489,6 @@ static struct page_info *mp_hash_delete(struct page_ctor *ctor,
>  {
>  	key_mp_t key = mp_hash(info->pages[0], HASH_BUCKETS);
>  	struct page_info *tmp = NULL;
> -	int i;
>
>  	tmp = ctor->hash_table[key];
>  	while (tmp) {
> @@ -565,14 +539,11 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
>  	int rc;
>  	int i, j, n = 0;
>  	int len;
> -	unsigned long base, lock_limit;
> +	unsigned long base;
>  	struct page_info *info = NULL;
>
> -	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
> -	lock_limit >>= PAGE_SHIFT;
> -
> -	if (ctor->lock_pages + count > lock_limit && npages) {
> -		printk(KERN_INFO "exceed the locked memory rlimit.");
> +	if (ctor->cur_pages + count > ctor->locked_pages) {
> +		printk(KERN_INFO "Exceed memory lock rlimt.");
>  		return NULL;
>  	}
>
> @@ -634,7 +605,7 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
>  		mp_hash_insert(ctor, info->pages[i], info);
>  	}
>  	/* increment the number of locked pages */
> -	ctor->lock_pages += j;
> +	ctor->cur_pages += j;
>  	return info;
>
>  failed:
> @@ -1006,12 +977,6 @@ proceed:
>  		count--;
>  	}
>
> -	if (!ctor->lock_pages || !ctor->rq_len) {
> -		set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
> -				   iocb->ki_user_data * 4096 * 2,
> -				   iocb->ki_user_data * 4096 * 2);
> -	}
> -
>  	/* Translate address to kernel */
>  	info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
>  	if (!info)
> @@ -1115,8 +1080,10 @@ static long mp_chr_ioctl(struct file *file, unsigned int cmd,
>  	struct mp_struct *mp;
>  	struct net_device *dev;
>  	void __user* argp = (void __user *)arg;
> +	unsigned long __user *limitp = argp;
>  	struct ifreq ifr;
>  	struct sock *sk;
> +	unsigned long limit, locked, lock_limit;
>  	int ret;
>
>  	ret = -EINVAL;
> @@ -1152,6 +1119,7 @@ static long mp_chr_ioctl(struct file *file, unsigned int cmd,
>  			goto err_dev_put;
>  		}
>  		mp->dev = dev;
> +		mp->user = current;

This is unsafe: task might go away but fd will still exist. You should
get the mm like vhost does, then you can keep a reference until release.

>  		ret = -ENOMEM;
>
>  		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
> @@ -1193,6 +1161,39 @@ err_dev_put:
>  		ret = do_unbind(mfile);
>  		break;
>
> +	case MPASSTHRU_SET_MEM_LOCKED:
> +		ret = copy_from_user(&limit, limitp, sizeof limit);
> +		if (ret < 0)
> +			return ret;
> +
> +		mp = mp_get(mfile);
> +		if (!mp)
> +			return -ENODEV;
> +
> +		limit = PAGE_ALIGN(limit) >> PAGE_SHIFT;
> +		down_write(&current->mm->mmap_sem);

So here, current might be different from mp->user: many processes might
share an fd. The result will be that you will subtract locked_vm from A
but add it to B. The right thing to do IMO is to store mm on
SET_MEM_LOCKED. Also be careful about multiple callers etc.

> +		locked = limit + current->mm->locked_vm;
> +		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +		if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
> +			up_write(&current->mm->mmap_sem);
> +			mp_put(mfile);
> +			return -ENOMEM;
> +		}
> +		current->mm->locked_vm = locked;
> +		up_write(&current->mm->mmap_sem);
> +
> +		mutex_lock(&mp_mutex);
> +		mp->ctor->locked_pages = limit;

What if a process calls SET_MEM_LOCKED multiple times (or many processes
do)? What if it is called when some pages are already locked?
I suggested one way to handle that in one of the previous messages (a
rough sketch is also at the end of this mail), but you must think out
these scenarios when you invent a new ioctl.

> +		mutex_unlock(&mp_mutex);
> +
> +		mp_put(mfile);
> +		return 0;
> +
> +	case MPASSTHRU_GET_MEM_LOCKED_NEED:
> +		limit = DEFAULT_NEED;
> +		return copy_to_user(limitp, &limit, sizeof limit);
> +
>  	default:
>  		break;
>  	}
> diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
> index ba8f320..083e9f7 100644
> --- a/include/linux/mpassthru.h
> +++ b/include/linux/mpassthru.h
> @@ -7,6 +7,8 @@
>  /* ioctl defines */
>  #define MPASSTHRU_BINDDEV _IOW('M', 213, int)
>  #define MPASSTHRU_UNBINDDEV _IO('M', 214)
> +#define MPASSTHRU_SET_MEM_LOCKED _IOW('M', 215, unsigned long)
> +#define MPASSTHRU_GET_MEM_LOCKED_NEED _IOR('M', 216, unsigned long)
>
>  #ifdef __KERNEL__
>  #if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
> --
> 1.7.3
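
To illustrate the kind of thing I mean for SET_MEM_LOCKED (completely
untested, just a sketch): take a reference on the caller's mm the first
time the limit is set, charge locked_vm against that mm only, and undo
the previous charge before applying a new one, so repeated calls and
calls through a shared fd stay consistent. The mp->mm field and the
mp_set_mem_locked() helper below are made up for the example; mp_mutex,
ctor->locked_pages and ctor->cur_pages are the ones from your patch.

	/*
	 * Sketch: mp->mm would replace mp->user in struct mp_struct.
	 * It is taken with get_task_mm() here and must be dropped with
	 * mmput() (after subtracting locked_pages) on detach/release.
	 */
	static int mp_set_mem_locked(struct mp_struct *mp, unsigned long bytes)
	{
		unsigned long limit = PAGE_ALIGN(bytes) >> PAGE_SHIFT;
		unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		struct mm_struct *mm;
		int ret = 0;

		mutex_lock(&mp_mutex);
		if (!mp->mm)
			mp->mm = get_task_mm(current);
		mm = mp->mm;
		if (!mm) {
			mutex_unlock(&mp_mutex);
			return -ESRCH;
		}

		/* Refuse to shrink below what is already pinned. */
		if (limit < mp->ctor->cur_pages) {
			mutex_unlock(&mp_mutex);
			return -EBUSY;
		}

		down_write(&mm->mmap_sem);
		/* Undo the previous charge so repeated calls do not accumulate. */
		mm->locked_vm -= mp->ctor->locked_pages;
		if (mm->locked_vm + limit > lock_limit && !capable(CAP_IPC_LOCK)) {
			mm->locked_vm += mp->ctor->locked_pages;
			ret = -ENOMEM;
		} else {
			mm->locked_vm += limit;
			mp->ctor->locked_pages = limit;
		}
		up_write(&mm->mmap_sem);
		mutex_unlock(&mp_mutex);
		return ret;
	}

Whatever the exact form, the key points are that the add and the subtract
must hit the same mm, and that a second SET_MEM_LOCKED must not leak the
first charge.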