This patch introduces a new structure, fuse_numa_node, which groups several
fields from the fuse_conn structure. One instance of fuse_numa_node is
created for each NUMA node present on the system. This reduces contention
on a single spinlock, which adds latency when the lock is accessed across
NUMA nodes.

Signed-off-by: Srinivas Eeda <srinivas.eeda@xxxxxxxxxx>
---
A short sketch of the intended per-node state lookup follows after the patch.

 fs/fuse/control.c |   25 ++++++++----
 fs/fuse/cuse.c    |   11 ++++-
 fs/fuse/fuse_i.h  |  118 +++++++++++++++++++++++++++++++----------------------
 fs/fuse/inode.c   |  114 +++++++++++++++++++++++++++++++++++++++------------
 4 files changed, 182 insertions(+), 86 deletions(-)

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855..9a9ca5c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -48,12 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
 	size_t size;
 
 	if (!*ppos) {
-		long value;
+		long i, value;
 		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
 		if (!fc)
 			return 0;
 
-		value = atomic_read(&fc->num_waiting);
+		for (i = 0, value = 0; i < fc->nr_nodes; i++)
+			value += atomic_read(&fc->nn[i]->num_waiting);
 		file->private_data = (void *)value;
 		fuse_conn_put(fc);
 	}
@@ -101,13 +102,14 @@ static ssize_t fuse_conn_max_background_read(struct file *file,
 					     loff_t *ppos)
 {
 	struct fuse_conn *fc;
-	unsigned val;
+	unsigned i, val;
 
 	fc = fuse_ctl_file_conn_get(file);
 	if (!fc)
 		return 0;
 
-	val = fc->max_background;
+	for (i = 0, val = 0; i < fc->nr_nodes; i++)
+		val += fc->nn[i]->max_background;
 	fuse_conn_put(fc);
 
 	return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -123,9 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
 				    max_user_bgreq);
 	if (ret > 0) {
+		int i;
 		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
 		if (fc) {
-			fc->max_background = val;
+			val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+			for (i = 0; i < fc->nr_nodes; i++)
+				fc->nn[i]->max_background = val;
 			fuse_conn_put(fc);
 		}
 	}
@@ -138,13 +143,14 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
 						    loff_t *ppos)
 {
 	struct fuse_conn *fc;
-	unsigned val;
+	unsigned i, val;
 
 	fc = fuse_ctl_file_conn_get(file);
 	if (!fc)
 		return 0;
 
-	val = fc->congestion_threshold;
+	for (i = 0, val = 0; i < fc->nr_nodes; i++)
+		val += fc->nn[i]->congestion_threshold;
 	fuse_conn_put(fc);
 
 	return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -160,9 +166,12 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
 				    max_user_congthresh);
 	if (ret > 0) {
+		int i;
 		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
 		if (fc) {
-			fc->congestion_threshold = val;
+			val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+			for (i = 0; i < fc->nr_nodes; i++)
+				fc->nn[i]->congestion_threshold = val;
 			fuse_conn_put(fc);
 		}
 	}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de10bdf..90d99d4 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -498,13 +498,14 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
 	if (!cc)
 		return -ENOMEM;
 
-	fuse_conn_init(&cc->fc, 0);
+	rc = fuse_conn_init(&cc->fc, 0);
+	if (rc < 0)
+		return rc;
 
 	INIT_LIST_HEAD(&cc->list);
 	cc->fc.release = cuse_fc_release;
 
 	cc->fc.connected = 1;
-	cc->fc.blocked = 0;
 	rc = cuse_send_init(cc);
 	if (rc) {
 		fuse_conn_put(&cc->fc);
@@ -562,8 +563,12 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
 				       struct device_attribute *attr, char *buf)
 {
 	struct cuse_conn *cc = dev_get_drvdata(dev);
+	int i, val;
+
+	for (i = 0, val = 0; i < cc->fc.nr_nodes; i++)
+		val += atomic_read(&cc->fc.nn[i]->num_waiting);
 
-	return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+	return sprintf(buf, "%d\n", val);
 }
 
 static ssize_t cuse_class_abort_store(struct device *dev,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index dd9a7ad..b44675b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -232,6 +232,9 @@ enum fuse_req_state {
  * A request to the client
  */
 struct fuse_req {
+	/* numa node from which this fuse_req was allocated */
+	int numaid;
+
 	/** This can be on either pending processing or io lists in
 	    fuse_conn */
 	struct list_head list;
@@ -342,6 +345,66 @@ struct fuse_req {
 	struct file *stolen_file;
 };
 
+/* structure that tracks numa node specific fields */
+struct fuse_numa_node {
+	/* numa node id */
+	int numaid;
+
+	/* Lock protecting accesses to members of this structure */
+	spinlock_t lock;
+
+	/* pointer to main fuse_connection */
+	struct fuse_conn *fc;
+
+	/* Flag indicating if queue is blocked. This will be
+	   the case before the INIT reply is received, and if there
+	   are too many outstanding background requests */
+	int blocked;
+
+	/* Maximum number of outstanding background requests */
+	unsigned max_background;
+
+	/* Number of background requests at which congestion starts */
+	unsigned congestion_threshold;
+
+	/* Number of requests currently in the background */
+	unsigned num_background;
+
+	/* Number of background requests currently queued for userspace */
+	unsigned active_background;
+
+	/* The number of requests waiting for completion */
+	atomic_t num_waiting;
+
+	/** Queue of pending forgets */
+	struct fuse_forget_link forget_list_head;
+	struct fuse_forget_link *forget_list_tail;
+
+	/** Batching of FORGET requests (positive indicates FORGET batch) */
+	int forget_batch;
+
+	/* waitq for blocked connection */
+	wait_queue_head_t blocked_waitq;
+
+	/* Readers of the connection are waiting on this */
+	wait_queue_head_t waitq;
+
+	/* The list of background requests set aside for later queuing */
+	struct list_head bg_queue;
+
+	/* Pending interrupts */
+	struct list_head interrupts;
+
+	/* The list of pending requests */
+	struct list_head pending;
+
+	/* The list of requests being processed */
+	struct list_head processing;
+
+	/* The list of requests under I/O */
+	struct list_head io;
+};
+
 /**
  * A Fuse connection.
 *
@@ -356,6 +419,9 @@ struct fuse_conn {
 	/** tracks if numa enabled */
 	int numa_on;
 
+	/** Number of numa nodes */
+	int nr_nodes;
+
 	/** Mutex protecting against directory alias creation */
 	struct mutex inst_mutex;
 
@@ -377,57 +443,12 @@ struct fuse_conn {
 	/** Maximum write size */
 	unsigned max_write;
 
-	/** Readers of the connection are waiting on this */
-	wait_queue_head_t waitq;
-
-	/** The list of pending requests */
-	struct list_head pending;
-
-	/** The list of requests being processed */
-	struct list_head processing;
-
-	/** The list of requests under I/O */
-	struct list_head io;
-
 	/** The next unique kernel file handle */
 	u64 khctr;
 
 	/** rbtree of fuse_files waiting for poll events indexed by ph */
 	struct rb_root polled_files;
 
-	/** Maximum number of outstanding background requests */
-	unsigned max_background;
-
-	/** Number of background requests at which congestion starts */
-	unsigned congestion_threshold;
-
-	/** Number of requests currently in the background */
-	unsigned num_background;
-
-	/** Number of background requests currently queued for userspace */
-	unsigned active_background;
-
-	/** The list of background requests set aside for later queuing */
-	struct list_head bg_queue;
-
-	/** Pending interrupts */
-	struct list_head interrupts;
-
-	/** Queue of pending forgets */
-	struct fuse_forget_link forget_list_head;
-	struct fuse_forget_link *forget_list_tail;
-
-	/** Batching of FORGET requests (positive indicates FORGET batch) */
-	int forget_batch;
-
-	/** Flag indicating if connection is blocked. This will be
-	    the case before the INIT reply is received, and if there
-	    are too many outstading backgrounds requests */
-	int blocked;
-
-	/** waitq for blocked connection */
-	wait_queue_head_t blocked_waitq;
-
 	/** waitq for reserved requests */
 	wait_queue_head_t reserved_req_waitq;
 
@@ -523,9 +544,6 @@ struct fuse_conn {
 	/** Does the filesystem want adaptive readdirplus? */
 	unsigned readdirplus_auto:1;
 
-	/** The number of requests waiting for completion */
-	atomic_t num_waiting;
-
 	/** Negotiated minor version */
 	unsigned minor;
 
@@ -564,6 +582,8 @@ struct fuse_conn {
 	/** Read/write semaphore to hold when accessing sb. */
 	struct rw_semaphore killsb;
+
+	struct fuse_numa_node **nn;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
 {
@@ -766,7 +786,7 @@ void fuse_conn_kill(struct fuse_conn *fc);
 /**
  * Initialize fuse_conn
  */
-void fuse_conn_init(struct fuse_conn *fc, int numaon);
+int fuse_conn_init(struct fuse_conn *fc, int numaon);
 
 /**
  * Release reference to fuse_conn
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1837f74..250eb38 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -360,14 +360,21 @@ static void fuse_bdi_destroy(struct fuse_conn *fc)
 
 void fuse_conn_kill(struct fuse_conn *fc)
 {
+	int i;
+	struct fuse_numa_node *nn;
+
 	spin_lock(&fc->lock);
 	fc->connected = 0;
-	fc->blocked = 0;
 	spin_unlock(&fc->lock);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
-	wake_up_all(&fc->waitq);
-	wake_up_all(&fc->blocked_waitq);
+	for (i = 0; i < fc->nr_nodes; i++) {
+		nn = fc->nn[i];
+		nn->blocked = 0;
+		wake_up_all(&nn->waitq);
+		wake_up_all(&nn->blocked_waitq);
+	}
+	wake_up_all(&fc->poll_waitq);
 	wake_up_all(&fc->reserved_req_waitq);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_kill);
@@ -567,8 +574,11 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
-void fuse_conn_init(struct fuse_conn *fc, int numaon)
+int fuse_conn_init(struct fuse_conn *fc, int numaon)
 {
+	int i, sz, ret;
+	struct fuse_numa_node *nn;
+
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
 	mutex_init(&fc->inst_mutex);
@@ -576,25 +586,61 @@ void fuse_conn_init(struct fuse_conn *fc, int numaon)
 	atomic_set(&fc->count, 1);
 	if (numaon)
 		fc->numa_on = 1;
-	init_waitqueue_head(&fc->waitq);
-	init_waitqueue_head(&fc->blocked_waitq);
 	init_waitqueue_head(&fc->reserved_req_waitq);
-	INIT_LIST_HEAD(&fc->pending);
-	INIT_LIST_HEAD(&fc->processing);
-	INIT_LIST_HEAD(&fc->io);
-	INIT_LIST_HEAD(&fc->interrupts);
-	INIT_LIST_HEAD(&fc->bg_queue);
+	init_waitqueue_head(&fc->poll_waitq);
 	INIT_LIST_HEAD(&fc->entry);
-	fc->forget_list_tail = &fc->forget_list_head;
-	atomic_set(&fc->num_waiting, 0);
-	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
-	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
 	fc->reqctr = 0;
-	fc->blocked = 1;
 	fc->attr_version = 1;
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+
+	if (numaon) {
+		fc->numa_on = 1;
+		fc->nr_nodes = nr_node_ids;
+	} else
+		fc->nr_nodes = 1;
+
+	ret = -ENOMEM;
+	sz = sizeof(struct fuse_numa_node *) * fc->nr_nodes;
+	fc->nn = kmalloc(sz, GFP_KERNEL);
+	if (!fc->nn)
+		return ret;
+	memset(fc->nn, 0, sz);
+
+	sz = sizeof(struct fuse_numa_node);
+	for (i = 0; i < fc->nr_nodes; i++) {
+		nn = kmalloc_node(sz, GFP_KERNEL, i);
+		if (!nn)
+			goto out;
+		memset(nn, 0, sz);
+		fc->nn[i] = nn;
+		nn->fc = fc;
+		nn->numaid = i;
+		nn->blocked = 1;
+		spin_lock_init(&nn->lock);
+		init_waitqueue_head(&nn->waitq);
+		init_waitqueue_head(&nn->blocked_waitq);
+		INIT_LIST_HEAD(&nn->bg_queue);
+		INIT_LIST_HEAD(&nn->interrupts);
+		INIT_LIST_HEAD(&nn->pending);
+		INIT_LIST_HEAD(&nn->processing);
+		INIT_LIST_HEAD(&nn->io);
+		nn->forget_list_tail = &nn->forget_list_head;
+		atomic_set(&nn->num_waiting, 0);
+		nn->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+		nn->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
+	}
+	return 0;
+out:
+	while (i > 0) {
+		if (fc->nn[i - 1])
+			kfree(fc->nn[i - 1]);
+		i--;
+	};
+	if (fc->nn)
+		kfree(fc->nn);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -816,6 +862,7 @@ static int set_global_limit(const char *val, struct kernel_param *kp)
 static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
 {
 	int cap_sys_admin = capable(CAP_SYS_ADMIN);
+	int i, val;
 
 	if (arg->minor < 13)
 		return;
@@ -824,22 +871,29 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
 	sanitize_global_limit(&max_user_congthresh);
 
 	if (arg->max_background) {
-		fc->max_background = arg->max_background;
+		val = arg->max_background;
+		if (!cap_sys_admin && (val > max_user_bgreq))
+			val = max_user_bgreq;
+
+		val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+		for (i = 0; i < fc->nr_nodes; i++)
+			fc->nn[i]->max_background = val;
 
-		if (!cap_sys_admin && fc->max_background > max_user_bgreq)
-			fc->max_background = max_user_bgreq;
 	}
 	if (arg->congestion_threshold) {
-		fc->congestion_threshold = arg->congestion_threshold;
+		val = arg->congestion_threshold;
+		if (!cap_sys_admin && val > max_user_congthresh)
+			val = max_user_congthresh;
 
-		if (!cap_sys_admin &&
-		    fc->congestion_threshold > max_user_congthresh)
-			fc->congestion_threshold = max_user_congthresh;
+		val = (val + fc->nr_nodes - 1) / fc->nr_nodes;
+		for (i = 0; i < fc->nr_nodes; i++)
+			fc->nn[i]->congestion_threshold = val;
 	}
 }
 
 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 {
+	int i;
 	struct fuse_init_out *arg = &req->misc.init_out;
 
 	if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
@@ -891,8 +945,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
-	fc->blocked = 0;
-	wake_up_all(&fc->blocked_waitq);
+	for (i = 0; i < fc->nr_nodes; i++) {
+		fc->nn[i]->blocked = 0;
+		wake_up_all(&fc->nn[i]->blocked_waitq);
+	}
 }
 
 static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 {
@@ -924,6 +980,11 @@ static void fuse_free_conn(struct fuse_conn *fc)
 {
+	int i;
+
+	for (i = 0; i < fc->nr_nodes; i++)
+		if (fc->nn[i])
+			kfree(fc->nn[i]);
 	kfree(fc);
 }
 
@@ -1019,7 +1080,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!fc)
 		goto err_fput;
 
-	fuse_conn_init(fc, d.numaon);
+	if (fuse_conn_init(fc, d.numaon) < 0)
+		goto err_fput;
 
 	fc->dev = sb->s_dev;
 	fc->sb = sb;
-- 
1.5.4.3
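As a reading aid, and not part of the patch itself: the sketch below illustrates how a caller is expected to reach the node-local fuse_numa_node once fuse_conn_init() has populated fc->nn[]. The helper name fuse_get_numa_node() and the accounting example are assumptions made for illustration; the request-path code that actually uses fc->nn[] is presumably in the other patches of this series and is not shown here.

/*
 * Illustrative sketch only -- not taken from this patch.  Assumes the
 * fuse_conn/fuse_numa_node layout introduced above; the helper names
 * are hypothetical.
 */
#include <linux/topology.h>	/* numa_node_id() */

static inline struct fuse_numa_node *fuse_get_numa_node(struct fuse_conn *fc)
{
	/* With numa off, nr_nodes == 1 and everything maps to node 0. */
	int node = fc->numa_on ? numa_node_id() : 0;

	return fc->nn[node];
}

/* Example: account a waiting request against the caller's local node. */
static inline void fuse_inc_num_waiting(struct fuse_conn *fc)
{
	struct fuse_numa_node *nn = fuse_get_numa_node(fc);

	atomic_inc(&nn->num_waiting);
}

The ceiling division used above, (val + fc->nr_nodes - 1) / fc->nr_nodes, splits the user-visible max_background and congestion_threshold limits into per-node shares, so the aggregate stays roughly equal to the configured value while each node can enforce its share under its own spinlock, without touching remote memory.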