Hi,

> From: Vivek Goyal <vgoyal@xxxxxxxxxx>
> Date: Tue, Sep 08, 2009 06:28:27PM -0400
>
>
> o I found an issue during test and that is if there is a mix of queue and group ...
>
> So we need to keep track of a process io queue's vdisktime, even after it
> got deleted from the io scheduler's service tree, and use that same
> vdisktime if that queue gets backlogged again. But trusting an ioq's
> vdisktime is bad because it can lead to issues if a service tree
> min_vtime wrap around takes place between two requests of the queue.
> (Agreed that it may not be that easy to hit, but it is possible.)
>
> Hence, keep a cache of io queues serviced recently and, when a queue gets
> backlogged, if it is found in the cache, use that vdisktime; otherwise
> assign a new vdisktime. This cache of io queues (the idle tree) is
> basically the idea implemented by the BFQ guys. I had gotten rid of idle
> trees in V9 and now I am bringing them back. (Now I understand it
> better. :-))
>
> There is one good side effect of keeping the cache of recently serviced
> io queues. Now CFQ can differentiate between streaming readers and new
> processes doing IO. For a new queue (which is not in the cache), we can
> assign a lower vdisktime, and for a streaming reader, we assign a
> vdisktime based on the disk time used. This way small file readers, or
> processes doing small amounts of IO, will see reduced latencies at the
> cost of slightly reduced throughput for streaming readers.

just a little note: this patch seems to introduce a special case for
vdisktime = 0, assigning it the meaning of "invalid timestamp"; but the
virtual time space wraps, so 0 is a perfectly legal value that can be
reached through normal service. I have no idea whether this can produce
visible effects, but it doesn't seem to be correct.
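To make the wrap concrete, here is a toy userspace sketch (mine, not from
the patch) of why the signed-difference test in vdisktime_gt() survives
the wrap, while a "vdisktime == 0 means invalid" test does not: a live
timestamp can land exactly on 0 through normal accumulation.

#include <stdio.h>

/* same comparison as the patch's vdisktime_gt(): wrap-safe as long as
 * the two timestamps are within 2^63 of each other */
static inline int vdisktime_gt(unsigned long long a, unsigned long long b)
{
        return (long long)(a - b) > 0;
}

int main(void)
{
        unsigned long long near_wrap = ~0ULL - 5;       /* just before the wrap */
        unsigned long long later = near_wrap + 10;      /* == 4, wrapped past 0 */

        /* the ordering survives the wrap... */
        printf("later > near_wrap: %d\n", vdisktime_gt(later, near_wrap));

        /* ...but a "0 means no cached vdisktime" test does not: this
         * perfectly legal timestamp is exactly 0 */
        printf("near_wrap + 6 = %llu\n", near_wrap + 6);
        return 0;
}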
> Signed-off-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
> ---
>  block/cfq-iosched.c |    2
>  block/elevator-fq.c |  252 ++++++++++++++++++++++++++++++++++++++++++++++++----
>  block/elevator-fq.h |    9 +
>  3 files changed, 246 insertions(+), 17 deletions(-)
>
> Index: linux16/block/elevator-fq.c
> ===================================================================
> --- linux16.orig/block/elevator-fq.c	2009-09-08 15:44:21.000000000 -0400
> +++ linux16/block/elevator-fq.c	2009-09-08 15:47:45.000000000 -0400
> @@ -52,6 +52,8 @@ static struct kmem_cache *elv_ioq_pool;
>  #define elv_log_entity(entity, fmt, args...)
>  #endif
>
> +static void check_idle_tree_release(struct io_service_tree *st);
> +
>  static inline struct io_queue *ioq_of(struct io_entity *entity)
>  {
>  	if (entity->my_sd == NULL)
> @@ -109,6 +111,11 @@ elv_prio_to_slice(struct elv_fq_data *ef
>  	return elv_weight_slice(efqd, elv_ioq_sync(ioq), ioq->entity.weight);
>  }
>
> +static inline int vdisktime_gt(u64 a, u64 b)
> +{
> +	return (s64)(a - b) > 0;
> +}
> +
>  static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
>  {
>  	s64 delta = (s64)(vdisktime - min_vdisktime);
> @@ -145,6 +152,7 @@ static void update_min_vdisktime(struct
>  	}
>
>  	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
> +	check_idle_tree_release(st);
>  }
>
>  static inline struct io_entity *parent_entity(struct io_entity *entity)
> @@ -411,27 +419,46 @@ static void place_entity(struct io_servi
>  	struct rb_node *parent;
>  	struct io_entity *entry;
>  	int nr_active = st->nr_active - 1;
> +	struct io_queue *ioq = ioq_of(entity);
> +	int sync = 1;
> +
> +	if (ioq)
> +		sync = elv_ioq_sync(ioq);
> +
> +	if (add_front || !nr_active) {
> +		vdisktime = st->min_vdisktime;
> +		goto done;
> +	}
> +
> +	if (sync && entity->vdisktime
> +	    && vdisktime_gt(entity->vdisktime, st->min_vdisktime)) {
> +		/* vdisktime still in future. Use old vdisktime */
> +		vdisktime = entity->vdisktime;
> +		goto done;
> +	}
>
>  	/*
> -	 * Currently put entity at the end of last entity. This probably will
> -	 * require adjustments as we move along
> +	 * Effectively a new queue. Assign sync queue a lower vdisktime so
> +	 * we can achieve better latencies for small file readers. For async
> +	 * queues, put them at the end of the existing queue.
> +	 * Group entities are always considered sync.
>  	 */
> -	if (io_entity_class_idle(entity)) {
> -		vdisktime = elv_delta_fair(ELV_IDLE_DELAY, entity);
> -		parent = rb_last(&st->active);
> -		if (parent) {
> -			entry = rb_entry(parent, struct io_entity, rb_node);
> -			vdisktime += entry->vdisktime;
> -		}
> -	} else if (!add_front && nr_active) {
> -		parent = rb_last(&st->active);
> -		if (parent) {
> -			entry = rb_entry(parent, struct io_entity, rb_node);
> -			vdisktime = entry->vdisktime;
> -		}
> -	} else
> +	if (sync) {
>  		vdisktime = st->min_vdisktime;
> +		goto done;
> +	}
>
> +	/*
> +	 * Put entity at the end of the tree. Effectively async queues use
> +	 * this path.
> +	 */
> +	parent = rb_last(&st->active);
> +	if (parent) {
> +		entry = rb_entry(parent, struct io_entity, rb_node);
> +		vdisktime = entry->vdisktime;
> +	} else
> +		vdisktime = st->min_vdisktime;
> +done:
>  	entity->vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
>  	elv_log_entity(entity, "place_entity: vdisktime=%llu"
>  			" min_vdisktime=%llu", entity->vdisktime,
> @@ -447,6 +474,122 @@ static inline void io_entity_update_prio
>  	 */
>  	init_io_entity_service_tree(entity, parent_entity(entity));
>  	entity->ioprio_changed = 0;
> +
> +	/*
> +	 * Assign this entity a fresh vdisktime instead of using the
> +	 * previous one, as a prio class change leads to a service tree
> +	 * change and the old vdisktime will not be valid on the new
> +	 * service tree.
> +	 *
> +	 * TODO: Handle the case of only a prio change.
> +	 */
> +	entity->vdisktime = 0;
> +	}
> +}
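Going back to place_entity() for a moment, to check my reading: the new
policy seems to reduce to the ladder below. This is my own userspace
restatement with stand-in types, not code from the patch, and
last_active_vdisktime stands in for the rb_last() lookup.

#include <stdint.h>

/* stand-ins for the kernel types, just enough for the sketch */
struct tree { uint64_t min_vdisktime; uint64_t last_active_vdisktime; int nr_active; };
struct entity { uint64_t vdisktime; };

static int vdisktime_gt(uint64_t a, uint64_t b) { return (int64_t)(a - b) > 0; }

static uint64_t pick_vdisktime(struct tree *st, struct entity *e,
                               int sync, int add_front)
{
        /* front insertions and an empty tree start at min_vdisktime */
        if (add_front || !st->nr_active)
                return st->min_vdisktime;

        /* idle tree hit: a sync queue keeps a cached stamp that is still
         * in the future, so a streaming reader pays for disk time used */
        if (sync && e->vdisktime && vdisktime_gt(e->vdisktime, st->min_vdisktime))
                return e->vdisktime;

        /* a fresh sync queue is placed at min_vdisktime for low latency */
        if (sync)
                return st->min_vdisktime;

        /* async queues line up behind the last entity in the tree */
        return st->last_active_vdisktime;
}

If that is right, the second branch is where a cached stamp from the idle
tree gets consumed, and it is also where a legal vdisktime of 0 would be
mistaken for "no cache".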
>
> +static void
> +__dequeue_io_entity_idle(struct io_service_tree *st, struct io_entity *entity)
> +{
> +	if (st->rb_leftmost_idle == &entity->rb_node) {
> +		struct rb_node *next_node;
> +
> +		next_node = rb_next(&entity->rb_node);
> +		st->rb_leftmost_idle = next_node;
> +	}
> +
> +	rb_erase(&entity->rb_node, &st->idle);
> +	RB_CLEAR_NODE(&entity->rb_node);
> +}
> +
> +static void dequeue_io_entity_idle(struct io_entity *entity)
> +{
> +	struct io_queue *ioq = ioq_of(entity);
> +
> +	__dequeue_io_entity_idle(entity->st, entity);
> +	entity->on_idle_st = 0;
> +	if (ioq)
> +		elv_put_ioq(ioq);
> +}
> +
> +static void
> +__enqueue_io_entity_idle(struct io_service_tree *st, struct io_entity *entity)
> +{
> +	struct rb_node **node = &st->idle.rb_node;
> +	struct rb_node *parent = NULL;
> +	struct io_entity *entry;
> +	int leftmost = 1;
> +
> +	while (*node != NULL) {
> +		parent = *node;
> +		entry = rb_entry(parent, struct io_entity, rb_node);
> +
> +		if (vdisktime_gt(entry->vdisktime, entity->vdisktime))
> +			node = &parent->rb_left;
> +		else {
> +			node = &parent->rb_right;
> +			leftmost = 0;
> +		}
> +	}
> +
> +	/*
> +	 * Maintain a cache of the leftmost tree entry (it is frequently
> +	 * used).
> +	 */
> +	if (leftmost)
> +		st->rb_leftmost_idle = &entity->rb_node;
> +
> +	rb_link_node(&entity->rb_node, parent, node);
> +	rb_insert_color(&entity->rb_node, &st->idle);
> +}
> +
> +static void enqueue_io_entity_idle(struct io_entity *entity)
> +{
> +	struct io_queue *ioq = ioq_of(entity);
> +	struct io_group *parent_iog;
> +
> +	/*
> +	 * Don't put an entity on the idle tree if it has been marked for
> +	 * deletion. We are not expecting more io from this entity. No need
> +	 * to cache it.
> +	 */
> +
> +	if (entity->exiting)
> +		return;
> +
> +	/*
> +	 * If the parent group is exiting, don't put on the idle tree. Maybe
> +	 * the task got moved to a different cgroup and the original cgroup
> +	 * got deleted.
> +	 */
> +	parent_iog = iog_of(parent_entity(entity));
> +	if (parent_iog->entity.exiting)
> +		return;
> +
> +	if (ioq)
> +		elv_get_ioq(ioq);
> +	__enqueue_io_entity_idle(entity->st, entity);
> +	entity->on_idle_st = 1;
> +}
> +
> +static void check_idle_tree_release(struct io_service_tree *st)
> +{
> +	struct io_entity *leftmost;
> +
> +	if (!st->rb_leftmost_idle)
> +		return;
> +
> +	leftmost = rb_entry(st->rb_leftmost_idle, struct io_entity, rb_node);
> +
> +	if (vdisktime_gt(st->min_vdisktime, leftmost->vdisktime))
> +		dequeue_io_entity_idle(leftmost);
> +}
> +
> +static void flush_idle_tree(struct io_service_tree *st)
> +{
> +	struct io_entity *entity;
> +
> +	while (st->rb_leftmost_idle) {
> +		entity = rb_entry(st->rb_leftmost_idle, struct io_entity,
> +					rb_node);
> +		dequeue_io_entity_idle(entity);
> +	}
> +}
>
> @@ -483,6 +626,9 @@ static void dequeue_io_entity(struct io_
>  	st->nr_active--;
>  	sd->nr_active--;
>  	debug_update_stats_dequeue(entity);
> +
> +	if (vdisktime_gt(entity->vdisktime, st->min_vdisktime))
> +		enqueue_io_entity_idle(entity);
>  }
>
>  static void
> @@ -524,6 +670,16 @@ static void enqueue_io_entity(struct io_
>  	struct io_service_tree *st;
>  	struct io_sched_data *sd = io_entity_sched_data(entity);
>
> +	if (entity->on_idle_st)
> +		dequeue_io_entity_idle(entity);
> +	else
> +		/*
> +		 * This entity was not in the idle tree cache. Zero out the
> +		 * vdisktime so that we don't rely on the old vdisktime and
> +		 * instead assign a fresh one.
> +		 */
> +		entity->vdisktime = 0;
> +
>  	io_entity_update_prio(entity);
>  	st = entity->st;
>  	st->nr_active++;
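If I follow the lifecycle correctly, dequeue_io_entity() parks a
still-ahead entity in the idle tree, and check_idle_tree_release() then
drops at most the leftmost entry each time min_vdisktime advances, so
stale cache entries drain lazily, one per update, rather than in bulk. A
toy model of that release rule (mine, with a sorted array standing in for
the rbtree):

#include <stdint.h>
#include <stdio.h>

static int vdisktime_gt(uint64_t a, uint64_t b) { return (int64_t)(a - b) > 0; }

/* sorted array stands in for the idle rbtree; slot 0 is the leftmost */
struct idle_cache { uint64_t stamp[8]; int n; };

static void check_release(struct idle_cache *c, uint64_t min_vdisktime)
{
        int i;

        if (!c->n || !vdisktime_gt(min_vdisktime, c->stamp[0]))
                return;
        /* leftmost stamp fell behind min_vdisktime: release it */
        for (i = 1; i < c->n; i++)
                c->stamp[i - 1] = c->stamp[i];
        c->n--;
}

int main(void)
{
        struct idle_cache c = { { 100, 200, 300 }, 3 };

        check_release(&c, 250);         /* drops 100 only: one per update */
        check_release(&c, 250);         /* a later update drops 200 */
        printf("%d cached, leftmost %llu\n", c.n,
               (unsigned long long)c.stamp[0]);  /* "1 cached, leftmost 300" */
        return 0;
}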
> @@ -574,6 +730,8 @@ static void requeue_io_entity(struct io_
>  	struct io_service_tree *st = entity->st;
>  	struct io_entity *next_entity;
>
> +	entity->vdisktime = 0;
> +
>  	if (add_front) {
>  		next_entity = __lookup_next_io_entity(st);
>
> @@ -1937,11 +2095,18 @@ static void io_free_root_group(struct el
>  {
>  	struct io_group *iog = e->efqd->root_group;
>  	struct io_cgroup *iocg = &io_root_cgroup;
> +	struct io_service_tree *st;
> +	int i;
>
>  	spin_lock_irq(&iocg->lock);
>  	hlist_del_rcu(&iog->group_node);
>  	spin_unlock_irq(&iocg->lock);
>
> +	for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> +		st = iog->sched_data.service_tree + i;
> +		flush_idle_tree(st);
> +	}
> +
>  	put_io_group_queues(e, iog);
>  	elv_put_iog(iog);
>  }
> @@ -2039,9 +2204,29 @@ EXPORT_SYMBOL(elv_put_iog);
>   */
>  static void __io_destroy_group(struct elv_fq_data *efqd, struct io_group *iog)
>  {
> +	struct io_service_tree *st;
> +	int i;
> +	struct io_entity *entity = &iog->entity;
> +
> +	/*
> +	 * Mark the io group for deletion so that no new entry goes into the
> +	 * idle tree. Any active queue which is removed from the active
> +	 * tree will not be put into the idle tree.
> +	 */
> +	entity->exiting = 1;
> +
> +	/* We flush the idle tree now, and don't put things in there any more. */
> +	for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> +		st = iog->sched_data.service_tree + i;
> +		flush_idle_tree(st);
> +	}
> +
>  	hlist_del(&iog->elv_data_node);
>  	put_io_group_queues(efqd->eq, iog);
>
> +	if (entity->on_idle_st)
> +		dequeue_io_entity_idle(entity);
> +
>  	/*
>  	 * Put the reference taken at the time of creation so that when all
>  	 * queues are gone, group can be destroyed.
> @@ -2374,7 +2559,13 @@ static struct io_group *io_alloc_root_gr
>  static void io_free_root_group(struct elevator_queue *e)
>  {
>  	struct io_group *iog = e->efqd->root_group;
> +	struct io_service_tree *st;
> +	int i;
>
> +	for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> +		st = iog->sched_data.service_tree + i;
> +		flush_idle_tree(st);
> +	}
>  	put_io_group_queues(e, iog);
>  	kfree(iog);
>  }
> @@ -3257,6 +3448,35 @@ done:
>  	elv_schedule_dispatch(q);
>  }
>
> +/*
> + * The process associated with ioq (in the case of cfq) is going away. Mark it
> + * for deletion.
> + */
> +void elv_exit_ioq(struct io_queue *ioq)
> +{
> +	struct io_entity *entity = &ioq->entity;
> +
> +	/*
> +	 * Async ioq's belong to the io group and are cleaned up once the
> +	 * group is being deleted. No need to do any cleanup here, even if
> +	 * cfq has dropped its reference to the queue.
> +	 */
> +	if (!elv_ioq_sync(ioq))
> +		return;
> +
> +	/*
> +	 * This queue is still under service. Just mark it so that once all
> +	 * the IO from the queue is done, it is not put back on the idle tree.
> +	 */
> +	if (entity->on_st) {
> +		entity->exiting = 1;
> +		return;
> +	} else if (entity->on_idle_st) {
> +		/* Remove ioq from idle tree */
> +		dequeue_io_entity_idle(entity);
> +	}
> +}
> +EXPORT_SYMBOL(elv_exit_ioq);
>  static void elv_slab_kill(void)
>  {
>  	/*
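The reference discipline around the cache looks sound to me: the idle
tree takes its own reference (elv_get_ioq() on enqueue, elv_put_ioq() on
dequeue), and elv_exit_ioq() either marks a still-queued entity as
exiting or pulls it out of the cache right away. A minimal userspace
model (mine) of that get/put pairing:

#include <stdio.h>
#include <stdlib.h>

/* the cache holds its own reference, so a cached ioq cannot be freed
 * underneath it even after its owner has dropped its reference */
struct ioq { int refcount; int on_idle_st; int exiting; };

static void get_ioq(struct ioq *q) { q->refcount++; }
static void put_ioq(struct ioq *q)
{
        if (--q->refcount == 0) {
                printf("ioq freed\n");
                free(q);
        }
}

static void idle_enqueue(struct ioq *q)
{
        if (q->exiting)
                return;         /* exiting queues are never cached */
        get_ioq(q);             /* cache takes its own reference */
        q->on_idle_st = 1;
}

static void idle_dequeue(struct ioq *q)
{
        q->on_idle_st = 0;
        put_ioq(q);             /* drop the cache's reference */
}

int main(void)
{
        struct ioq *q = calloc(1, sizeof(*q));

        q->refcount = 1;        /* owner's (cfq's) reference */
        idle_enqueue(q);        /* refcount == 2 */
        put_ioq(q);             /* owner exits: queue survives in the cache */
        idle_dequeue(q);        /* last reference dropped: "ioq freed" */
        return 0;
}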
> Index: linux16/block/cfq-iosched.c
> ===================================================================
> --- linux16.orig/block/cfq-iosched.c	2009-09-08 15:43:36.000000000 -0400
> +++ linux16/block/cfq-iosched.c	2009-09-08 15:47:45.000000000 -0400
> @@ -1138,6 +1138,7 @@ static void cfq_exit_cfqq(struct cfq_dat
>  		elv_schedule_dispatch(cfqd->queue);
>  	}
>
> +	elv_exit_ioq(cfqq->ioq);
>  	cfq_put_queue(cfqq);
>  }
>
> @@ -1373,6 +1374,7 @@ static void changed_cgroup(struct io_con
>  	 */
>  	if (iog != __iog) {
>  		cic_set_cfqq(cic, NULL, 1);
> +		elv_exit_ioq(sync_cfqq->ioq);
>  		cfq_put_queue(sync_cfqq);
>  	}
>  }
>
> Index: linux16/block/elevator-fq.h
> ===================================================================
> --- linux16.orig/block/elevator-fq.h	2009-09-08 15:43:36.000000000 -0400
> +++ linux16/block/elevator-fq.h	2009-09-08 15:47:45.000000000 -0400
> @@ -33,6 +33,10 @@ struct io_service_tree {
>  	u64 min_vdisktime;
>  	struct rb_node *rb_leftmost;
>  	unsigned int nr_active;
> +
> +	/* A cache of io entities which were served and expired */
> +	struct rb_root idle;
> +	struct rb_node *rb_leftmost_idle;
>  };
>
>  struct io_sched_data {
> @@ -44,9 +48,12 @@ struct io_sched_data {
>  struct io_entity {
>  	struct rb_node rb_node;
>  	int on_st;
> +	int on_idle_st;
>  	u64 vdisktime;
>  	unsigned int weight;
>  	struct io_entity *parent;
> +	/* This io entity (queue or group) has been marked for deletion */
> +	unsigned int exiting;
>
>  	struct io_sched_data *my_sd;
>  	struct io_service_tree *st;
> @@ -572,7 +579,7 @@ extern struct io_queue *elv_alloc_ioq(st
>  extern void elv_free_ioq(struct io_queue *ioq);
>  extern struct io_group *ioq_to_io_group(struct io_queue *ioq);
>  extern int elv_iog_should_idle(struct io_queue *ioq);
> -
> +extern void elv_exit_ioq(struct io_queue *ioq);
>  #else /* CONFIG_ELV_FAIR_QUEUING */
>  static inline struct elv_fq_data *
>  elv_alloc_fq_data(struct request_queue *q, struct elevator_queue *e)
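Coming back to my vdisktime = 0 note at the top: if the sentinel is kept,
one way to make it wrap-proof would be to track validity explicitly
instead of overloading the value itself. Untested sketch of the idea; the
vdisktime_valid name is mine:

/* hypothetical alternative to the 0 sentinel: a separate validity bit,
 * so a vdisktime that legitimately wraps to 0 is still usable */
struct io_stamp {
        unsigned long long vdisktime;
        unsigned int vdisktime_valid:1;
};

static void io_stamp_invalidate(struct io_stamp *s)
{
        s->vdisktime_valid = 0;         /* instead of s->vdisktime = 0 */
}

static int io_stamp_cached(const struct io_stamp *s)
{
        return s->vdisktime_valid;      /* instead of s->vdisktime != 0 */
}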