hi all: I think I got it. The new kernel just remove the NIC from poll list first, If we should repoll it, kernel will add it again. The codes is: 4560 static void net_rx_action(struct softirq_action *h) 4561 { 4562 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4563 unsigned long time_limit = jiffies + 2; 4564 int budget = netdev_budget; 4565 LIST_HEAD(list); 4566 LIST_HEAD(repoll); 4567 void *have; 4568 4569 local_irq_disable(); 4570 list_splice_init(&sd->poll_list, &list); 4571 local_irq_enable(); 4572 4573 while (!list_empty(&list)) { 4574 struct napi_struct *n; 4575 int work, weight; 4576 4577 /* If softirq window is exhausted then punt. 4578 * Allow this to run for 2 jiffies since which will allow 4579 * an average latency of 1.5/HZ. 4580 */ 4581 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) 4582 goto softnet_break; 4583 4584 4585 n = list_first_entry(&list, struct napi_struct, poll_list); 4586 list_del_init(&n->poll_list); 4587 4588 have = netpoll_poll_lock(n); 4589 4590 weight = n->weight; 4591 4592 /* This NAPI_STATE_SCHED test is for avoiding a race 4593 * with netpoll's poll_napi(). Only the entity which 4594 * obtains the lock and sees NAPI_STATE_SCHED set will 4595 * actually make the ->poll() call. Therefore we avoid 4596 * accidentally calling ->poll() when NAPI is not scheduled. 4597 */ 4598 work = 0; 4599 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 4600 work = n->poll(n, weight); 4601 trace_napi_poll(n); 4602 } 4603 4604 WARN_ON_ONCE(work > weight); 4605 4606 budget -= work; 4607 4608 /* Drivers must not modify the NAPI state if they 4609 * consume the entire weight. In such cases this code 4610 * still "owns" the NAPI instance and therefore can 4611 * move the instance around on the list at-will. 4612 */ 4613 if (unlikely(work == weight)) { 4614 if (unlikely(napi_disable_pending(n))) { 4615 napi_complete(n); 4616 } else { 4617 if (n->gro_list) { 4618 /* flush too old packets 4619 * If HZ < 1000, flush all packets. 4620 */ 4621 napi_gro_flush(n, HZ >= 1000); 4622 } 4623 list_add_tail(&n->poll_list, &repoll); 4624 } 4625 } 4626 4627 netpoll_poll_unlock(have); 4628 } 4629 4630 if (!sd_has_rps_ipi_waiting(sd) && 4631 list_empty(&list) && 4632 list_empty(&repoll)) 4633 return; 4634 out: 4635 local_irq_disable(); 4636 4637 list_splice_tail_init(&sd->poll_list, &list); 4638 list_splice_tail(&repoll, &list); 4639 list_splice(&list, &sd->poll_list); 4640 if (!list_empty(&sd->poll_list)) 4641 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 4642 4643 net_rps_action_and_irq_enable(sd); 4644 4645 return; 4646 4647 softnet_break: 4648 sd->time_squeeze++; 4649 goto out; 4650 } line 4585: choose a NIC which will poll. line 4586: remove the NIC from the poll list. line 4623: add the NIC again in repoll list if we nedd poll it. line 4638: splice the poll list and repoll list. So, I think this is not a bug. @valdis.kletnieks I think we should remove NIC form poll list after the list of sd->input_pkt_queue is empty, so I think code should run like this: 4350 rps_lock(sd); 4351 if (skb_queue_empty(&sd->input_pkt_queue)) { 4352 /* 4353 * Inline a custom version of __napi_complete(). 4354 * only current cpu owns and manipulates this napi, 4355 * and NAPI_STATE_SCHED is the only possible flag set 4356 * on backlog. 4357 * We can use a plain write instead of clear_bit(), 4358 * and we dont need an smp_mb() memory barrier. 4359 */ list_del(&napi->poll_list); // I add it. 4360 napi->state = 0; 4361 rps_unlock(sd); 4362 4363 break; 4364 } Thank you 在 2014/12/23 20:13, Vignesh Radhakrishnan 写道: > We do delete it. I am not very thorough with the code , but here is what > i think is happening. > > process_backlog is set as the backlog polling function for softnet data > sd->backlog.poll = process_backlog; > > Now net_rx_action which runs in soft irq context, calls the polling as > per the following code and the deletion as well. > > > Notice where we are deleting it : > > n = list_first_entry(&list, struct napi_struct, poll_list); > list_del_init(&n->poll_list); > --------------------------------------> Here is the deletion. > > have = netpoll_poll_lock(n); > > weight = n->weight; > > /* This NAPI_STATE_SCHED test is for avoiding a race > * with netpoll's poll_napi(). Only the entity which > * obtains the lock and sees NAPI_STATE_SCHED set will > * actually make the ->poll() call. Therefore we avoid > * accidentally calling ->poll() when NAPI is not > scheduled. > */ > work = 0; > if (test_bit(NAPI_STATE_SCHED, &n->state)) { > work = n->poll(n, weight); > -------------------------------------> Process_backlog is called here > trace_napi_poll(n); > } > > Earlier we had deletion in process_backlog itself, but > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d75b1ade567ffab085e8adbbdacf0092d10cd09c > has changed it. > > Thanks and regards, > Vignesh Radhakrishnan > > On Tue, Dec 23, 2014 at 2:29 PM, lx <lxlenovostar@xxxxxxxxx > <mailto:lxlenovostar@xxxxxxxxx>> wrote: > > hi all: > I read the function of process_backlog, and I found it don't > delete the NIC form poll list. the codes is: > > 4321 static int process_backlog(struct napi_struct *napi, int quota) > 4322 { > 4323 int work = 0; > 4324 struct softnet_data *sd = container_of(napi, struct > softnet_data, backlog); > 4325 > 4326 /* Check if we have pending ipi, its better to send them now, > 4327 * not waiting net_rx_action() end. > 4328 */ > 4329 if (sd_has_rps_ipi_waiting(sd)) { > 4330 local_irq_disable(); > 4331 net_rps_action_and_irq_enable(sd); > 4332 } > 4333 > 4334 napi->weight = weight_p; > 4335 local_irq_disable(); > 4336 while (1) { > 4337 struct sk_buff *skb; > 4338 > 4339 while ((skb = __skb_dequeue(&sd->process_queue))) { > 4340 local_irq_enable(); > 4341 __netif_receive_skb(skb); > 4342 local_irq_disable(); > 4343 input_queue_head_incr(sd); > 4344 if (++work >= quota) { > 4345 local_irq_enable(); > 4346 return work; > 4347 } > 4348 } > 4349 > 4350 rps_lock(sd); > 4351 if (skb_queue_empty(&sd->input_pkt_queue)) { > 4352 /* > 4353 * Inline a custom version of __napi_complete(). > 4354 * only current cpu owns and manipulates this napi, > 4355 * and NAPI_STATE_SCHED is the only possible flag set > 4356 * on backlog. > 4357 * We can use a plain write instead of clear_bit(), > 4358 * and we dont need an smp_mb() memory barrier. > 4359 */ > 4360 napi->state = 0; > 4361 rps_unlock(sd); > 4362 > 4363 break; > 4364 } > 4365 > 4366 skb_queue_splice_tail_init(&sd->input_pkt_queue, > 4367 &sd->process_queue); > 4368 rps_unlock(sd); > 4369 } > 4370 local_irq_enable(); > 4371 > 4372 return work; > 4373 } > > I think we should delete the NIC from the poll list by: > list_del(&napi->poll_list); > > and my git repository infromation is: > > [root@localhost linux]# git show > commit aa39477b5692611b91ac9455ae588738852b3f60 > Merge: 48ec833 5164bec > Author: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx > <mailto:torvalds@xxxxxxxxxxxxxxxxxxxx>> > Date: Mon Dec 22 14:47:17 2014 -0800 > > Merge tag 'dm-3.19-fixes' of > git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm > <http://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm> > Pull device mapper fixes from Mike Snitzer: > "Thre stable fixes and one fix for a regression introduced > during 3.19 > merge: > - Fix inability to discard used space when the thin-pool > target is in > out-of-data-space mode and also transition the thin-pool > back to > write mode once free space is made available. > - Fix DM core bio-based end_io bug that prevented proper > post-processing of the error code returned from the block > layer. > - Fix crash in DM thin-pool due to thin device being added > to the > pool's active_thins list before properly initializing the thin > device's refcount" > * tag 'dm-3.19-fixes' of > git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm > <http://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm>: > dm: fix missed error code if .end_io isn't implemented by > target_type > dm thin: fix crash by initializing thin device's refcount and > completion earlier > dm thin: fix missing out-of-data-space to write mode > transition if blocks are released > dm thin: fix inability to discard blocks when in > out-of-data-space mode > > > Don't handle the list_del is right, or this is a bug? > > Thank you. > > _______________________________________________ > Kernelnewbies mailing list > Kernelnewbies@xxxxxxxxxxxxxxxxx <mailto:Kernelnewbies@xxxxxxxxxxxxxxxxx> > http://lists.kernelnewbies.org/mailman/listinfo/kernelnewbies > > > > > -- > http://vigneshradhakrishnan.blogspot.com/ _______________________________________________ Kernelnewbies mailing list Kernelnewbies@xxxxxxxxxxxxxxxxx http://lists.kernelnewbies.org/mailman/listinfo/kernelnewbies