This patch is proposal for protection of fcports field of struct scsi_qla_host, which is currently unprotected. The problem. At the moment, in qla2xxx driver list fcports, which is part of struct scsi_qla_host, is unprotected by any way. Basically, only one function can add to this list new entries: qla2x00_configure_local_loop() (another one, qla2x00_configure_fabric(), can be called only from qla2x00_configure_local_loop()), no entries never deleted from this list (except on shutdown) and many functions can go through this list using list_for_each_entry(). The problem is that all traversing over fcports functions can be called not only from the DPC thread, where qla2x00_configure_local_loop() adds new entries, but also from many other threads. For example, qla2x00_get_starget_node_name() can be called by any thread via sysfs. As it is known, simultaneous traversing over a list together with adding a new member to it is racy and can lead to a crash. The proposed solution. This patch proposes a fix for this issue by protecting fcports list using RCU list access functions. It is possible, because adding to the list is done from the only DPC thread, so it's selfprotected. But the main purpose of this patch is to bring attention to this issue as well as to possibility of existence of similar issues in other places of qla2xxx driver. Particularly, this patch doesn't address the following 2 issues: - It wasn't investigated if there are possible bad consequences if some function, traveling over fcports list, missed just added to it new entry. - It wasn't investigated if it is possible if new entries can be added to fcports list by qla2x00_configure_local_loop() called not from DPC thread. According to the attached call tree made by KScope, qla2x00_configure_local_loop() also can be called from eh_host_reset_handler(). But I'm not sure, if there isn't any protection somewhere against that. Also I wasn't deeply investigated if other fields of struct scsi_qla_host similarly incorrectly unprotected. (BTW, IMHO the whole idea of never deleting entries from fcports list isn't quite good. Sure, it is impossible to have thousands of dead entries, but with virtual ports having hundreds of them is quite possible. I'm not sure if it's good to loose that memory.) This patch is against 2.6.26 Signed-off-by: Vladislav Bolkhovitin <vst@xxxxxxxx> diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_attr.c linux-2.6.26/drivers/scsi/qla2xxx/qla_attr.c --- linux-2.6.26/drivers/scsi/qla2xxx/qla_attr.c 2008-07-14 01:51:29.000000000 +0400 +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_attr.c 2008-07-24 10:08:12.000000000 +0400 @@ -921,7 +921,7 @@ qla2x00_get_starget_node_name(struct scs fc_port_t *fcport; u64 node_name = 0; - list_for_each_entry(fcport, &ha->fcports, list) { + list_for_each_entry_rcu(fcport, &ha->fcports, list) { if (fcport->rport && starget->id == fcport->rport->scsi_target_id) { node_name = wwn_to_u64(fcport->node_name); @@ -940,7 +940,7 @@ qla2x00_get_starget_port_name(struct scs fc_port_t *fcport; u64 port_name = 0; - list_for_each_entry(fcport, &ha->fcports, list) { + list_for_each_entry_rcu(fcport, &ha->fcports, list) { if (fcport->rport && starget->id == fcport->rport->scsi_target_id) { port_name = wwn_to_u64(fcport->port_name); @@ -959,7 +959,7 @@ qla2x00_get_starget_port_id(struct scsi_ fc_port_t *fcport; uint32_t port_id = ~0U; - list_for_each_entry(fcport, &ha->fcports, list) { + list_for_each_entry_rcu(fcport, &ha->fcports, list) { if (fcport->rport && starget->id == fcport->rport->scsi_target_id) { port_id = fcport->d_id.b.domain << 16 | diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h --- linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h 2008-07-14 01:51:29.000000000 +0400 +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_def.h 2008-07-24 10:07:39.000000000 +0400 @@ -2403,7 +2403,14 @@ typedef struct scsi_qla_host { struct list_head work_list; - /* Fibre Channel Device List. */ + /* + * Fibre Channel Device List. + * + * This list can be accessed from many contexts, including concurrently. + * So, although anything never deleted from this list, it needs the + * corresponding protection anyway. Let's do it by making access to + * it using RCU functions. + */ struct list_head fcports; /* RSCN queue. */ diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_init.c linux-2.6.26/drivers/scsi/qla2xxx/qla_init.c --- linux-2.6.26/drivers/scsi/qla2xxx/qla_init.c 2008-07-14 01:51:29.000000000 +0400 +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_init.c 2008-07-24 10:11:56.000000000 +0400 @@ -2071,7 +2071,7 @@ qla2x00_configure_local_loop(scsi_qla_ho /* * Mark local devices that were present with FCF_DEVICE_LOST for now. */ - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (fcport->vp_idx != ha->vp_idx) continue; @@ -2136,7 +2136,7 @@ qla2x00_configure_local_loop(scsi_qla_ho /* Check for matching device in port list. */ found = 0; fcport = NULL; - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (fcport->vp_idx != ha->vp_idx) continue; @@ -2165,7 +2165,7 @@ qla2x00_configure_local_loop(scsi_qla_ho list_add_tail(&new_fcport->vp_fcport, &ha->vp_fcports); } - list_add_tail(&new_fcport->list, &pha->fcports); + list_add_tail_rcu(&new_fcport->list, &pha->fcports); /* Allocate a new replacement fcport. */ fcport = new_fcport; @@ -2405,7 +2405,7 @@ qla2x00_configure_fabric(scsi_qla_host_t * Logout all previous fabric devices marked lost, except * tape devices. */ - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (fcport->vp_idx !=ha->vp_idx) continue; @@ -2439,7 +2439,7 @@ qla2x00_configure_fabric(scsi_qla_host_t * Scan through our port list and login entries that need to be * logged in. */ - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (fcport->vp_idx != ha->vp_idx) continue; @@ -2494,10 +2494,13 @@ qla2x00_configure_fabric(scsi_qla_host_t fcport->vp_idx = ha->vp_idx; list_add_tail(&fcport->vp_fcport, &ha->vp_fcports); - list_move_tail(&fcport->list, + list_del(&fcport->list); + list_add_tail_rcu(&fcport->list, &ha->parent->fcports); - } else - list_move_tail(&fcport->list, &ha->fcports); + } else { + list_del(&fcport->list); + list_add_tail_rcu(&fcport->list, &ha->fcports); + } } } while (0); @@ -2680,7 +2683,7 @@ qla2x00_find_all_fabric_devs(scsi_qla_ho /* Locate matching device in database. */ found = 0; - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (new_fcport->vp_idx != fcport->vp_idx) continue; if (memcmp(new_fcport->port_name, fcport->port_name, @@ -2810,7 +2813,7 @@ qla2x00_find_new_loop_id(scsi_qla_host_t /* Check for loop ID being already in use. */ found = 0; fcport = NULL; - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (fcport->loop_id == dev->loop_id && fcport != dev) { /* ID possibly in use */ found++; @@ -2924,7 +2927,7 @@ qla2x00_device_resync(scsi_qla_host_t *h rval = QLA_SUCCESS; - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (fcport->vp_idx != ha->vp_idx) continue; @@ -3219,7 +3222,7 @@ qla2x00_update_fcports(scsi_qla_host_t * fc_port_t *fcport; /* Go with deferred removal of rport references. */ - list_for_each_entry(fcport, &ha->fcports, list) + list_for_each_entry_rcu(fcport, &ha->fcports, list) if (fcport->drport) qla2x00_rport_del(fcport); } diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_mid.c linux-2.6.26/drivers/scsi/qla2xxx/qla_mid.c --- linux-2.6.26/drivers/scsi/qla2xxx/qla_mid.c 2008-07-14 01:51:29.000000000 +0400 +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_mid.c 2008-07-24 10:12:02.000000000 +0400 @@ -95,7 +95,7 @@ qla2x00_mark_vp_devices_dead(scsi_qla_ho fc_port_t *fcport; scsi_qla_host_t *pha = to_qla_parent(vha); - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (fcport->vp_idx != vha->vp_idx) continue; diff -upr linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c --- linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c 2008-07-14 01:51:29.000000000 +0400 +++ linux-2.6.26/drivers/scsi/qla2xxx/qla_os.c 2008-07-24 10:13:45.000000000 +0400 @@ -1009,7 +1009,7 @@ qla2x00_loop_reset(scsi_qla_host_t *ha) } if (ha->flags.enable_target_reset) { - list_for_each_entry(fcport, &ha->fcports, list) { + list_for_each_entry_rcu(fcport, &ha->fcports, list) { if (fcport->port_type != FCT_TARGET) continue; @@ -1915,7 +1915,7 @@ qla2x00_mark_all_devices_lost(scsi_qla_h fc_port_t *fcport; scsi_qla_host_t *pha = to_qla_parent(ha); - list_for_each_entry(fcport, &pha->fcports, list) { + list_for_each_entry_rcu(fcport, &pha->fcports, list) { if (ha->vp_idx != 0 && ha->vp_idx != fcport->vp_idx) continue; /* @@ -2348,7 +2348,7 @@ qla2x00_do_dpc(void *data) ha->host_no)); next_loopid = 0; - list_for_each_entry(fcport, &ha->fcports, list) { + list_for_each_entry_rcu(fcport, &ha->fcports, list) { /* * If the port is not ONLINE then try to login * to it if we haven't run out of retries. @@ -2524,7 +2524,7 @@ qla2x00_timer(scsi_qla_host_t *ha) * the port it marked DEAD. */ t = 0; - list_for_each_entry(fcport, &ha->fcports, list) { + list_for_each_entry_rcu(fcport, &ha->fcports, list) { if (fcport->port_type != FCT_TARGET) continue;