On Thu, Oct 22, 2015 at 12:37:41AM +0800, Lan Tianyu wrote: > To let VF driver in the guest to know migration status, Qemu will > fake PCI configure reg 0xF0 and 0xF1 to show migrate status and > get ack from VF driver. I guess this works for current devices, but using the 0xF0/0xF1 registers is not architectural, is it? So it could conflict with future devices. Maybe it's better to just have a dedicated para-virtualized device (PCI, ACPI, etc.) for this migration-related activity. This driver would then register with it. > When migration starts, Qemu will set reg "0xF0" to 1, notify > VF driver via triggering mail box msg and wait for VF driver to tell > it's ready for migration(set reg "0xF1" to 1). Waiting for the driver like this is problematic: high load is one of the reasons people migrate VMs out. It would be much better if we could support migration while the VM is completely stopped. > After migration, Qemu > will set reg "0xF0" to 0 and notify VF driver by mail box irq. VF > driver begins to restore tx/rx function after detecting sttatus change. > > When VF receives mail box irq, it will check reg "0xF0" in the service > task function to get migration status and performs related operations > according its value. > > Steps of restarting receive and transmit function > 1) Restore VF status in the PF driver via sending mail event to PF driver > 2) Write back reg values recorded by self emulation layer > 3) Restart rx/tx ring > 4) Recovery interrupt > > Transmit/Receive descriptor head regs are read-only and can't > be restored via writing back recording reg value directly and they > are set to 0 during VF reset. To reuse original tx/rx rings, shift > desc ring in order to move the desc pointed by original head reg to > first entry of the ring and then enable tx/rx rings. VF restarts to > receive and transmit from original head desc. 
> > Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx> > --- > drivers/net/ethernet/intel/ixgbevf/defines.h | 6 ++ > drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 7 +- > drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 115 ++++++++++++++++++++- > .../net/ethernet/intel/ixgbevf/self-emulation.c | 107 +++++++++++++++++++ > 4 files changed, 232 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/ethernet/intel/ixgbevf/defines.h b/drivers/net/ethernet/intel/ixgbevf/defines.h > index 770e21a..113efd2 100644 > --- a/drivers/net/ethernet/intel/ixgbevf/defines.h > +++ b/drivers/net/ethernet/intel/ixgbevf/defines.h > @@ -239,6 +239,12 @@ struct ixgbe_adv_tx_context_desc { > __le32 mss_l4len_idx; > }; > > +union ixgbevf_desc { > + union ixgbe_adv_tx_desc rx_desc; > + union ixgbe_adv_rx_desc tx_desc; > + struct ixgbe_adv_tx_context_desc tx_context_desc; > +}; > + > /* Adv Transmit Descriptor Config Masks */ > #define IXGBE_ADVTXD_DTYP_MASK 0x00F00000 /* DTYP mask */ > #define IXGBE_ADVTXD_DTYP_CTXT 0x00200000 /* Advanced Context Desc */ > diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h > index c823616..6eab402e 100644 > --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h > +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h > @@ -109,7 +109,7 @@ struct ixgbevf_ring { > struct ixgbevf_ring *next; > struct net_device *netdev; > struct device *dev; > - void *desc; /* descriptor ring memory */ > + union ixgbevf_desc *desc; /* descriptor ring memory */ > dma_addr_t dma; /* phys. 
address of descriptor ring */ > unsigned int size; /* length in bytes */ > u16 count; /* amount of descriptors */ > @@ -493,6 +493,11 @@ extern void ixgbevf_write_eitr(struct ixgbevf_q_vector *q_vector); > > void ixgbe_napi_add_all(struct ixgbevf_adapter *adapter); > void ixgbe_napi_del_all(struct ixgbevf_adapter *adapter); > +int ixgbevf_tx_ring_shift(struct ixgbevf_ring *r, u32 head); > +int ixgbevf_rx_ring_shift(struct ixgbevf_ring *r, u32 head); > +void ixgbevf_restore_state(struct ixgbevf_adapter *adapter); > +inline void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter); > + > > #ifdef DEBUG > char *ixgbevf_get_hw_dev_name(struct ixgbe_hw *hw); > diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c > index 056841c..15ec361 100644 > --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c > +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c > @@ -91,6 +91,10 @@ MODULE_DESCRIPTION("Intel(R) 10 Gigabit Virtual Function Network Driver"); > MODULE_LICENSE("GPL"); > MODULE_VERSION(DRV_VERSION); > > + > +#define MIGRATION_COMPLETED 0x00 > +#define MIGRATION_IN_PROGRESS 0x01 > + > #define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV|NETIF_MSG_PROBE|NETIF_MSG_LINK) > static int debug = -1; > module_param(debug, int, 0); > @@ -221,6 +225,78 @@ static u64 ixgbevf_get_tx_completed(struct ixgbevf_ring *ring) > return ring->stats.packets; > } > > +int ixgbevf_tx_ring_shift(struct ixgbevf_ring *r, u32 head) > +{ > + struct ixgbevf_tx_buffer *tx_buffer = NULL; > + static union ixgbevf_desc *tx_desc = NULL; > + > + tx_buffer = vmalloc(sizeof(struct ixgbevf_tx_buffer) * (r->count)); > + if (!tx_buffer) > + return -ENOMEM; > + > + tx_desc = vmalloc(sizeof(union ixgbevf_desc) * r->count); > + if (!tx_desc) > + return -ENOMEM; > + > + memcpy(tx_desc, r->desc, sizeof(union ixgbevf_desc) * r->count); > + memcpy(r->desc, &tx_desc[head], sizeof(union ixgbevf_desc) * (r->count - head)); > + memcpy(&r->desc[r->count - head], 
tx_desc, sizeof(union ixgbevf_desc) * head); > + > + memcpy(tx_buffer, r->tx_buffer_info, sizeof(struct ixgbevf_tx_buffer) * r->count); > + memcpy(r->tx_buffer_info, &tx_buffer[head], sizeof(struct ixgbevf_tx_buffer) * (r->count - head)); > + memcpy(&r->tx_buffer_info[r->count - head], tx_buffer, sizeof(struct ixgbevf_tx_buffer) * head); > + > + if (r->next_to_clean >= head) > + r->next_to_clean -= head; > + else > + r->next_to_clean += (r->count - head); > + > + if (r->next_to_use >= head) > + r->next_to_use -= head; > + else > + r->next_to_use += (r->count - head); > + > + vfree(tx_buffer); > + vfree(tx_desc); > + return 0; > +} > + > +int ixgbevf_rx_ring_shift(struct ixgbevf_ring *r, u32 head) > +{ > + struct ixgbevf_rx_buffer *rx_buffer = NULL; > + static union ixgbevf_desc *rx_desc = NULL; > + > + rx_buffer = vmalloc(sizeof(struct ixgbevf_rx_buffer) * (r->count)); > + if (!rx_buffer) > + return -ENOMEM; > + > + rx_desc = vmalloc(sizeof(union ixgbevf_desc) * (r->count)); > + if (!rx_desc) > + return -ENOMEM; > + > + memcpy(rx_desc, r->desc, sizeof(union ixgbevf_desc) * (r->count)); > + memcpy(r->desc, &rx_desc[head], sizeof(union ixgbevf_desc) * (r->count - head)); > + memcpy(&r->desc[r->count - head], rx_desc, sizeof(union ixgbevf_desc) * head); > + > + memcpy(rx_buffer, r->rx_buffer_info, sizeof(struct ixgbevf_rx_buffer) * (r->count)); > + memcpy(r->rx_buffer_info, &rx_buffer[head], sizeof(struct ixgbevf_rx_buffer) * (r->count - head)); > + memcpy(&r->rx_buffer_info[r->count - head], rx_buffer, sizeof(struct ixgbevf_rx_buffer) * head); > + > + if (r->next_to_clean >= head) > + r->next_to_clean -= head; > + else > + r->next_to_clean += (r->count - head); > + > + if (r->next_to_use >= head) > + r->next_to_use -= head; > + else > + r->next_to_use += (r->count - head); > + > + vfree(rx_buffer); > + vfree(rx_desc); > + return 0; > +} > + > static u32 ixgbevf_get_tx_pending(struct ixgbevf_ring *ring) > { > struct ixgbevf_adapter *adapter = 
netdev_priv(ring->netdev); > @@ -1122,7 +1198,7 @@ static int ixgbevf_busy_poll_recv(struct napi_struct *napi) > * ixgbevf_configure_msix sets up the hardware to properly generate MSI-X > * interrupts. > **/ > -static void ixgbevf_configure_msix(struct ixgbevf_adapter *adapter) > +static void ixgbevf_configure_msix(struct ixgbevf_adapter *adapter) > { > struct ixgbevf_q_vector *q_vector; > int q_vectors, v_idx; > @@ -1534,7 +1610,7 @@ static inline void ixgbevf_irq_disable(struct ixgbevf_adapter *adapter) > * ixgbevf_irq_enable - Enable default interrupt generation settings > * @adapter: board private structure > **/ > -static inline void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter) > +inline void ixgbevf_irq_enable(struct ixgbevf_adapter *adapter) > { > struct ixgbe_hw *hw = &adapter->hw; > > @@ -2901,6 +2977,36 @@ static void ixgbevf_watchdog_subtask(struct ixgbevf_adapter *adapter) > ixgbevf_update_stats(adapter); > } > > +int ixgbevf_live_mg(struct ixgbevf_adapter *adapter) > +{ > + struct pci_dev *pdev = adapter->pdev; > + static int migration_status = MIGRATION_COMPLETED; > + u8 val; > + > + if (migration_status == MIGRATION_COMPLETED) { > + pci_read_config_byte(pdev, 0xf0, &val); > + if (!val) > + return 0; > + > + del_timer_sync(&adapter->service_timer); > + pr_info("migration start\n"); > + migration_status = MIGRATION_IN_PROGRESS; > + > + /* Tell Qemu VF is ready for migration. 
*/ > + pci_write_config_byte(pdev, 0xf1, 0x1); > + return 1; > + } else { > + pci_read_config_byte(pdev, 0xf0, &val); > + if (val) > + return 1; > + > + ixgbevf_restore_state(adapter); > + migration_status = MIGRATION_COMPLETED; > + pr_info("migration end\n"); > + return 0; > + } > +} > + > /** > * ixgbevf_service_task - manages and runs subtasks > * @work: pointer to work_struct containing our data > @@ -2912,6 +3018,11 @@ static void ixgbevf_service_task(struct work_struct *work) > service_task); > struct ixgbe_hw *hw = &adapter->hw; > > + if (ixgbevf_live_mg(adapter)) { > + ixgbevf_service_event_complete(adapter); > + return; > + } > + > if (IXGBE_REMOVED(hw->hw_addr)) { > if (!test_bit(__IXGBEVF_DOWN, &adapter->state)) { > rtnl_lock(); > diff --git a/drivers/net/ethernet/intel/ixgbevf/self-emulation.c b/drivers/net/ethernet/intel/ixgbevf/self-emulation.c > index d74b2da..4476428 100644 > --- a/drivers/net/ethernet/intel/ixgbevf/self-emulation.c > +++ b/drivers/net/ethernet/intel/ixgbevf/self-emulation.c > @@ -9,6 +9,8 @@ > > static u32 hw_regs[0x4000]; > > +#define RESTORE_REG(hw, reg) IXGBE_WRITE_REG(hw, reg, hw_regs[reg]) > + > u32 ixgbe_self_emul_readl(volatile void __iomem *base, u32 addr) > { > u32 tmp; > @@ -24,3 +26,108 @@ void ixgbe_self_emul_writel(u32 val, volatile void __iomem *base, u32 addr) > hw_regs[(unsigned long)addr] = val; > writel(val, (volatile void __iomem *)(base + addr)); > } > + > +static u32 restore_regs[] = { > + IXGBE_VTIVAR(0), > + IXGBE_VTIVAR(1), > + IXGBE_VTIVAR(2), > + IXGBE_VTIVAR(3), > + IXGBE_VTIVAR_MISC, > + IXGBE_VTEITR(0), > + IXGBE_VTEITR(1), > + IXGBE_VFPSRTYPE, > +}; > + > +void ixgbevf_restore_state(struct ixgbevf_adapter *adapter) > +{ > + struct ixgbe_hw *hw = &adapter->hw; > + struct ixgbe_mbx_info *mbx = &hw->mbx; > + int i; > + u32 timeout = IXGBE_VF_INIT_TIMEOUT, rdh, tdh, rxdctl, txdctl; > + u32 wait_loop = 10; > + > + /* VF resetting */ > + IXGBE_WRITE_REG(hw, IXGBE_VFCTRL, IXGBE_CTRL_RST); > + 
IXGBE_WRITE_FLUSH(hw); > + > + while (!mbx->ops.check_for_rst(hw) && timeout) { > + timeout--; > + udelay(5); > + } > + if (!timeout) > + printk(KERN_ERR "[IXGBEVF] Unable to reset VF.\n"); > + > + /* Restoring VF status in the status */ > + hw->mac.ops.notify_resume(hw); > + > + /* Restoring regs value */ > + for (i = 0; i < sizeof(restore_regs)/sizeof(u32); i++) > + writel(hw_regs[restore_regs[i]], (volatile void *)(restore_regs[i] + hw->hw_addr)); > + > + /* Restoring rx ring */ > + for (i = 0; i < adapter->num_rx_queues; i++) { > + if (hw_regs[IXGBE_VFRXDCTL(i)] & IXGBE_RXDCTL_ENABLE) { > + RESTORE_REG(hw, IXGBE_VFRDBAL(i)); > + RESTORE_REG(hw, IXGBE_VFRDBAH(i)); > + RESTORE_REG(hw, IXGBE_VFRDLEN(i)); > + RESTORE_REG(hw, IXGBE_VFDCA_RXCTRL(i)); > + RESTORE_REG(hw, IXGBE_VFSRRCTL(i)); > + > + rdh = adapter->rx_ring[i]->next_to_clean; > + while (IXGBEVF_RX_DESC(adapter->rx_ring[i], rdh)->wb.upper.status_error > + & cpu_to_le32(IXGBE_RXD_STAT_DD)) > + rdh = (rdh + 1) % adapter->rx_ring[i]->count; > + > + ixgbevf_rx_ring_shift(adapter->rx_ring[i], rdh); > + > + wait_loop = 10; > + RESTORE_REG(hw, IXGBE_VFRXDCTL(i)); > + do { > + udelay(10); > + rxdctl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)); > + } while (--wait_loop && !(rxdctl & IXGBE_RXDCTL_ENABLE)); > + > + if (!wait_loop) > + pr_err("RXDCTL.ENABLE queue %d not cleared while polling\n", > + i); > + > + IXGBE_WRITE_REG(hw, IXGBE_VFRDT(i), adapter->rx_ring[i]->next_to_use); > + } > + } > + > + /* Restoring tx ring */ > + for (i = 0; i < adapter->num_tx_queues; i++) { > + if (hw_regs[IXGBE_VFTXDCTL(i)] & IXGBE_TXDCTL_ENABLE) { > + RESTORE_REG(hw, IXGBE_VFTDBAL(i)); > + RESTORE_REG(hw, IXGBE_VFTDBAH(i)); > + RESTORE_REG(hw, IXGBE_VFTDLEN(i)); > + RESTORE_REG(hw, IXGBE_VFDCA_TXCTRL(i)); > + > + tdh = adapter->tx_ring[i]->next_to_clean; > + while (IXGBEVF_TX_DESC(adapter->tx_ring[i], tdh)->wb.status > + & cpu_to_le32(IXGBE_TXD_STAT_DD)) > + tdh = (tdh + 1) % adapter->rx_ring[i]->count; > + 
ixgbevf_tx_ring_shift(adapter->tx_ring[i], tdh); > + > + wait_loop = 10; > + RESTORE_REG(hw, IXGBE_VFTXDCTL(i)); > + do { > + udelay(2000); > + txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i)); > + } while (--wait_loop && !(txdctl & IXGBE_TXDCTL_ENABLE)); > + > + if (!wait_loop) > + pr_err("Could not enable Tx Queue %d\n", i); > + > + IXGBE_WRITE_REG(hw, IXGBE_VFTDT(i), adapter->tx_ring[i]->next_to_use); > + } > + } > + > + /* Restore irq */ > + IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, hw_regs[IXGBE_VTEIMS] & 0x7); > + IXGBE_WRITE_REG(hw, IXGBE_VTEIMC, (~hw_regs[IXGBE_VTEIMS]) & 0x7); > + IXGBE_WRITE_REG(hw, IXGBE_VTEICS, hw_regs[IXGBE_VTEICS]); > + > + ixgbevf_irq_enable(adapter); > +} > + > -- > 1.8.4.rc0.1.g8f6a3e5.dirty > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html