From: Karen Xie <kxie@xxxxxxxxxxx> New cxgb3i iscsi driver. The driver interfaces with cxgb3 driver to access the hardware. Signed-off-by: Karen Xie <kxie@xxxxxxxxxxx> --- drivers/scsi/Kconfig | 2 drivers/scsi/Makefile | 1 drivers/scsi/cxgb3i/Kconfig | 6 drivers/scsi/cxgb3i/Makefile | 5 drivers/scsi/cxgb3i/cxgb3i.h | 190 ++ drivers/scsi/cxgb3i/cxgb3i_init.c | 107 + drivers/scsi/cxgb3i/cxgb3i_iscsi.c | 797 ++++++++++ drivers/scsi/cxgb3i/cxgb3i_offload.c | 2808 ++++++++++++++++++++++++++++++++++ drivers/scsi/cxgb3i/cxgb3i_offload.h | 259 +++ drivers/scsi/cxgb3i/cxgb3i_ulp2.c | 722 +++++++++ drivers/scsi/cxgb3i/cxgb3i_ulp2.h | 102 + security/security.c | 1 12 files changed, 5000 insertions(+), 0 deletions(-) create mode 100644 drivers/scsi/cxgb3i/Kconfig create mode 100644 drivers/scsi/cxgb3i/Makefile create mode 100644 drivers/scsi/cxgb3i/cxgb3i.h create mode 100644 drivers/scsi/cxgb3i/cxgb3i_init.c create mode 100644 drivers/scsi/cxgb3i/cxgb3i_iscsi.c create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.c create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.h create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.c create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.h diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 22070e9..5ae06a8 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1759,6 +1759,8 @@ config ZFCP source "drivers/scsi/bnx2i/Kconfig" +source "drivers/scsi/cxgb3i/Kconfig" + config SCSI_SRP tristate "SCSI RDMA Protocol helper library" depends on SCSI && PCI diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index a3f6866..b830af3 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_SCSI_STEX) += stex.o obj-$(CONFIG_SCSI_MVSAS) += mvsas.o obj-$(CONFIG_PS3_ROM) += ps3rom.o obj-$(CONFIG_SCSI_BNX2_ISCSI) += bnx2i/ +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += cxgb3i/ obj-$(CONFIG_ARM) += arm/ diff --git a/drivers/scsi/cxgb3i/Kconfig b/drivers/scsi/cxgb3i/Kconfig new file mode 100644 index 
0000000..2762814 --- /dev/null +++ b/drivers/scsi/cxgb3i/Kconfig @@ -0,0 +1,6 @@ +config SCSI_CXGB3_ISCSI + tristate "Chelsio S3xx iSCSI support" + select CHELSIO_T3 + select SCSI_ISCSI_ATTRS + ---help--- + This driver supports iSCSI offload for the Chelsio S3 series devices. diff --git a/drivers/scsi/cxgb3i/Makefile b/drivers/scsi/cxgb3i/Makefile new file mode 100644 index 0000000..8c8a894 --- /dev/null +++ b/drivers/scsi/cxgb3i/Makefile @@ -0,0 +1,5 @@ +EXTRA_CFLAGS += -I$(TOPDIR)/drivers/net/cxgb3 + +cxgb3i-y := cxgb3i_init.o cxgb3i_iscsi.o cxgb3i_ulp2.o cxgb3i_offload.o + +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += cxgb3i.o diff --git a/drivers/scsi/cxgb3i/cxgb3i.h b/drivers/scsi/cxgb3i/cxgb3i.h new file mode 100644 index 0000000..39a3b94 --- /dev/null +++ b/drivers/scsi/cxgb3i/cxgb3i.h @@ -0,0 +1,190 @@ +/* + * cxgb3i.h: Chelsio S3xx iSCSI driver. + * + * Copyright (c) 2008 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ * + * Written by: Karen Xie (kxie@xxxxxxxxxxx) + */ + +#ifndef __CXGB3I_H__ +#define __CXGB3I_H__ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/in.h> +#include <linux/kfifo.h> +#include <linux/netdevice.h> +#include <linux/completion.h> +#include <linux/scatterlist.h> + +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> +#include <scsi/scsi_eh.h> +#include <scsi/scsi_host.h> +#include <scsi/scsi.h> +#include <scsi/iscsi_proto.h> +#include <scsi/libiscsi.h> +#include <scsi/scsi_transport_iscsi.h> +#include <linux/crypto.h> +#include "../iscsi_tcp.h" + +/* from cxgb3 LLD */ +#include "common.h" +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_ctl_defs.h" +#include "cxgb3_offload.h" +#include "firmware_exports.h" +#include "cxgb3i_offload.h" + +/** + * message + */ +#define cxgb3i_log_error(fmt...) printk(KERN_ERR "cxgb3i: ERR! " fmt) +#define cxgb3i_log_warn(fmt...) printk(KERN_WARNING "cxgb3i: WARN! " fmt) +#define cxgb3i_log_info(fmt...) printk(KERN_INFO "cxgb3i: " fmt) + +#ifdef __DEBUG_CXGB3I__ +#define cxgb3i_log_debug(fmt, args...) \ + printk(KERN_ERR "cxgb3i: %s - " fmt, __func__ , ## args) +#else +#define cxgb3i_log_debug(fmt...) 
+#endif + +#define CXGB3I_SCSI_QDEPTH_DFLT 128 + +struct cxgb3i_adapter; +struct cxgb3i_hba; +struct cxgb3i_endpoint; + +/** + * struct cxgb3i_tag_format - cxgb3i ulp tag for steering pdu payload + * + * @rsvd_bits: # of bits used by h/w + * @rsvd_shift: shift left + * @rsvd_mask: bit mask + * + */ +struct cxgb3i_tag_format { + unsigned char idx_bits; + unsigned char age_bits; + unsigned char rsvd_bits; + unsigned char rsvd_shift; + u32 rsvd_mask; +}; + +/** + * struct cxgb3i_ddp_info - cxgb3i direct data placement for pdu payload + * + * @llimit: lower bound of the page pod memory + * @ulimit: upper bound of the page pod memory + * @nppods: # of page pod entries + * @idx_last: page pod entry last used + * @map_lock: lock to synchonize access to the page pod map + * @map: page pod map + */ +struct cxgb3i_ddp_info { + unsigned int llimit; + unsigned int ulimit; + unsigned int nppods; + unsigned int idx_last; + spinlock_t map_lock; + u8 *map; +}; + +struct cxgb3i_hba { + struct cxgb3i_adapter *snic; + struct net_device *ndev; + struct Scsi_Host *shost; + + rwlock_t cconn_rwlock; + struct list_head cconn_list; +}; + +struct cxgb3i_adapter { + struct list_head list_head; + spinlock_t lock; + struct t3cdev *tdev; + struct pci_dev *pdev; + unsigned char hba_cnt; + struct cxgb3i_hba *hba[MAX_NPORTS]; + + unsigned int tx_max_size; + unsigned int rx_max_size; + + struct cxgb3i_tag_format tag_format; + struct cxgb3i_ddp_info ddp; +}; + +struct cxgb3i_conn { + struct list_head list_head; + + struct cxgb3i_endpoint *cep; + struct iscsi_conn *conn; + struct cxgb3i_hba *hba; +}; + +struct cxgb3i_endpoint { + struct socket *sock; + struct cxgb3i_hba *hba; + struct cxgb3i_conn *cconn; +}; + +int cxgb3i_iscsi_init(void); +void cxgb3i_iscsi_cleanup(void); + +struct cxgb3i_adapter *cxgb3i_adapter_find_by_tdev(struct t3cdev *); +struct cxgb3i_adapter *cxgb3i_adapter_add(struct t3cdev *); +void cxgb3i_adapter_remove(struct cxgb3i_adapter *); +int cxgb3i_adapter_ulp_init(struct 
cxgb3i_adapter *); +void cxgb3i_adapter_ulp_cleanup(struct cxgb3i_adapter *); + +struct cxgb3i_hba *cxgb3i_hba_find_by_netdev(struct net_device *); +struct cxgb3i_hba *cxgb3i_hba_host_add(struct cxgb3i_adapter *, + struct net_device *); +void cxgb3i_hba_host_remove(struct cxgb3i_hba *); + +void cxgb3i_hba_conn_add(struct cxgb3i_conn *, struct cxgb3i_hba *); +void cxgb3i_hba_conn_remove(struct cxgb3i_conn *); + +int cxgb3i_ulp2_init(void); +void cxgb3i_ulp2_cleanup(void); +int cxgb3i_conn_ulp_setup(struct cxgb3i_conn *, int, int); + +void cxgb3i_ddp_tag_release(struct cxgb3i_adapter *, u32, + struct scatterlist *, unsigned int); +u32 cxgb3i_ddp_tag_reserve(struct cxgb3i_adapter *, unsigned int, + u32, unsigned int, struct scatterlist *, + unsigned int); +static inline void cxgb3i_parse_tag(struct cxgb3i_tag_format *format, + u32 tag, u32 *rsvd_bits, u32 *sw_bits) +{ + if (rsvd_bits) + *rsvd_bits = (tag >> format->rsvd_shift) & format->rsvd_mask; + if (sw_bits) { + *sw_bits = (tag >> (format->rsvd_shift + format->rsvd_bits)) + << format->rsvd_shift; + *sw_bits |= tag & ((1 << format->rsvd_shift) - 1); + } +} + +void cxgb3i_sk_set_callbacks(struct sock *, struct iscsi_conn *); +void cxgb3i_sk_restore_callbacks(struct sock *, struct iscsi_conn *); + +int cxgb3i_conn_ulp2_xmit(struct iscsi_conn *); + +void cxgb3i_display_byte_string(char *, unsigned char *, int, int); + +#endif diff --git a/drivers/scsi/cxgb3i/cxgb3i_init.c b/drivers/scsi/cxgb3i/cxgb3i_init.c new file mode 100644 index 0000000..b848e4c --- /dev/null +++ b/drivers/scsi/cxgb3i/cxgb3i_init.c @@ -0,0 +1,107 @@ +/* cxgb3i_init.c: Chelsio S3xx iSCSI driver. + * + * Copyright (c) 2008 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ *
+ * Written by: Karen Xie (kxie@xxxxxxxxxxx)
+ */
+
+#include "cxgb3i.h"
+
+#define DRV_MODULE_NAME		"cxgb3i"
+#define DRV_MODULE_VERSION	"1.0.0"
+#define DRV_MODULE_RELDATE	"May 1, 2008"
+
+static char version[] __devinitdata =
+	"Chelsio S3xx iSCSI Driver " DRV_MODULE_NAME
+	" v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+
+MODULE_AUTHOR("Karen Xie <kxie@xxxxxxxxxxx>");
+MODULE_DESCRIPTION("Chelsio S3xx iSCSI Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+static void open_s3_dev(struct t3cdev *);
+static void close_s3_dev(struct t3cdev *);
+
+/* CPL message handler table handed to cxgb3i_tcp_init() and to the client */
+cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS];
+
+/* client descriptor registered with the cxgb3 LLD in cxgb3i_init_module() */
+struct cxgb3_client t3c_client = {
+	.name = "iscsi_cxgb3",
+	.handlers = cxgb3i_cpl_handlers,
+	.add = open_s3_dev,
+	.remove = close_s3_dev,
+};
+
+/**
+ * open_s3_dev - register with cxgb3 LLD
+ * @t3dev: cxgb3 adapter instance
+ *
+ * Installed as the ->add hook of t3c_client.  Prints the driver banner
+ * the first time it runs, then sets up the offload and iSCSI state for
+ * the adapter.
+ */
+static void open_s3_dev(struct t3cdev *t3dev)
+{
+	static int vers_printed;
+
+	if (!vers_printed) {
+		printk(KERN_INFO "%s", version);
+		/* remember it, otherwise the banner repeats per adapter */
+		vers_printed = 1;
+	}
+
+	cxgb3i_log_debug("open cxgb3 %s.\n", t3dev->name);
+
+	cxgb3i_tcp_add(t3dev, &t3c_client);
+	cxgb3i_adapter_add(t3dev);
+}
+
+/**
+ * close_s3_dev - de-register with cxgb3 LLD
+ * @t3dev: cxgb3 adapter instance
+ *
+ * Installed as the ->remove hook of t3c_client.  Removes the adapter
+ * instance (if one was created for @t3dev) before the offload state.
+ */
+static void close_s3_dev(struct t3cdev *t3dev)
+{
+	struct cxgb3i_adapter *snic = cxgb3i_adapter_find_by_tdev(t3dev);
+	cxgb3i_log_debug("close cxgb3 %s.\n", t3dev->name);
+	if (snic)
+		cxgb3i_adapter_remove(snic);
+	cxgb3i_tcp_remove(t3dev);
+}
+
+/**
+ * cxgb3i_init_module - module init entry point
+ *
+ * initialize any driver wide global data structures and register itself
+ * with the cxgb3 module
+ *
+ * Returns 0 on success or the negative error code of the step that failed.
+ */
+static int __init cxgb3i_init_module(void)
+{
+	int err;
+
+	err = cxgb3i_tcp_init(cxgb3i_cpl_handlers);
+	if (err < 0)
+		return err;
+
+	err = cxgb3i_iscsi_init();
+	if (err < 0)
+		return err;
+
+	err = cxgb3i_ulp2_init();
+	if (err < 0) {
+		/*
+		 * The iscsi transport is already registered at this point;
+		 * unregister it so a failed load does not leave it behind.
+		 */
+		cxgb3i_iscsi_cleanup();
+		return err;
+	}
+
+	cxgb3_register_client(&t3c_client);
+	return 0;
+}
+
+/**
+ * cxgb3i_exit_module - module
cleanup/exit entry point + * + * go through the driver hba list and for each hba, release any resource held. + * and unregisters iscsi transport and the cxgb3 module + */ +static void __exit cxgb3i_exit_module(void) +{ + cxgb3_unregister_client(&t3c_client); + cxgb3i_ulp2_cleanup(); + cxgb3i_iscsi_cleanup(); +} + +module_init(cxgb3i_init_module); +module_exit(cxgb3i_exit_module); diff --git a/drivers/scsi/cxgb3i/cxgb3i_iscsi.c b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c new file mode 100644 index 0000000..ea7e21f --- /dev/null +++ b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c @@ -0,0 +1,797 @@ +/* cxgb3i_iscsi.c: Chelsio S3xx iSCSI driver. + * + * Copyright (c) 2008 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@xxxxxxxxxxx) + */ + +#include <net/tcp.h> +#include "cxgb3i.h" + +static struct scsi_transport_template *cxgb3i_scsi_transport; +static struct scsi_host_template cxgb3i_host_template; +static struct iscsi_transport cxgb3i_iscsi_transport; + +static LIST_HEAD(cxgb3i_snic_list); +static DEFINE_RWLOCK(cxgb3i_snic_rwlock); + +/** + * cxgb3i_adapter_add - initialize a s3 adapter structure and any h/w settings + * necessary + * @snic: pointer to adapter instance + */ +struct cxgb3i_adapter *cxgb3i_adapter_add(struct t3cdev *t3dev) +{ + struct cxgb3i_adapter *snic; + struct adapter *adapter = tdev2adap(t3dev); + int i; + + snic = kzalloc(sizeof(*snic), GFP_KERNEL); + if (!snic) { + cxgb3i_log_debug("cxgb3 %s, OOM.\n", t3dev->name); + return NULL; + } + + spin_lock_init(&snic->lock); + snic->tdev = t3dev; + snic->pdev = adapter->pdev; + + if (cxgb3i_adapter_ulp_init(snic)) + goto free_snic; + + for_each_port(adapter, i) { + snic->hba[i] = cxgb3i_hba_host_add(snic, adapter->port[i]); + if (!snic->hba[i]) + goto ulp_cleanup; + } + snic->hba_cnt = adapter->params.nports; + + /* add to 
the list */
+	write_lock(&cxgb3i_snic_rwlock);
+	list_add_tail(&snic->list_head, &cxgb3i_snic_list);
+	write_unlock(&cxgb3i_snic_rwlock);
+
+	return snic;
+
+ulp_cleanup:
+	/*
+	 * A port failed part-way through: tear down the hosts that were
+	 * already registered for earlier ports.  snic was zero-allocated,
+	 * so slots that were never filled are still NULL.
+	 */
+	for (i = 0; i < MAX_NPORTS; i++) {
+		if (snic->hba[i]) {
+			cxgb3i_hba_host_remove(snic->hba[i]);
+			snic->hba[i] = NULL;
+		}
+	}
+	cxgb3i_adapter_ulp_cleanup(snic);
+free_snic:
+	kfree(snic);
+	return NULL;
+}
+
+/**
+ * cxgb3i_adapter_remove - release all the resources held and cleanup any h/w
+ *	settings necessary
+ * @snic: pointer to adapter instance
+ *
+ * Unlinks @snic from the global adapter list, removes every per-port host,
+ * releases the ulp (ddp) resources and frees @snic itself.
+ */
+void cxgb3i_adapter_remove(struct cxgb3i_adapter *snic)
+{
+	int i;
+
+	/* remove from the list */
+	write_lock(&cxgb3i_snic_rwlock);
+	list_del(&snic->list_head);
+	write_unlock(&cxgb3i_snic_rwlock);
+
+	for (i = 0; i < snic->hba_cnt; i++) {
+		if (snic->hba[i]) {
+			cxgb3i_hba_host_remove(snic->hba[i]);
+			snic->hba[i] = NULL;
+		}
+	}
+
+	/* release ddp resources */
+	cxgb3i_adapter_ulp_cleanup(snic);
+	kfree(snic);
+}
+
+/**
+ * cxgb3i_adapter_find_by_tdev - look up the adapter bound to a t3cdev
+ * @t3dev: cxgb3 offload device
+ *
+ * Walks the global adapter list under the read lock and returns the entry
+ * whose tdev matches @t3dev, or NULL if there is none.
+ */
+struct cxgb3i_adapter *cxgb3i_adapter_find_by_tdev(struct t3cdev *t3dev)
+{
+	struct cxgb3i_adapter *snic;
+
+	read_lock(&cxgb3i_snic_rwlock);
+	list_for_each_entry(snic, &cxgb3i_snic_list, list_head) {
+		if (snic->tdev == t3dev) {
+			read_unlock(&cxgb3i_snic_rwlock);
+			return snic;
+		}
+	}
+	read_unlock(&cxgb3i_snic_rwlock);
+
+	return NULL;
+}
+
+/**
+ * cxgb3i_hba_find_by_netdev - look up the hba that sits on a net device
+ * @ndev: net device to match
+ *
+ * Scans the first hba_cnt hbas of every listed adapter under the read lock
+ * and returns the one whose ndev is @ndev, or NULL if there is none.
+ */
+struct cxgb3i_hba *cxgb3i_hba_find_by_netdev(struct net_device *ndev)
+{
+	struct cxgb3i_adapter *snic;
+	int i;
+
+	read_lock(&cxgb3i_snic_rwlock);
+	list_for_each_entry(snic, &cxgb3i_snic_list, list_head) {
+		for (i = 0; i < snic->hba_cnt; i++) {
+			if (snic->hba[i]->ndev == ndev) {
+				read_unlock(&cxgb3i_snic_rwlock);
+				return (snic->hba[i]);
+			}
+		}
+	}
+	read_unlock(&cxgb3i_snic_rwlock);
+	return NULL;
+}
+
+/* record @hba in @cconn and append @cconn to the hba's connection list */
+void cxgb3i_hba_conn_add(struct cxgb3i_conn *cconn, struct cxgb3i_hba *hba)
+{
+	cconn->hba = hba;
+	write_lock(&hba->cconn_rwlock);
+	list_add_tail(&cconn->list_head, &hba->cconn_list);
+	write_unlock(&hba->cconn_rwlock);
+}
+
+/* unlink @cconn from its hba's connection list, if it has an hba */
+void cxgb3i_hba_conn_remove(struct cxgb3i_conn *cconn)
+{
+	struct cxgb3i_hba *hba = cconn->hba;
+
+	if (hba) {
+		write_lock(&hba->cconn_rwlock);
+		list_del(&cconn->list_head);
+ write_unlock(&hba->cconn_rwlock); + } +} + +struct cxgb3i_hba *cxgb3i_hba_host_add(struct cxgb3i_adapter *snic, + struct net_device *ndev) +{ + struct cxgb3i_hba *hba; + struct Scsi_Host *shost; + int err; + + shost = iscsi_host_alloc(&cxgb3i_host_template, + sizeof(struct cxgb3i_hba), + CXGB3I_SCSI_QDEPTH_DFLT); + if (!shost) { + cxgb3i_log_info("iscsi_host_alloc failed.\n"); + return NULL; + } + + shost->transportt = cxgb3i_scsi_transport; + shost->max_lun = 512; + shost->max_id = 0; + shost->max_channel = 0; + shost->max_cmd_len = 16; + + hba = iscsi_host_priv(shost); + INIT_LIST_HEAD(&hba->cconn_list); + rwlock_init(&hba->cconn_rwlock); + hba->snic = snic; + hba->ndev = ndev; + hba->shost = shost; + + pci_dev_get(snic->pdev); + err = iscsi_host_add(shost, &snic->pdev->dev); + if (err) { + cxgb3i_log_info("iscsi_host_add failed.\n"); + goto pci_dev_put; + } + + cxgb3i_log_debug("shost 0x%p, hba 0x%p, no %u.\n", + shost, hba, shost->host_no); + + return hba; + +pci_dev_put: + pci_dev_put(snic->pdev); + scsi_host_put(shost); + return NULL; +} + +void cxgb3i_hba_host_remove(struct cxgb3i_hba *hba) +{ + if (hba->shost) { + cxgb3i_log_debug("shost 0x%p, hba 0x%p, no %u.\n", + hba->shost, hba, hba->shost->host_no); + iscsi_host_remove(hba->shost); + pci_dev_put(hba->snic->pdev); + /* cleanup connections ? 
*/
+		iscsi_host_free(hba->shost);
+	}
+}
+
+/**
+ * cxgb3i_ep_connect - establish TCP connection to target portal
+ * @dst_addr: target IP address
+ * @non_blocking: blocking or non-blocking call (not used by this function)
+ *
+ * Creates a kernel TCP socket, connects it to @dst_addr through
+ * cxgb3i_tcp_connect() and checks that the connection is offloaded, in
+ * iSCSI ULP mode and routed through a device this driver knows about.
+ * On success the socket and hba are stored in a new iscsi endpoint.
+ *
+ * Returns the endpoint, or NULL on any failure (the socket is released).
+ *
+ * NOTE(review): the iSCSI transport class may expect an ERR_PTR() value
+ * rather than NULL on failure -- confirm against the ->ep_connect caller.
+ */
+static struct iscsi_endpoint *cxgb3i_ep_connect(struct sockaddr *dst_addr,
+						int non_blocking)
+{
+	struct iscsi_endpoint *ep;
+	struct cxgb3i_endpoint *cep;
+	struct cxgb3i_hba *hba;
+	struct socket *sock;
+	struct sock *sk;
+	struct tcp_sock *tp;
+	struct dst_entry *dst;
+	int err;
+
+	err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0)
+		return NULL;
+	sk = sock->sk;
+
+	sk->sk_allocation = GFP_ATOMIC;
+	sk->sk_reuse = 1;
+	tp = tcp_sk(sk);
+	tp->nonagle |= TCP_NAGLE_OFF;
+	err = cxgb3i_tcp_connect(sock, dst_addr, sizeof(struct sockaddr),
+				 ULP_MODE_ISCSI);
+	if (err < 0) {
+		cxgb3i_log_info("sock 0x%p, connect failed %d.\n", sock, err);
+		goto release_sock;
+	}
+	if (!c3cn_flag(sock->sk, C3CN_OFFLOADED)) {
+		cxgb3i_log_info("sock 0x%p, NOT offloaded.\n", sock);
+		goto release_sock;
+	}
+	if (C3CN_ULP_MODE(sk) != ULP_MODE_ISCSI) {
+		cxgb3i_log_info("sock 0x%p, mode 0x%x, NOT expected.\n",
+				sock, C3CN_ULP_MODE(sk));
+		goto release_sock;
+	}
+
+	/* the socket may have no cached route; don't dereference blindly */
+	dst = __sk_dst_get(sk);
+	if (!dst || !dst->dev) {
+		cxgb3i_log_info("sock 0x%p, no route/device.\n", sock);
+		goto release_sock;
+	}
+	hba = cxgb3i_hba_find_by_netdev(dst->dev);
+	if (!hba) {
+		cxgb3i_log_info("NOT going through cxgbi device.\n");
+		goto release_sock;
+	}
+
+	ep = iscsi_create_endpoint(sizeof(*cep));
+	if (!ep) {
+		cxgb3i_log_info("iscsi alloc ep, OOM.\n");
+		goto release_sock;
+	}
+	cep = ep->dd_data;
+	cep->sock = sock;
+	cep->hba = hba;
+
+	cxgb3i_log_debug("sock 0x%p, iscsi_ep 0x%p, cxgb_ep 0x%p, hba 0x%p.\n",
+			 sock, ep, cep, hba);
+	return ep;
+
+release_sock:
+	sock_release(sock);
+	return NULL;
+}
+
+/**
+ * cxgb3i_ep_poll - polls for TCP connection establishment
+ * @ep: TCP connection (endpoint) handle
+ * @timeout_ms: timeout value in milli secs
+ *
+ * polls for TCP connect request to complete
+ */
+static int cxgb3i_ep_poll(struct iscsi_endpoint *ep, int
timeout_ms) +{ + return 1; +} + +/** + * cxgb3i_ep_disconnect - teardown TCP connection + * @ep: TCP connection (endpoint) handle + * + * teardown TCP connection + */ +static void cxgb3i_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct cxgb3i_endpoint *cep = (struct cxgb3i_endpoint *)ep->dd_data; + struct cxgb3i_conn *cconn = cep->cconn; + + cxgb3i_log_debug("ep 0x%p, cep 0x%p.\n", ep, cep); + + if (cconn && cconn->conn) { + struct iscsi_conn *conn = cconn->conn; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + cxgb3i_sk_restore_callbacks(cep->sock->sk, conn); + write_lock_bh(&cep->sock->sk->sk_callback_lock); + set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); + cconn->cep = NULL; + tcp_conn->sock = NULL; + write_unlock_bh(&cep->sock->sk->sk_callback_lock); + } + + sock_release(cep->sock); + iscsi_destroy_endpoint(ep); +} + +/** + * cxgb3i_session_create - create a new iscsi session + * @cmds_max: max # of commands + * @qdepth: scsi queue depth + * @initial_cmdsn: initial iscsi CMDSN for this session + * @host_no: pointer to return host no + * + * Creates a new iSCSI session + */ +static struct iscsi_cls_session *cxgb3i_session_create(struct iscsi_endpoint + *ep, uint16_t cmds_max, + uint16_t qdepth, + uint32_t initial_cmdsn, + uint32_t *host_no) +{ + struct cxgb3i_endpoint *cep; + struct cxgb3i_hba *hba; + struct Scsi_Host *shost; + struct iscsi_cls_session *cls_session; + struct iscsi_session *session; + int i; + + if (!ep) { + cxgb3i_log_error("%s, missing endpoint.\n", __func__); + return NULL; + } + + cep = (struct cxgb3i_endpoint *)ep->dd_data; + hba = cep->hba; + shost = hba->shost; + cxgb3i_log_debug("ep 0x%p, cep 0x%p, hba 0x%p.\n", ep, cep, hba); + BUG_ON(hba != iscsi_host_priv(shost)); + + *host_no = shost->host_no; + + cls_session = iscsi_session_setup(&cxgb3i_iscsi_transport, shost, + cmds_max, + sizeof(struct iscsi_tcp_task), + initial_cmdsn, ISCSI_MAX_TARGET); + if (!cls_session) + return NULL; + + session = cls_session->dd_data; + + for (i = 
0; i < session->cmds_max; i++) { + struct iscsi_task *task = session->cmds[i]; + struct iscsi_tcp_task *tcp_task = task->dd_data; + + task->hdr = &tcp_task->hdr.cmd_hdr; + task->hdr_max = sizeof(tcp_task->hdr) - ISCSI_DIGEST_SIZE; + } + + if (iscsi_r2tpool_alloc(session)) + goto remove_session; + + return cls_session; + +remove_session: + iscsi_session_teardown(cls_session); + return NULL; +} + +/** + * cxgb3i_session_destroy - destroys iscsi session + * @cls_session: pointer to iscsi cls session + * + * Destroys an iSCSI session instance and releases its all resources held + */ +static void cxgb3i_session_destroy(struct iscsi_cls_session *cls_session) +{ + cxgb3i_log_debug("sess 0x%p.\n", cls_session); + iscsi_r2tpool_free(cls_session->dd_data); + iscsi_session_teardown(cls_session); +} + +/** + * cxgb3i_conn_create - create iscsi connection instance + * @cls_session: pointer to iscsi cls session + * @cid: iscsi cid + * + * Creates a new iSCSI connection instance for a given session + */ +static struct iscsi_cls_conn *cxgb3i_conn_create(struct iscsi_cls_session + *cls_session, uint32_t cid) +{ + struct iscsi_cls_conn *cls_conn; + struct iscsi_conn *conn; + struct iscsi_tcp_conn *tcp_conn; + struct cxgb3i_conn *cconn; + + cxgb3i_log_debug("sess 0x%p, cid %u.\n", cls_session, cid); + + cls_conn = iscsi_conn_setup(cls_session, + sizeof(*tcp_conn) + sizeof(*cconn), cid); + if (!cls_conn) + return NULL; + conn = cls_conn->dd_data; + + conn->max_recv_dlength = ISCSI_DEF_MAX_RECV_SEG_LEN; + + tcp_conn = conn->dd_data; + tcp_conn->iscsi_conn = conn; + + cconn = (struct cxgb3i_conn *)(tcp_conn + 1); + cconn->conn = conn; + + return cls_conn; +} + +/** + * cxgb3i_conn_bind - binds iscsi sess, conn and endpoint together + * @cls_session: pointer to iscsi cls session + * @cls_conn: pointer to iscsi cls conn + * @transport_eph: 64-bit EP handle + * @is_leading: leading connection on this session? 
+ * + * Binds together an iSCSI session, an iSCSI connection and a + * TCP connection. This routine returns error code if the TCP + * connection does not belong on the device iSCSI sess/conn is bound + */ + +static int cxgb3i_conn_bind(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, + uint64_t transport_eph, int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1); + struct iscsi_endpoint *ep; + struct cxgb3i_endpoint *cep; + struct socket *sock; + int err; + + ep = iscsi_lookup_endpoint(transport_eph); + if (!ep) + return -EINVAL; + + cxgb3i_log_debug("ep 0x%p, cls sess 0x%p, cls conn 0x%p.\n", + ep, cls_session, cls_conn); + + err = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (err) + return -EINVAL; + + cep = (struct cxgb3i_endpoint *)ep->dd_data; + sock = cep->sock; + + tcp_conn->sock = sock; + cconn->hba = cep->hba; + cconn->cep = cep; + cep->cconn = cconn; + + spin_lock_bh(&conn->session->lock); + sprintf(conn->portal_address, NIPQUAD_FMT, + NIPQUAD(inet_sk(sock->sk)->daddr)); + conn->portal_port = ntohs(inet_sk(sock->sk)->dport); + spin_unlock_bh(&conn->session->lock); + + cxgb3i_sk_set_callbacks(sock->sk, conn); + iscsi_tcp_hdr_recv_prep(tcp_conn); + + return 0; +} + +/** + * cxgb3i_conn_flush - flush tx + * @conn: pointer to iscsi conn + */ +static int cxgb3i_conn_flush(struct iscsi_conn *conn) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct iscsi_segment *segment = &tcp_conn->out.segment; + + if (segment->total_copied < segment->total_size) + return cxgb3i_conn_ulp2_xmit(conn); + return 0; +} + +/** + * cxgb3i_conn_get_param - return iscsi connection parameter to caller + * @cls_conn: pointer to iscsi cls conn + * @param: parameter type identifier + * @buf: buffer pointer + * + * returns iSCSI connection parameters + */ +static int cxgb3i_conn_get_param(struct iscsi_cls_conn 
*cls_conn,
+				 enum iscsi_param param, char *buf)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	int len;
+
+	cxgb3i_log_debug("cls_conn 0x%p, param %d.\n", cls_conn, param);
+
+	switch (param) {
+	case ISCSI_PARAM_CONN_PORT:
+		/* portal_port is written under the session lock in bind */
+		spin_lock_bh(&conn->session->lock);
+		len = sprintf(buf, "%hu\n", conn->portal_port);
+		spin_unlock_bh(&conn->session->lock);
+		break;
+	case ISCSI_PARAM_CONN_ADDRESS:
+		spin_lock_bh(&conn->session->lock);
+		len = sprintf(buf, "%s\n", conn->portal_address);
+		spin_unlock_bh(&conn->session->lock);
+		break;
+	default:
+		/* everything else is answered by the generic helper */
+		return iscsi_conn_get_param(cls_conn, param, buf);
+	}
+
+	return len;
+}
+
+/**
+ * cxgb3i_conn_set_param - set an iscsi connection/session parameter
+ * @cls_conn: pointer to iscsi cls conn
+ * @param: parameter type identifier
+ * @buf: parameter value as a string
+ * @buflen: length of @buf
+ *
+ * Passes the value to iscsi_set_param() and, for the parameters handled
+ * here, applies the driver-side follow-up: digest changes are pushed to
+ * the hardware via cxgb3i_conn_ulp_setup(), a MAX_R2T change frees and
+ * re-allocates the session's r2t pool.
+ *
+ * Returns 0 on success, -EINVAL for a bad MAX_R2T value, -ENOMEM if the
+ * r2t pool cannot be re-allocated, or the iscsi_set_param() error.
+ */
+static int cxgb3i_conn_set_param(struct iscsi_cls_conn *cls_conn,
+				 enum iscsi_param param, char *buf, int buflen)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	struct iscsi_session *session = conn->session;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	/* the cxgb3i private data is laid out right after iscsi_tcp_conn */
+	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
+	int value, err = 0;
+
+	switch (param) {
+	case ISCSI_PARAM_HDRDGST_EN:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && conn->hdrdgst_en)
+			cxgb3i_conn_ulp_setup(cconn, conn->hdrdgst_en,
+					      conn->datadgst_en);
+		break;
+	case ISCSI_PARAM_DATADGST_EN:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && conn->datadgst_en)
+			cxgb3i_conn_ulp_setup(cconn, conn->hdrdgst_en,
+					      conn->datadgst_en);
+		break;
+	case ISCSI_PARAM_MAX_R2T:
+		/* reject unparsable input instead of reading garbage */
+		if (sscanf(buf, "%d", &value) != 1)
+			return -EINVAL;
+		if (value <= 0 || !is_power_of_2(value))
+			return -EINVAL;
+		if (session->max_r2t == value)
+			break;
+		iscsi_r2tpool_free(session);
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && iscsi_r2tpool_alloc(session))
+			return -ENOMEM;
+		/* was missing: do not fall into MAX_RECV_DLENGTH below */
+		break;
+	case ISCSI_PARAM_MAX_RECV_DLENGTH:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		cxgb3i_log_debug("MAX_RECV %u.\n", conn->max_recv_dlength);
+		break;
+	case ISCSI_PARAM_MAX_XMIT_DLENGTH:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		cxgb3i_log_debug("MAX_XMIT
%u.\n", conn->max_xmit_dlength); + break; + default: + return iscsi_set_param(cls_conn, param, buf, buflen); + } + return err; +} + +/** + * cxgb3i_host_get_param - returns host (adapter) related parameters + * @shost: scsi host pointer + * @param: parameter type identifier + * @buf: buffer pointer + */ +static int cxgb3i_host_get_param(struct Scsi_Host *shost, + enum iscsi_host_param param, char *buf) +{ + struct cxgb3i_hba *hba = iscsi_host_priv(shost); + int i; + int len = 0; + + switch (param) { + case ISCSI_HOST_PARAM_HWADDRESS: + for (i = 0; i < 6; i++) + len += + sprintf(buf + len, "%02x.", + hba->ndev->dev_addr[i]); + len--; + buf[len] = '\0'; + break; + case ISCSI_HOST_PARAM_NETDEV_NAME: + len = sprintf(buf, "%s\n", hba->ndev->name); + break; + default: + return iscsi_host_get_param(shost, param, buf); + } + return len; +} + +/** + * cxgb3i_conn_get_stats - returns iSCSI stats + * @cls_conn: pointer to iscsi cls conn + * @stats: pointer to iscsi statistic struct + */ +static void cxgb3i_conn_get_stats(struct iscsi_cls_conn *cls_conn, + struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; + stats->r2t_pdus = conn->r2t_pdus_cnt; + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->digest_err = 0; + stats->timeout_err = 0; + stats->custom_length = 1; + strcpy(stats->custom[0].desc, "eh_abort_cnt"); + stats->custom[0].value = conn->eh_abort_cnt; +} + +static inline u32 tag_base(struct cxgb3i_tag_format *format, + unsigned int idx, unsigned int age) +{ + u32 sw_bits = idx | (age << format->idx_bits); + u32 tag = sw_bits >> format->rsvd_shift; + tag <<= format->rsvd_bits + format->rsvd_shift; + tag |= sw_bits & 
((1 << format->rsvd_shift) - 1); + return tag; +} + +static void cxgb3i_parse_itt(struct iscsi_conn *conn, itt_t itt, + int *idx, int *age) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1); + struct cxgb3i_adapter *snic = cconn->hba->snic; + u32 sw_bits; + + cxgb3i_parse_tag(&snic->tag_format, itt, NULL, &sw_bits); + if (idx) + *idx = sw_bits & ISCSI_ITT_MASK; + if (age) + *age = (sw_bits >> snic->tag_format.idx_bits) & ISCSI_AGE_MASK; +} + +static int cxgb3i_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt) +{ + struct scsi_cmnd *sc = task->sc; + struct iscsi_conn *conn = task->conn; + struct iscsi_session *sess = conn->session; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1); + struct cxgb3i_adapter *snic = cconn->hba->snic; + u32 sw_tag = tag_base(&snic->tag_format, task->itt, sess->age); + u32 tag = RESERVED_ITT; + + if (sc && (sc->sc_data_direction == DMA_FROM_DEVICE)) { + struct cxgb3i_tcp_conn *c3cn = + CXGB3_TCP_CONN(tcp_conn->sock->sk); + tag = + cxgb3i_ddp_tag_reserve(snic, c3cn->tid, sw_tag, + scsi_out(sc)->length, + scsi_out(sc)->table.sgl, + scsi_out(sc)->table.nents); + } + if (tag == RESERVED_ITT) + tag = sw_tag | (snic->tag_format.rsvd_mask << + snic->tag_format.rsvd_shift); + *hdr_itt = htonl(tag); + return 0; +} + +static void cxgb3i_release_itt(struct iscsi_task *task, itt_t hdr_itt) +{ + struct scsi_cmnd *sc = task->sc; + struct iscsi_conn *conn = task->conn; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1); + struct cxgb3i_adapter *snic = cconn->hba->snic; + + hdr_itt = ntohl(hdr_itt); + if (sc && (sc->sc_data_direction == DMA_FROM_DEVICE)) + cxgb3i_ddp_tag_release(snic, hdr_itt, + scsi_out(sc)->table.sgl, + scsi_out(sc)->table.nents); +} + +/** + * cxgb3i_host_template -- Scsi_Host_Template structure + * used when registering with 
the scsi mid layer + */ +static struct scsi_host_template cxgb3i_host_template = { + .module = THIS_MODULE, + .name = "Chelsio S3xx iSCSI Initiator", + .proc_name = "cxgb3i", + .queuecommand = iscsi_queuecommand, + .change_queue_depth = iscsi_change_queue_depth, + .can_queue = 128 * (ISCSI_DEF_XMIT_CMDS_MAX - 1), + .sg_tablesize = SG_ALL, + .max_sectors = 0xFFFF, + .cmd_per_lun = ISCSI_DEF_CMD_PER_LUN, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler = iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_target_reset, + .use_clustering = DISABLE_CLUSTERING, + .slave_alloc = iscsi_slave_alloc, + .this_id = -1, +}; + +static struct iscsi_transport cxgb3i_iscsi_transport = { + .owner = THIS_MODULE, + .name = "cxgb3i", + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST + | CAP_DATADGST | CAP_DIGEST_OFFLOAD, + .param_mask = ISCSI_MAX_RECV_DLENGTH | + ISCSI_MAX_XMIT_DLENGTH | + ISCSI_HDRDGST_EN | + ISCSI_DATADGST_EN | + ISCSI_INITIAL_R2T_EN | + ISCSI_MAX_R2T | + ISCSI_IMM_DATA_EN | + ISCSI_FIRST_BURST | + ISCSI_MAX_BURST | + ISCSI_PDU_INORDER_EN | + ISCSI_DATASEQ_INORDER_EN | + ISCSI_ERL | + ISCSI_CONN_PORT | + ISCSI_CONN_ADDRESS | + ISCSI_EXP_STATSN | + ISCSI_PERSISTENT_PORT | + ISCSI_PERSISTENT_ADDRESS | + ISCSI_TARGET_NAME | ISCSI_TPGT | + ISCSI_USERNAME | ISCSI_PASSWORD | + ISCSI_USERNAME_IN | ISCSI_PASSWORD_IN | + ISCSI_FAST_ABORT | ISCSI_ABORT_TMO | + ISCSI_LU_RESET_TMO | + ISCSI_PING_TMO | ISCSI_RECV_TMO | + ISCSI_IFACE_NAME | ISCSI_INITIATOR_NAME, + .host_param_mask = ISCSI_HOST_HWADDRESS | ISCSI_HOST_IPADDRESS | + ISCSI_HOST_INITIATOR_NAME | ISCSI_HOST_NETDEV_NAME, + .get_host_param = cxgb3i_host_get_param, + /* session management */ + .create_session = cxgb3i_session_create, + .destroy_session = cxgb3i_session_destroy, + .get_session_param = iscsi_session_get_param, + /* connection management */ + .create_conn = cxgb3i_conn_create, + .bind_conn = cxgb3i_conn_bind, + .destroy_conn = iscsi_conn_teardown, + .start_conn = 
iscsi_conn_start, + .stop_conn = iscsi_conn_stop, + .flush_conn = cxgb3i_conn_flush, + .get_conn_param = cxgb3i_conn_get_param, + .set_param = cxgb3i_conn_set_param, + .get_stats = cxgb3i_conn_get_stats, + /* pdu xmit req. from user space */ + .send_pdu = iscsi_conn_send_pdu, + /* task */ + .init_task = iscsi_tcp_task_init, + .xmit_task = iscsi_tcp_task_xmit, + .cleanup_task = iscsi_tcp_cleanup_task, + .parse_itt = cxgb3i_parse_itt, + .reserve_itt = cxgb3i_reserve_itt, + .release_itt = cxgb3i_release_itt, + /* TCP connect/disconnect */ + .ep_connect = cxgb3i_ep_connect, + .ep_poll = cxgb3i_ep_poll, + .ep_disconnect = cxgb3i_ep_disconnect, + /* Error recovery timeout call */ + .session_recovery_timedout = iscsi_session_recovery_timedout, +}; + +int cxgb3i_iscsi_init(void) +{ + cxgb3i_scsi_transport = + iscsi_register_transport(&cxgb3i_iscsi_transport); + if (!cxgb3i_scsi_transport) { + cxgb3i_log_error("Could not register cxgb3i transport.\n"); + return -ENODEV; + } + cxgb3i_log_debug("cxgb3i transport 0x%p.\n", cxgb3i_scsi_transport); + return 0; +} + +void cxgb3i_iscsi_cleanup(void) +{ + if (cxgb3i_scsi_transport) { + cxgb3i_log_debug("cxgb3i transport 0x%p.\n", + cxgb3i_scsi_transport); + iscsi_unregister_transport(&cxgb3i_iscsi_transport); + cxgb3i_scsi_transport = NULL; + } +} diff --git a/drivers/scsi/cxgb3i/cxgb3i_offload.c b/drivers/scsi/cxgb3i/cxgb3i_offload.c new file mode 100644 index 0000000..9e80311 --- /dev/null +++ b/drivers/scsi/cxgb3i/cxgb3i_offload.c @@ -0,0 +1,2808 @@ +/* + * Copyright (C) 2003-2008 Chelsio Communications. All rights reserved. + * + * Written by Dimitris Michailidis (dm@xxxxxxxxxxx) + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this + * release for licensing terms and conditions. 
+ */ + +#include <linux/kallsyms.h> +#include <linux/if_vlan.h> +#include <linux/inet_diag.h> +#include <linux/version.h> + +#ifdef CONFIG_SECURITY_NETWORK +#include <linux/security.h> +#endif + +#include "cxgb3_defs.h" +#include "cxgb3_ctl_defs.h" +#include "firmware_exports.h" +#include "cxgb3i_offload.h" +#include "cxgb3i_ulp2.h" + +#define VALIDATE_SEQ 1 + +typedef int (cxgb3_cpl_handler_decl) (struct t3cdev *, + struct sk_buff *, void *); + +static cxgb3_cpl_handler_decl do_bad_cpl; +static cxgb3_cpl_handler_decl do_act_establish; +static cxgb3_cpl_handler_decl do_act_open_rpl; +static cxgb3_cpl_handler_decl do_wr_ack; +static cxgb3_cpl_handler_decl do_peer_close; +static cxgb3_cpl_handler_decl do_abort_req; +static cxgb3_cpl_handler_decl do_abort_rpl; +static cxgb3_cpl_handler_decl do_close_con_rpl; +static cxgb3_cpl_handler_decl do_iscsi_hdr; + +static struct cxgb3i_tcp_tunables default_cxgb3i_tcp_tunables = { + .max_host_sndbuf = 32 * 1024, + .max_wrs = 15, + .rx_credit_thres = 10 * 1024, + .cong_alg = -1, + .delack = 1, + .tcp_window_scaling = 1, +}; + +/* + * Protocol structure and functions for our sockets. + */ +static struct proto t3_tcp_prot; +static void chelsio_close(struct sock *, long); +static int chelsio_disconnect(struct sock *, int); +static int chelsio_destroy(struct sock *); +static void process_deferq(struct work_struct *); + +static LIST_HEAD(cxgb3_list); +static DECLARE_MUTEX(cxgb3_list_lock); + +/* + * For ULP connections HW may add headers, e.g., for digests, that aren't part + * of the messages sent by the host but that are part of the TCP payload and + * therefore consume TCP sequence space. Tx connection parameters that + * operate in TCP sequence space are affected by the HW additions and need to + * compensate for them to accurately track TCP sequence numbers. This array + * contains the compensating extra lengths for ULP packets. It is indexed by + * a packet's ULP submode. 
+ */ +static const unsigned int cxgb3_ulp_extra_len[] = { 0, 4, 4, 8 }; + +/* + * Return the length of any HW additions that will be made to a Tx packet. + * Such additions can happen for some types of ULP packets. + */ +static inline unsigned int ulp_extra_len(const struct sk_buff *skb) +{ + return cxgb3_ulp_extra_len[skb_ulp_mode(skb) & 3]; +} + +/* + * Size of WRs in bytes. Note that we assume all devices we are handling have + * the same WR size. + */ +static unsigned int wrlen __read_mostly; + +/* + * The number of WRs needed for an skb depends on the number of page fragments + * in the skb and whether it has any payload in its main body. This maps the + * length of the gather list represented by an skb into the # of necessary WRs. + */ +static unsigned int skb_wrs[MAX_SKB_FRAGS + 2] __read_mostly; + +static void t3_init_wr_tab(unsigned int wr_len) +{ + int i; + + if (skb_wrs[1]) /* already initialized */ + return; + + for (i = 1; i < ARRAY_SIZE(skb_wrs); i++) { + int sgl_len = (3 * i) / 2 + (i & 1); + + sgl_len += 3; + skb_wrs[i] = (sgl_len <= wr_len + ? 1 : 1 + (sgl_len - 2) / (wr_len - 1)); + } + + wrlen = wr_len * 8; +} + +/* + * TOE information returned through inet_diag for offloaded connections. + */ +struct t3_inet_diag_info { + u32 toe_id; + u32 tid; + u16 wrs; + u8 ulp_mode:4; + u8 sched_class:4; + u8 ddp_enabled; + char dev_name[T3CNAMSIZ]; +}; + +/* + * Socket filter that drops everything by specifying a 0-length filter program. + */ +static struct sk_filter drop_all = {.refcnt = ATOMIC_INIT(1) }; + +/* + * This sk_buff holds a fake header-only TCP segment that we use whenever we + * need to exploit SW TCP functionality that expects TCP headers, such as + * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple + * CPUs without locking. + */ +static struct sk_buff *tcphdr_skb __read_mostly; + +/* + * Initialize state for cxgb3 API operations. 
+ */ +int cxgb3i_tcp_init(cxgb3_cpl_handler_func *cpl_handlers) +{ + int i; + + /* + * Instialize protocol structure for our sockets. We first copy + * the standard TCP protocol structure so we end up with standard + * values for things like pointers to counters, etc. + */ + t3_tcp_prot = tcp_prot; + t3_tcp_prot.close = chelsio_close; + t3_tcp_prot.disconnect = chelsio_disconnect; + t3_tcp_prot.destroy = chelsio_destroy; + + tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); + if (!tcphdr_skb) { + printk(KERN_ERR + "Chelsio TCP offload: can't allocate sk_buff\n"); + return -1; + } + skb_put(tcphdr_skb, sizeof(struct tcphdr)); + skb_reset_transport_header(tcphdr_skb); + memset(tcphdr_skb->data, 0, tcphdr_skb->len); + /* CIPSO_V4_OPTEXIST is false for tcphdr_skb without anything extra */ + + for (i = 0; i < NUM_CPL_CMDS; i++) + cpl_handlers[i] = do_bad_cpl; + + cpl_handlers[CPL_ACT_ESTABLISH] = do_act_establish; + cpl_handlers[CPL_ACT_OPEN_RPL] = do_act_open_rpl; + cpl_handlers[CPL_PEER_CLOSE] = do_peer_close; + cpl_handlers[CPL_ABORT_REQ_RSS] = do_abort_req; + cpl_handlers[CPL_ABORT_RPL_RSS] = do_abort_rpl; + cpl_handlers[CPL_CLOSE_CON_RPL] = do_close_con_rpl; + cpl_handlers[CPL_TX_DMA_ACK] = do_wr_ack; + cpl_handlers[CPL_ISCSI_HDR] = do_iscsi_hdr; + + return 0; +} + +void cxgb3i_tcp_add(struct t3cdev *cdev, struct cxgb3_client *client) +{ + struct cxgb3i_tcp_data *cdata; + struct adap_ports *ports; + struct ofld_page_info rx_page_info; + unsigned int wr_len; + int i; + + cdata = kzalloc(sizeof *cdata, GFP_KERNEL); + if (!cdata) + return; + ports = kzalloc(sizeof *ports, GFP_KERNEL); + if (!ports) { + kfree(cdata); + return; + } + cdata->ports = ports; + + if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0 || + cdev->ctl(cdev, GET_PORTS, cdata->ports) < 0 || + cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info) < 0) { + kfree(ports); + kfree(cdata); + return; + } + + t3_init_wr_tab(wr_len); + + INIT_LIST_HEAD(&cdata->list); + cdata->cdev = cdev; + cdata->client = 
client; + cdata->rx_page_size = rx_page_info.page_size; + cdata->conf = default_cxgb3i_tcp_tunables; + cdata->conf.max_wrs = T3C_DATA(cdev)->max_wrs; + skb_queue_head_init(&cdata->deferq); + INIT_WORK(&cdata->deferq_task, process_deferq); + + for (i = 0; i < ports->nports; i++) + NDEV2CDATA(ports->lldevs[i]) = cdata; + + down(&cxgb3_list_lock); + list_add_tail(&cdata->list, &cxgb3_list); + up(&cxgb3_list_lock); + + return; +} + +void cxgb3i_tcp_remove(struct t3cdev *cdev) +{ + struct cxgb3i_tcp_data *cdata = CXGB3_TCP_DATA(cdev); + struct adap_ports *ports = cdata->ports; + int i; + + for (i = 0; i < ports->nports; i++) + NDEV2CDATA(ports->lldevs[i]) = NULL; + + down(&cxgb3_list_lock); + list_del(&cdata->list); + up(&cxgb3_list_lock); + + kfree(ports); + kfree(cdata); +} + +/* + * Return TRUE if the specified net device is for a port on one of our + * registered adapters. + */ +static int is_cxgb3_dev(struct net_device *dev) +{ + struct cxgb3i_tcp_data *cdata; + + down(&cxgb3_list_lock); + list_for_each_entry(cdata, &cxgb3_list, list) { + struct adap_ports *ports = cdata->ports; + int i; + + for (i = 0; i < ports->nports; i++) + if (dev == ports->lldevs[i]) { + up(&cxgb3_list_lock); + return 1; + } + } + up(&cxgb3_list_lock); + return 0; +} + +/* + * Primary cxgb3 API operations. + * ============================= + */ + +static int tcp_v4_connect_offload(struct sock *, struct sockaddr *, int); +static void t3_cleanup_rbuf(struct sock *, int); +static int t3_push_frames(struct sock *, int); +static int t3_send_reset(struct sock *, int, struct sk_buff *); +static int t3_sendskb(struct sock *, struct sk_buff *, int); + +/* + * Return connected socket to specified endpoint. 
+ */ +int cxgb3i_tcp_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int ulp_mode) +{ + struct sock *sk; + struct cxgb3i_tcp_conn *c3cn; + int ret; + + c3cn = kzalloc(sizeof(*c3cn), GFP_KERNEL); + if (c3cn == NULL) + return -ENOMEM; + c3cn->flags = 0; + c3cn->ulp_mode = ulp_mode; + + sk = sock->sk; + CXGB3_TCP_CONN(sk) = c3cn; + + ret = tcp_v4_connect_offload(sk, uaddr, addr_len); + if (ret) { + CXGB3_TCP_CONN(sk) = NULL; + kfree(c3cn); + } + return ret; +} + +void cxgb3i_tcp_cleanup_rbuf(struct sock *sk, int copied) +{ + t3_cleanup_rbuf(sk, copied); + return; +} + +int cxgb3i_tcp_sendskb(struct sock *sk, struct sk_buff *skb, int flags) +{ + return t3_sendskb(sk, skb, flags); +} + +/* + * Protocol operations. + * ==================== + */ + +static int make_close_transition(struct sock *); +static void close_conn(struct sock *); +static void t3_purge_write_queue(struct sock *); + +/* + * Release a socket's local TCP port if the socket is bound. This is normally + * done by tcp_done() but because we need to wait for HW to release TIDs we + * usually call tcp_done at a later time than the SW stack would have. This + * can be used to release the port earlier so the SW stack can reuse it before + * we are done with the connection. + */ +static inline void release_tcp_port(struct sock *sk) +{ + if (inet_csk(sk)->icsk_bind_hash) + inet_put_port(sk); +} + +static void chelsio_close(struct sock *sk, long timeout) +{ + int data_lost, old_state; + + lock_sock(sk); + sk->sk_shutdown |= SHUTDOWN_MASK; + + /* + * We need to flush the receive buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! Make a note + * of whether any received data will be lost so we can decide whether + * to FIN or RST. 
+ */ + data_lost = skb_queue_len(&sk->sk_receive_queue); + __skb_queue_purge(&sk->sk_receive_queue); + + if (sk->sk_state == TCP_CLOSE) /* Nothing if we are already closed */ + ; + else if (data_lost || sk->sk_state == TCP_SYN_SENT) { + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE); + t3_send_reset(sk, CPL_ABORT_SEND_RST, NULL); + release_tcp_port(sk); + goto unlock; + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA); + } else if (make_close_transition(sk)) { /* Regular FIN-based close */ + close_conn(sk); + } + + if (timeout) + sk_stream_wait_close(sk, timeout); + +unlock: + old_state = sk->sk_state; + sock_hold(sk); /* must last past the potential inet_csk_destroy_sock */ + sock_orphan(sk); + atomic_inc(sk->sk_prot->orphan_count); + + release_sock(sk); /* Final release_sock in connection's lifetime. */ + + /* + * There are no more user references at this point. Grab the socket + * spinlock and finish the close. + */ + local_bh_disable(); + bh_lock_sock(sk); + + /* + * Because the socket was orphaned before the bh_lock_sock + * either the backlog or a BH may have already destroyed it. + * Bail out if so. 
+ */ + if (old_state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) + goto out; + + if (sk->sk_state == TCP_FIN_WAIT2 && tcp_sk(sk)->linger2 < 0 && + !c3cn_flag(sk, C3CN_ABORT_SHUTDOWN)) { + struct sk_buff *skb; + + skb = alloc_skb(sizeof(struct cpl_abort_req), GFP_ATOMIC); + if (skb) { + t3_send_reset(sk, CPL_ABORT_SEND_RST, skb); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); + } + } + + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +static int chelsio_disconnect(struct sock *sk, int flags) +{ + printk(KERN_ERR "chelsio_disconnect not implemented\n"); + return -ENOTSUPP; +} + +/* + * Our version of tcp_v4_destroy_sock(). We need to do this because + * tcp_writequeue_purge() that is used in the original doesn't quite match + * our needs. If we ever hook into the memory management of the SW stack we + * may be able to use tcp_v4_destroy_sock() directly. + */ +static int chelsio_destroy(struct sock *sk) +{ + struct cxgb3i_tcp_conn *c3cn; + + C3CN_ULP_MODE(sk) = ULP_MODE_NONE; + t3_purge_write_queue(sk); + c3cn = CXGB3_TCP_CONN(sk); + CXGB3_TCP_CONN(sk) = NULL; + kfree(c3cn); + return tcp_prot.destroy(sk); +} + +/* + * Local utility routines used to implement primary cxgb3 API operations. + * ====================================================================== + */ + +static int tcp_connect_offload(struct sock *); +static u32 t3_send_rx_credits(struct sock *, u32, u32, int); +static void mk_act_open_req(struct sock *, struct sk_buff *, + unsigned int, const struct l2t_entry *); +static int wait_for_mem(struct sock *, long *); +static void skb_entail(struct sock *, struct sk_buff *, int); + +static inline int is_t3a(const struct t3cdev *cdev) +{ + return cdev->type == T3A; +} + +/* + * Determine the value of a packet's ->priority field. Bit 0 determines + * whether the packet should use a control Tx queue, bits 1..3 determine + * the queue set to use. 
+ */ +static inline unsigned int mkprio(unsigned int cntrl, const struct sock *sk) +{ + return cntrl; +} + +/* + * Returns true if an sk_buff carries urgent data. + */ +static inline int skb_urgent(struct sk_buff *skb) +{ + return (CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_URG) != 0; +} + +static inline void reset_wr_list(struct tcp_sock *tp) +{ + tp->forward_skb_hint = NULL; +} + +/* + * Add a WR to a socket's list of pending WRs. This is a singly-linked list + * of sk_buffs operating as a FIFO. We use the following sock and sk_buff + * fields to maintain it: + * - sock.forward_skb_hint, sock.retransmit_skb_hint as head and tail pointers + * - sk_buff.sp as packet next pointer + */ +static inline void enqueue_wr(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->sp = NULL; + + /* + * We want to take an extra reference since both us and the driver + * need to free the packet before it's really freed. We know there's + * just one user currently so we use atomic_set rather than skb_get + * to avoid the atomic op. + */ + atomic_set(&skb->users, 2); + + if (!tp->forward_skb_hint) + tp->forward_skb_hint = skb; + else + tp->retransmit_skb_hint->sp = (void *)skb; + tp->retransmit_skb_hint = skb; +} + +/* Returns bits 2:7 of a socket's TOS field */ +#define SK_TOS(sk) ((inet_sk(sk)->tos >> 2) & M_TOS) + +/* + * The next two functions calculate the option 0 value for a socket. 
+ */ +static inline unsigned int calc_opt0h(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return V_NAGLE((tp->nonagle & TCP_NAGLE_OFF) == 0) | + V_KEEP_ALIVE(sock_flag(sk, SOCK_KEEPOPEN) != 0) | F_TCAM_BYPASS | + V_WND_SCALE(tp->rx_opt.rcv_wscale) | V_MSS_IDX(C3CN_MSS_IDX(sk)); +} + +static inline unsigned int calc_opt0l(struct sock *sk) +{ + unsigned int tos; + struct tcp_sock *tp = tcp_sk(sk); + + tos = SK_TOS(sk); + if ((tos & 0x38) == 0x30) /* suppress values in special range */ + tos = 0; + + return V_TOS(tos) | V_ULP_MODE(C3CN_ULP_MODE(sk)) | + V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32) M_RCV_BUFSIZ)); +} + +static inline unsigned int calc_opt2(const struct sock *sk) +{ + const struct t3cdev *cdev = C3CN_CDEV(sk); + int flv_valid = CXGB3_TCP_TUNABLE(cdev, cong_alg) != -1; + + return V_FLAVORS_VALID(flv_valid) | + V_CONG_CONTROL_FLAVOR(flv_valid ? CXGB3_TCP_TUNABLE(cdev, cong_alg) + : 0); +} + +static inline void make_tx_data_wr(struct sock *sk, struct sk_buff *skb, + int len) +{ + struct tx_data_wr *req; + struct tcp_sock *tp = tcp_sk(sk); + + skb_reset_transport_header(skb); + req = (struct tx_data_wr *)__skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(C3CN_TID(sk))); + req->sndseq = htonl(tp->snd_nxt); + /* len includes the length of any HW ULP additions */ + req->len = htonl(len); + req->param = htonl(V_TX_PORT(C3CN_L2T(sk)->smt_idx)); + /* V_TX_ULP_SUBMODE sets both the mode and submode */ + req->flags = htonl(V_TX_ULP_SUBMODE(skb_ulp_mode(skb)) | + V_TX_URG(skb_urgent(skb)) | + V_TX_SHOVE((!c3cn_flag(sk, C3CN_TX_MORE_DATA)) && + (skb_peek(&sk->sk_write_queue) ? 0 : 1))); + + if (!c3cn_flag(sk, C3CN_TX_DATA_SENT)) { + + req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | + V_TX_CPU_IDX(C3CN_QSET(sk))); + + /* Sendbuffer is in units of 32KB. 
+ */ + req->param |= htonl(V_TX_SNDBUF(sk->sk_sndbuf >> 15)); + c3cn_set_flag(sk, C3CN_TX_DATA_SENT); + } +} + +static int tcp_v4_connect_offload(struct sock *sk, + struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct rtable *rt; + __be32 daddr, nexthop; + int tmp; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + if (inet->opt && inet->opt->srr) { + if (!daddr) + return -EINVAL; + nexthop = inet->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, inet->saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_TCP, inet->sport, usin->sin_port, sk, 1); + if (tmp < 0) { + if (tmp == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return tmp; + } + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (!inet->opt || !inet->opt->srr) + daddr = rt->rt_dst; + + if (!inet->saddr) + inet->saddr = rt->rt_src; + inet->rcv_saddr = inet->saddr; + + if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { + /* Reset inherited state */ + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + if (tcp_death_row.sysctl_tw_recycle && + !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { + struct inet_peer *peer = rt->peer; + /* + * VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state + * TIME-WAIT * and initialize rx_opt.ts_recent from it, + * when trying new connection. 
+ */ + if (peer != NULL && + peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } + } + + inet->dport = usin->sin_port; + inet->daddr = daddr; + + inet_csk(sk)->icsk_ext_hdr_len = 0; + if (inet->opt) + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; + + tp->rx_opt.mss_clamp = 536; + + /* Socket identity is still unknown (sport may be zero). + * However we set state to SYN-SENT and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + tcp_set_state(sk, TCP_SYN_SENT); + err = inet_hash_connect(&tcp_death_row, sk); + if (err) + goto failure; + + err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk); + if (err) + goto failure; + + /* OK, now commit destination to socket. */ + sk->sk_gso_type = SKB_GSO_TCPV4; + sk_setup_caps(sk, &rt->u.dst); + + if (tcp_connect_offload(sk)) + return 0; + /* + * If we get here, we don't have an offload connection so simply + * return a failure. + */ + err = -ENOTSUPP; + +failure: + /* + * This unhashes the socket and releases the local port, + * if necessary. + */ + tcp_set_state(sk, TCP_CLOSE); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->dport = 0; + return err; +} + +static inline int is_delack_mode_valid(struct t3cdev *cdev, struct sock *sk) +{ + return (!C3CN_ULP_MODE(sk) + || (C3CN_ULP_MODE(sk) == ULP_MODE_TCPDDP && cdev->type >= T3A)); +} + +/* + * Set of states for which we should return RX credits. + */ +#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) + +/* + * Called after some received data has been read. It returns RX credits + * to the HW for the amount of data processed. 
+ */ +static void t3_cleanup_rbuf(struct sock *sk, int copied) +{ + struct tcp_sock *tp; + struct t3cdev *cdev; + int dack_mode, must_send; + u32 thres, credits, dack = 0; + + if (!sk_in_state(sk, CREDIT_RETURN_STATE)) + return; + + tp = tcp_sk(sk); + credits = tp->copied_seq - tp->rcv_wup; + if (unlikely(!credits)) + return; + + cdev = C3CN_CDEV(sk); + thres = CXGB3_TCP_TUNABLE(cdev, rx_credit_thres); + + if (unlikely(thres == 0)) + return; + + if (is_delack_mode_valid(cdev, sk)) { + dack_mode = CXGB3_TCP_TUNABLE(cdev, delack); + if (unlikely(dack_mode != C3CN_DELAK_MODE(sk))) { + u32 r = tp->rcv_nxt - C3CN_DELAK_SEQ(sk); + + if (r >= tp->rcv_wnd || r >= 16 * tp->rx_opt.mss_clamp) + dack = (F_RX_DACK_CHANGE | + V_RX_DACK_MODE(dack_mode)); + } + } else + dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + + /* + * For coalescing to work effectively ensure the receive window has + * at least 16KB left. + */ + must_send = credits + 16384 >= tp->rcv_wnd; + + if (must_send || credits >= thres) + tp->rcv_wup += t3_send_rx_credits(sk, credits, dack, must_send); +} + +/* + * Generic ARP failure handler that discards the buffer. + */ +static void arp_failure_discard(struct t3cdev *cdev, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* + * Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a + * socket's send queue and sends them on to the TOE. Must be called with the + * socket lock held. Returns the amount of send buffer space that was freed + * as a result of sending queued data to the TOE. + */ +static int t3_push_frames(struct sock *sk, int req_completion) +{ + int total_size = 0; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + struct t3cdev *cdev; + struct cxgb3i_tcp_data *cdata; + + if (unlikely(sk_in_state(sk, TCPF_SYN_SENT | TCPF_CLOSE))) + return 0; + + /* + * We shouldn't really be called at all after an abort but check just + * in case. 
+ */ + if (unlikely(c3cn_flag(sk, C3CN_ABORT_SHUTDOWN))) + return 0; + + cdev = C3CN_CDEV(sk); + cdata = CXGB3_TCP_DATA(cdev); + + while (C3CN_WR_AVAIL(sk) + && (skb = skb_peek(&sk->sk_write_queue)) != NULL + && !c3cn_flag(sk, C3CN_TX_WAIT_IDLE) + && (!(CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_HOLD) + || skb_queue_len(&sk->sk_write_queue) > 1)) { + + int len = skb->len; /* length before skb_push */ + int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len); + int wrs_needed = skb_wrs[frags]; + + if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen) + wrs_needed = 1; + + WARN_ON(frags >= ARRAY_SIZE(skb_wrs) || wrs_needed < 1); + if (C3CN_WR_AVAIL(sk) < wrs_needed) + break; + + __skb_unlink(skb, &sk->sk_write_queue); + skb->priority = mkprio(CPL_PRIORITY_DATA, sk); + skb->csum = wrs_needed; /* remember this until the WR_ACK */ + C3CN_WR_AVAIL(sk) -= wrs_needed; + C3CN_WR_UNACKED(sk) += wrs_needed; + enqueue_wr(tp, skb); + + if (likely(CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_NEED_HDR)) { + len += ulp_extra_len(skb); + make_tx_data_wr(sk, skb, len); + tp->snd_nxt += len; + tp->lsndtime = tcp_time_stamp; + if ((req_completion + && C3CN_WR_UNACKED(sk) == wrs_needed) + || (CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_COMPL) + || C3CN_WR_UNACKED(sk) >= C3CN_WR_MAX(sk) / 2) { + struct work_request_hdr *wr = cplhdr(skb); + + wr->wr_hi |= htonl(F_WR_COMPL); + C3CN_WR_UNACKED(sk) = 0; + } + CXGB3_TCP_SKB_CB(skb)->flags &= ~C3CB_FLAG_NEED_HDR; + } else if (skb->data[0] == FW_WROPCODE_OFLD_CLOSE_CON) + c3cn_set_flag(sk, C3CN_CLOSE_CON_REQUESTED); + + total_size += skb->truesize; + if (CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_BARRIER) + c3cn_set_flag(sk, C3CN_TX_WAIT_IDLE); + set_arp_failure_handler(skb, arp_failure_discard); + l2t_send(cdev, skb, C3CN_L2T(sk)); + } + sk->sk_wmem_queued -= total_size; + return total_size; +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. 
+ */ +static void abort_arp_failure(struct t3cdev *cdev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + req->cmd = CPL_ABORT_NO_RST; + cxgb3_ofld_send(cdev, skb); +} + +/* + * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do + * not send multiple ABORT_REQs for the same connection and also that we do + * not try to send a message after the connection has closed. Returns 1 if + * an ABORT_REQ wasn't generated after all, 0 otherwise. + */ +static int t3_send_reset(struct sock *sk, int mode, struct sk_buff *skb) +{ + struct cpl_abort_req *req; + struct tcp_sock *tp = tcp_sk(sk); + unsigned int tid = C3CN_TID(sk); + + if (unlikely(c3cn_flag(sk, C3CN_ABORT_SHUTDOWN) || !C3CN_CDEV(sk))) { + if (skb) + __kfree_skb(skb); + return 1; + } + + c3cn_set_flag(sk, C3CN_ABORT_RPL_PENDING); + c3cn_set_flag(sk, C3CN_ABORT_SHUTDOWN); + + /* Purge the send queue so we don't send anything after an abort. */ + t3_purge_write_queue(sk); + + if (c3cn_flag(sk, C3CN_CLOSE_CON_REQUESTED) && is_t3a(C3CN_CDEV(sk))) + mode |= CPL_ABORT_POST_CLOSE_REQ; + + if (!skb) + skb = alloc_skb(sizeof(*req), GFP_KERNEL | __GFP_NOFAIL); + skb->priority = mkprio(CPL_PRIORITY_DATA, sk); + set_arp_failure_handler(skb, abort_arp_failure); + + req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); + req->rsvd0 = htonl(tp->snd_nxt); + req->rsvd1 = !c3cn_flag(sk, C3CN_TX_DATA_SENT); + req->cmd = mode; + if (sk->sk_state == TCP_SYN_SENT) + __skb_queue_tail(&tp->out_of_order_queue, skb); + else + l2t_send(C3CN_CDEV(sk), skb, C3CN_L2T(sk)); + return 0; +} + +/* + * This must be called with the socket locked, otherwise dev may be NULL. + */ +static inline int chelsio_wspace(const struct sock *sk) +{ + struct t3cdev *dev = C3CN_CDEV(sk); + + return (dev ? 
(CXGB3_TCP_TUNABLE(dev, max_host_sndbuf) + - sk->sk_wmem_queued) + : 0); +} + +static inline int tcp_memory_free(struct sock *sk) +{ + return chelsio_wspace(sk) > 0; +} + +/* + * Add a list of skbs to a socket send queue. This interface is intended for + * use by in-kernel ULPs. The skbs must comply with the max size limit of the + * device and have a headroom of at least TX_HEADER_LEN bytes. + */ +static int t3_sendskb(struct sock *sk, struct sk_buff *skb, int flags) +{ + struct sk_buff *next; + struct tcp_sock *tp = tcp_sk(sk); + int err, copied = 0; + long timeo; + + lock_sock(sk); + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + (err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto out_err; + + /* + * We check for send buffer space once for the whole skb list. It + * isn't critical if we end up overrunning the send buffer limit as we + * do not allocate any new memory. The benefit is we don't need to + * perform intermediate packet pushes. + */ + while (!tcp_memory_free(sk)) { + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = wait_for_mem(sk, &timeo); + if (err) + goto out_err; + } + + while (skb) { + if (unlikely(skb_headroom(skb) < TX_HEADER_LEN)) { + err = -EINVAL; + goto out_err; + } + + next = skb->next; + skb->next = NULL; + skb_entail(sk, skb, C3CB_FLAG_NO_APPEND | C3CB_FLAG_NEED_HDR); + copied += skb->len; + tp->write_seq += skb->len + ulp_extra_len(skb); + skb = next; + } +done: + if (likely(skb_queue_len(&sk->sk_write_queue))) + t3_push_frames(sk, 1); + release_sock(sk); + return copied; + +out_err: + if (copied == 0) + copied = sk_stream_error(sk, flags, err); + goto done; +} + +/* + * Low-level utility routines for primary API functions. 
+ * ===================================================== + */ +/* routines to implement CPL message processing */ +static void sock_act_establish(struct sock *, struct sk_buff *); +static void active_open_failed(struct sock *, struct sk_buff *); +static void wr_ack(struct sock *, struct sk_buff *); +static void do_peer_fin(struct sock *, struct sk_buff *); +static void process_abort_req(struct sock *, struct sk_buff *); +static void process_abort_rpl(struct sock *, struct sk_buff *); +static void process_close_con_rpl(struct sock *, struct sk_buff *); +static void process_rx_iscsi_hdr(struct sock *, struct sk_buff *); + +static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *, size_t, gfp_t); + +static int t3_connect(struct sock *, struct net_device *); +static void tcp_uncork(struct sock *); +static void tcp_push(struct sock *, int); +static void fail_act_open(struct sock *, int); +static void init_offload_sk(struct sock *, struct t3cdev *, struct dst_entry *); +static int t3_backlog_rcv(struct sock *, struct sk_buff *); +static void t3_write_space(struct sock *); + +/* + * Insert a socket to the TID table and take an extra reference. + */ +static inline void sk_insert_tid(struct cxgb3i_tcp_data *cdata, struct sock *sk, + unsigned int tid) +{ + sock_hold(sk); + cxgb3_insert_tid(cdata->cdev, cdata->client, sk, tid); +} + +static inline void free_atid(struct t3cdev *cdev, unsigned int tid) +{ + struct sock *sk = cxgb3_free_atid(cdev, tid); + if (sk) + sock_put(sk); +} + +/* + * This function is intended for allocations of small control messages. + * Such messages go as immediate data and usually the pakets are freed + * immediately. We maintain a cache of one small sk_buff and use it whenever + * it is available (has a user count of 1). Otherwise we get a fresh buffer. 
+ */ +#define CTRL_SKB_LEN 120 + +static struct sk_buff *alloc_ctrl_skb(const struct sock *sk, int len) +{ + struct sk_buff *skb = C3CN_CTRL_SKB_CACHE(sk); + + if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) { + __skb_trim(skb, 0); + atomic_set(&skb->users, 2); + } else if (likely(!in_atomic())) + skb = alloc_skb(len, GFP_ATOMIC | __GFP_NOFAIL); + else + skb = alloc_skb(len, GFP_ATOMIC); + return skb; +} + +/** + * cxgb3_egress_dev - return the cxgb3 egress device or NULL if the egress + * device isn't one of our ports. + * + * @root_dev: the root device anchoring the search + * @sk: the socket used to determine egress port in bonding mode + * @context: in bonding mode, indicates a connection set up or failover + * + * Given a root network device it returns the physical egress device that is a + * descendant of the root device. The root device may be either a physical + * device, in which case it is the device returned, or a virtual device, such + * as a VLAN or bonding device. In case of a bonding device the search + * considers the decisions of the bonding device given its mode to locate the + * correct egress device. + */ +static struct net_device *cxgb3_egress_dev(struct net_device *root_dev, + struct sock *sk, int context) +{ + while (root_dev) { + if (root_dev->priv_flags & IFF_802_1Q_VLAN) + root_dev = vlan_dev_info(root_dev)->real_dev; + else if (is_cxgb3_dev(root_dev)) + return root_dev; + else + return NULL; + } + return NULL; +} + +/* + * Return TRUE if we're able to establish an offload connection; otherwise + * return FALSE. + */ +static int tcp_connect_offload(struct sock *sk) +{ + struct net_device *dev = cxgb3_egress_dev(__sk_dst_get(sk)->dev, + sk, 0); + if (dev == NULL) + return 0; + return t3_connect(sk, dev) == 0; +} + +/* + * Handle an ARP failure for an active open. 
+ */ +static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + sock_hold(sk); + bh_lock_sock(sk); + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) { + if (!sock_owned_by_user(sk)) { + fail_act_open(sk, EHOSTUNREACH); + __kfree_skb(skb); + } else { + /* + * Smart solution: Synthesize an ACTIVE_OPEN_RPL in the + * existing sk_buff and queue it to the backlog. We + * are certain the sk_buff is not shared. We also + * don't bother trimming the buffer. + */ + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + rpl->ot.opcode = CPL_ACT_OPEN_RPL; + rpl->status = CPL_ERR_ARP_MISS; + SET_BLOG_CPL_HANDLER(skb, active_open_failed); + sk_add_backlog(sk, skb); + } + } + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Switch a socket to the offload protocol operations. Note that the offload + * operations do not contain the offload backlog handler, we install that + * directly to the socket. + */ +static inline void install_offload_ops(struct sock *sk) +{ + sk->sk_prot = &t3_tcp_prot; + sk->sk_backlog_rcv = t3_backlog_rcv; + sk->sk_write_space = t3_write_space; + + if (sk->sk_filter) + sk_filter_uncharge(sk, sk->sk_filter); + sk->sk_filter = &drop_all; + sk_filter_charge(sk, sk->sk_filter); + + c3cn_set_flag(sk, C3CN_OFFLOADED); +} + +/* + * Max receive window supported by HW in bytes. Only a small part of it can + * be set through option0, the rest needs to be set through RX_DATA_ACK. + */ +#define MAX_RCV_WND ((1U << 27) - 1) + +/* + * Min receive window. We want it to be large enough to accommodate receive + * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. + */ +#define MIN_RCV_WND (24 * 1024U) + +/* + * Determine the receive window scaling factor given a target max + * receive window. 
+ */ +static inline int select_rcv_wscale(int space, int wscale_ok, int window_clamp) +{ + int wscale = 0; + + if (space > MAX_RCV_WND) + space = MAX_RCV_WND; + if (window_clamp && window_clamp < space) + space = window_clamp; + + if (wscale_ok) + for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; + return wscale; +} + +/* + * Send an active open request. + */ +static int t3_connect(struct sock *sk, struct net_device *dev) +{ + struct cxgb3i_tcp_data *cdata = NDEV2CDATA(dev); + struct t3cdev *cdev = cdata->cdev; + struct cxgb3i_tcp_conn *c3cn = CXGB3_TCP_CONN(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + struct sk_buff *skb; + + /* + * Initialize connection data. Note that the flags and ULP mode are + * initialized higher up ... + */ + c3cn->dev = dev; + c3cn->cdev = cdev; + c3cn->tid = cxgb3_alloc_atid(cdev, cdata->client, sk); + if (c3cn->tid < 0) + goto out_err; + c3cn->qset = 0; + c3cn->l2t = t3_l2t_get(cdev, dst->neighbour, dev); + if (!c3cn->l2t) + goto free_tid; + + skb = alloc_skb(sizeof(struct cpl_act_open_req), + GFP_KERNEL | __GFP_NOFAIL); + skb->sk = sk; + set_arp_failure_handler(skb, act_open_req_arp_failure); + + sock_hold(sk); + + install_offload_ops(sk); + + init_offload_sk(sk, cdev, dst); + tp->rx_opt.rcv_wscale = select_rcv_wscale(tcp_full_space(sk), + CXGB3_TCP_TUNABLE(cdev, tcp_window_scaling), + tp->window_clamp); + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + + mk_act_open_req(sk, skb, c3cn->tid, c3cn->l2t); + l2t_send(cdev, skb, c3cn->l2t); + return 0; + +free_tid: + free_atid(cdev, c3cn->tid); + c3cn->tid = 0; +out_err: + return -1; +} + +/* + * State transitions and actions for close. Note that if we are in SYN_SENT + * we remain in that state as we cannot control a connection while it's in + * SYN_SENT; such connections are allowed to establish and are then aborted. 
+ */ +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_SYN_SENT, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +/* + * Perform a state transition during close and return the actions indicated + * for the transition. Do not make this function inline, the main reason + * it exists at all is to avoid multiple inlining of tcp_set_state. + */ +static int make_close_transition(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + + tcp_set_state(sk, next & TCP_STATE_MASK); + return next & TCP_ACTION_FIN; +} + +/* + * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail + * under any circumstances. We take the easy way out and always queue the + * message to the write_queue. We can optimize the case where the queue is + * already empty though the optimization is probably not worth it. 
+ */ +static void close_conn(struct sock *sk) +{ + struct sk_buff *skb; + struct cpl_close_con_req *req; + unsigned int tid = C3CN_TID(sk); + + skb = alloc_skb(sizeof(struct cpl_close_con_req), + GFP_KERNEL | __GFP_NOFAIL); + req = (struct cpl_close_con_req *)__skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = htonl(tcp_sk(sk)->write_seq); + + tcp_uncork(sk); + skb_entail(sk, skb, C3CB_FLAG_NO_APPEND); + if (sk->sk_state != TCP_SYN_SENT) + t3_push_frames(sk, 1); +} + +static void tcp_uncork(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->nonagle & TCP_NAGLE_CORK) { + tp->nonagle &= ~TCP_NAGLE_CORK; + tcp_push(sk, 0); + } +} + +static inline void mark_urg(struct tcp_sock *tp, int flags, struct sk_buff *skb) +{ + if (unlikely(flags & MSG_OOB)) { + tp->snd_up = tp->write_seq; + CXGB3_TCP_SKB_CB(skb)->flags = + C3CB_FLAG_URG | C3CB_FLAG_BARRIER | C3CB_FLAG_NO_APPEND | + C3CB_FLAG_NEED_HDR; + } +} + +/* + * Returns true if a TCP socket is corked. + */ +static inline int corked(const struct tcp_sock *tp, int flags) +{ + return (flags & MSG_MORE) | (tp->nonagle & TCP_NAGLE_CORK); +} + +/* + * Returns true if a connection should send more data to the TOE ASAP. + */ +static inline int should_push(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return (!(C3CN_WR_MAX(sk) - C3CN_WR_AVAIL(sk)) || + (tp->nonagle & TCP_NAGLE_OFF)); +} + +/* + * Decide if the last frame on the send queue needs any special annotations + * (e.g., marked URG) and whether it should be transmitted immediately or + * held for additional data. This is the only routine that performs the full + * suite of tests for a Tx packet and therefore must be called for the last + * packet added by the various send*() APIs. 
+ */ +static void tcp_push(struct sock *sk, int flags) +{ + int qlen = skb_queue_len(&sk->sk_write_queue); + + if (likely(qlen)) { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_write_queue.prev; + + mark_urg(tp, flags, skb); + + if (!(CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_NO_APPEND) && + corked(tp, flags)) { + CXGB3_TCP_SKB_CB(skb)->flags |= C3CB_FLAG_HOLD; + return; + } + + CXGB3_TCP_SKB_CB(skb)->flags &= ~C3CB_FLAG_HOLD; + if (qlen == 1 && + ((CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_NO_APPEND) || + should_push(sk))) + t3_push_frames(sk, 1); + } +} + +/* + * Wait for memory to become available, either space in a socket's send buffer + * or system memory. + */ +static int wait_for_mem(struct sock *sk, long *timeout) +{ + int sndbuf, err = 0; + long vm_wait = 0; + long current_timeo = *timeout; + DEFINE_WAIT(wait); + + /* + * We open code tcp_memory_free() because we need it outside the + * socket lock and chelsio_wspace() isn't safe there. + */ + sndbuf = CXGB3_TCP_TUNABLE(C3CN_CDEV(sk), max_host_sndbuf); + + if (sndbuf > sk->sk_wmem_queued) + current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + + for (;;) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + err = -EPIPE; + break; + } + if (!*timeout) { + err = -EAGAIN; + break; + } + if (signal_pending(current)) { + err = sock_intr_errno(*timeout); + break; + } + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (sndbuf > sk->sk_wmem_queued && !vm_wait) + break; + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + release_sock(sk); + + if (!sk->sk_err && !(sk->sk_shutdown & SEND_SHUTDOWN) && + (sndbuf <= sk->sk_wmem_queued || vm_wait)) + current_timeo = schedule_timeout(current_timeo); + + lock_sock(sk); + sk->sk_write_pending--; + + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeout; + if (current_timeo != 
MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeout = current_timeo; + } + + finish_wait(sk->sk_sleep, &wait); + return err; +} + +static void skb_entail(struct sock *sk, struct sk_buff *skb, int flags) +{ + struct tcp_sock *tp = tcp_sk(sk); + + CXGB3_TCP_SKB_CB(skb)->seq = tp->write_seq; + CXGB3_TCP_SKB_CB(skb)->flags = flags; + __skb_queue_tail(&sk->sk_write_queue, skb); + sk->sk_wmem_queued += skb->truesize; + + /* Do not share pages across sk_buffs */ + if (sk->sk_sndmsg_page && sk->sk_sndmsg_off) { + put_page(sk->sk_sndmsg_page); + sk->sk_sndmsg_page = NULL; + } +} + +/* + * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are + * permitted to return without sending the message in case we cannot allocate + * an sk_buff. Returns the number of credits sent. + */ +static u32 t3_send_rx_credits(struct sock *sk, u32 credits, u32 dack, + int nofail) +{ + struct sk_buff *skb; + struct cpl_rx_data_ack *req; + + skb = (nofail ? 
alloc_ctrl_skb(sk, sizeof(*req)) + : alloc_skb(sizeof(*req), GFP_ATOMIC)); + if (!skb) + return 0; + + req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, C3CN_TID(sk))); + req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); + skb->priority = mkprio(CPL_PRIORITY_ACK, sk); + cxgb3_ofld_send(C3CN_CDEV(sk), skb); + return credits; +} + +static void mk_act_open_req(struct sock *sk, struct sk_buff *skb, + unsigned int atid, const struct l2t_entry *e) +{ + struct cpl_act_open_req *req; + + skb->priority = mkprio(CPL_PRIORITY_SETUP, sk); + req = (struct cpl_act_open_req *)__skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); + req->local_port = inet_sk(sk)->sport; + req->peer_port = inet_sk(sk)->dport; + req->local_ip = inet_sk(sk)->saddr; + req->peer_ip = inet_sk(sk)->daddr; + req->opt0h = htonl(calc_opt0h(sk) | V_L2T_IDX(e->idx) | + V_TX_CHANNEL(e->smt_idx)); + req->opt0l = htonl(calc_opt0l(sk)); + req->params = 0; + req->opt2 = htonl(calc_opt2(sk)); +} + +/* + * Our analog of tcp_free_skb(). + */ +static inline void chelsio_tcp_free_skb(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_wmem_queued -= skb->truesize; + __kfree_skb(skb); +} + +static void t3_purge_write_queue(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->sk_write_queue))) + chelsio_tcp_free_skb(sk, skb); +} + +/* + * Definitions and declarations for CPL handler functions. 
+ * ======================================================= + */ + +#ifdef VALIDATE_TID +#define VALIDATE_SOCK(sk) \ + do { \ + if (unlikely(!(sk))) \ + return CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE; \ + } while (0) +#else +#define VALIDATE_SOCK(sk) do {} while (0) +#endif + +static void t3_idiag_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + if (ext & (1 << INET_DIAG_CONG)) { + struct rtattr *rta; + struct t3_inet_diag_info *info; + + rta = __RTA_PUT(skb, INET_DIAG_CONG + 1, sizeof(*info)); + info = RTA_DATA(rta); + info->toe_id = TOE_ID_CHELSIO_T3; + info->tid = C3CN_TID(sk); + info->wrs = C3CN_WR_MAX(sk) - C3CN_WR_AVAIL(sk); + info->ulp_mode = C3CN_ULP_MODE(sk); + strncpy(info->dev_name, C3CN_CDEV(sk)->name, + sizeof info->dev_name); +rtattr_failure: + ; + } +} + +#define T3_CONG_OPS(s) \ + { .name = s, .owner = THIS_MODULE, .get_info = t3_idiag_get_info } + +static struct tcp_congestion_ops t3_cong_ops[] = { + T3_CONG_OPS("reno"), T3_CONG_OPS("tahoe"), + T3_CONG_OPS("newreno"), T3_CONG_OPS("highspeed") +}; + +/* + * Similar to process_cpl_msg() but takes an extra socket reference around the + * call to the handler. Should be used if the handler may drop a socket + * reference. + */ +static inline void process_cpl_msg_ref(void (*fn) (struct sock *, + struct sk_buff *), + struct sock *sk, struct sk_buff *skb) +{ + sock_hold(sk); + process_cpl_msg(fn, sk, skb); + sock_put(sk); +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +static inline void t3_set_ca_ops(struct sock *sk, + struct tcp_congestion_ops *t_ops) +{ + inet_csk(sk)->icsk_ca_ops = t_ops; +} + +/* + * Returns true if a socket cannot accept new Rx data. 
+ */ +static inline int sk_no_receive(const struct sock *sk) +{ + return (sk->sk_shutdown & RCV_SHUTDOWN); +} + +/* + * Returns true if we need to explicitly request RST when we receive new data + * on an RX-closed connection. + */ +static inline int need_rst_on_excess_rx(const struct sock *sk) +{ + return 1; +} + +/* + * A helper function that aborts a connection and increments the given MIB + * counter. The supplied skb is used to generate the ABORT_REQ message if + * possible. Must be called with softirqs disabled. + */ +static inline void abort_conn(struct sock *sk, struct sk_buff *skb, int mib) +{ + struct sk_buff *abort_skb; + + abort_skb = __get_cpl_reply_skb(skb, sizeof(struct cpl_abort_req), + GFP_ATOMIC); + if (abort_skb) { + NET_INC_STATS_BH(mib); + t3_send_reset(sk, CPL_ABORT_SEND_RST, abort_skb); + } +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static inline int is_neg_adv_abort(unsigned int status) +{ + return (status == CPL_ERR_RTX_NEG_ADVICE + || status == CPL_ERR_PERSIST_NEG_ADVICE); +} + +/* + * Process a received packet with an unknown/unexpected CPL opcode. + */ +static int do_bad_cpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + printk(KERN_ERR "%s: received bad CPL command %u\n", cdev->name, + *skb->data); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; +} + +/* + * CPL handler functions. + * ====================== + */ + +/* + * Process a CPL_ACT_ESTABLISH message. + */ +static int do_act_establish(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct sock *sk = (struct sock *)ctx; + struct cxgb3i_tcp_data *cdata = CXGB3_TCP_DATA(cdev); + + /* + * It's OK if the TID is currently in use, the owning socket may have + * backlogged its last CPL message(s). Just take it away. 
+ */ + C3CN_TID(sk) = tid; + sk_insert_tid(cdata, sk, tid); + free_atid(cdev, atid); + + C3CN_QSET(sk) = G_QNUM(ntohl(skb->csum)); + + process_cpl_msg(sock_act_establish, sk, skb); + return 0; +} + +/* + * Process an ACT_OPEN_RPL CPL message. + */ +static int do_act_open_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct sock *sk = (struct sock *)ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + VALIDATE_SOCK(sk); + + if (cdev->type != T3A && act_open_has_tid(rpl->status)) + cxgb3_queue_tid_release(cdev, GET_TID(rpl)); + + process_cpl_msg_ref(active_open_failed, sk, skb); + return 0; +} + +/* + * Handler RX_ISCSI_HDR CPL messages. + */ +static int do_iscsi_hdr(struct t3cdev *t3dev, struct sk_buff *skb, void *ctx) +{ + struct sock *sk = (struct sock *)ctx; + VALIDATE_SOCK(sk); + process_cpl_msg(process_rx_iscsi_hdr, sk, skb); + return 0; +} + +/* + * Handler for TX_DATA_ACK CPL messages. + */ +static int do_wr_ack(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct sock *sk = (struct sock *)ctx; + + VALIDATE_SOCK(sk); + + process_cpl_msg(wr_ack, sk, skb); + return 0; +} + +/* + * Handler for PEER_CLOSE CPL messages. + */ +static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct sock *sk = (struct sock *)ctx; + + VALIDATE_SOCK(sk); + + process_cpl_msg_ref(do_peer_fin, sk, skb); + return 0; +} + +/* + * Handle an ABORT_REQ_RSS CPL message. + */ +static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + const struct cpl_abort_req_rss *req = cplhdr(skb); + struct sock *sk = (struct sock *)ctx; + + if (is_neg_adv_abort(req->status)) { + __kfree_skb(skb); + return 0; + } + + VALIDATE_SOCK(sk); + + /* + * Save the offload device in the skb, we may process this message + * after the socket has closed. + */ + BLOG_SKB_CB(skb)->cdev = C3CN_CDEV(sk); + + process_cpl_msg_ref(process_abort_req, sk, skb); + return 0; +} + +/* + * Handle an ABORT_RPL_RSS CPL message. 
+ */ +static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct sock *sk; + struct cpl_abort_rpl_rss *rpl = cplhdr(skb); + + /* + * Ignore replies to post-close aborts indicating that the abort was + * requested too late. These connections are terminated when we get + * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss + * arrives the TID is either no longer used or it has been recycled. + */ + if (rpl->status == CPL_ERR_ABORT_FAILED) { +discard: + __kfree_skb(skb); + return 0; + } + + sk = (struct sock *)ctx; + + /* + * Sometimes we've already closed the socket, e.g., a post-close + * abort races with ABORT_REQ_RSS, the latter frees the socket + * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, + * but FW turns the ABORT_REQ into a regular one and so we get + * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. + */ + if (!sk) + goto discard; + + process_cpl_msg_ref(process_abort_rpl, sk, skb); + return 0; +} + +/* + * Handler for CLOSE_CON_RPL CPL messages. + */ +static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct sock *sk = (struct sock *)ctx; + + VALIDATE_SOCK(sk); + + process_cpl_msg_ref(process_close_con_rpl, sk, skb); + return 0; +} + +/* + * Definitions and declarations for CPL message processing. 
+ * ======================================================== + */ + +static void make_established(struct sock *, u32, unsigned int); +static void fixup_and_send_ofo(struct sock *); +static void fixup_pending_writeq_buffers(struct sock *); +static void assign_rxopt(struct sock *, unsigned int); +static void t3_release_offload_resources(struct sock *); +static void act_open_retry_timer(unsigned long); +static void connection_done(struct sock *); +static void mk_act_open_req(struct sock *, struct sk_buff *, + unsigned int, const struct l2t_entry *); +static int act_open_rpl_status_to_errno(int); +static void handle_excess_rx(struct sock *, struct sk_buff *); +static void enter_timewait(struct sock *); +static int abort_status_to_errno(struct sock *, int, int *); +static void send_abort_rpl(struct sk_buff *, struct t3cdev *, int); +static struct sk_buff *get_cpl_reply_skb(struct sk_buff *, size_t, gfp_t); +static void t3_defer_reply(struct sk_buff *, struct t3cdev *, defer_handler_t); +static void send_deferred_abort_rpl(struct t3cdev *, struct sk_buff *); + +/* + * Dequeue and return the first unacknowledged's WR on a socket's pending list. + */ +static inline struct sk_buff *dequeue_wr(struct tcp_sock *tp) +{ + struct sk_buff *skb = tp->forward_skb_hint; + + if (likely(skb)) { + /* Don't bother clearing the tail */ + tp->forward_skb_hint = (struct sk_buff *)skb->sp; + skb->sp = NULL; + } + return skb; +} + +/* + * Return the first pending WR without removing it from the list. 
+ */ +static inline struct sk_buff *peek_wr(const struct tcp_sock *tp) +{ + return tp->forward_skb_hint; +} + +static inline void free_wr_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +static void purge_wr_queue(struct tcp_sock *tp) +{ + struct sk_buff *skb; + while ((skb = dequeue_wr(tp)) != NULL) + free_wr_skb(skb); +} + +static inline void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid, + int cmd) +{ + struct cpl_abort_rpl *rpl = cplhdr(skb); + + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + rpl->cmd = cmd; +} + +/* + * CPL message processing ... + * ========================== + */ + +/* + * Updates socket state from an active establish CPL message. Runs with the + * socket lock held. + */ +static void sock_act_establish(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_act_establish *req = cplhdr(skb); + u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ + struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(sk->sk_state != TCP_SYN_SENT)) + printk(KERN_ERR "TID %u expected SYN_SENT, found %d\n", + C3CN_TID(sk), sk->sk_state); + + tp->rcv_tstamp = tcp_time_stamp; + C3CN_DELAK_SEQ(sk) = tp->copied_seq = tp->rcv_wup = tp->rcv_nxt = + rcv_isn; + make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + +#ifdef CONFIG_SECURITY_NETWORK + security_inet_conn_established(sk, tcphdr_skb); +#endif + + /* + * Now that we finally have a TID send any CPL messages that we had to + * defer for lack of a TID. + */ + if (skb_queue_len(&tp->out_of_order_queue)) + fixup_and_send_ofo(sk); + + if (likely(!sock_flag(sk, SOCK_DEAD))) { + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + } + + __kfree_skb(skb); + + /* + * Currently the send queue must be empty at this point because the + * socket layer does not send anything before a connection is + * established. 
To be future proof though we handle the possibility + * that there are pending buffers to send (either TX_DATA or + * CLOSE_CON_REQ). First we need to adjust the sequence number of the + * buffers according to the just learned write_seq, and then we send + * them on their way. + */ + fixup_pending_writeq_buffers(sk); + if (t3_push_frames(sk, 1)) + sk->sk_write_space(sk); +} + +/* + * Handle active open failures. + */ +static void active_open_failed(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_act_open_rpl *rpl = cplhdr(skb); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (rpl->status == CPL_ERR_CONN_EXIST && + icsk->icsk_retransmit_timer.function != act_open_retry_timer) { + icsk->icsk_retransmit_timer.function = act_open_retry_timer; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + jiffies + HZ / 2); + } else + fail_act_open(sk, act_open_rpl_status_to_errno(rpl->status)); + __kfree_skb(skb); +} + +/* + * Process received pdu for a connection. + */ +static void process_rx_iscsi_hdr(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct cpl_iscsi_hdr *hdr_cpl = cplhdr(skb); + struct cpl_iscsi_hdr_norss data_cpl; + struct cpl_rx_data_ddp_norss ddp_cpl; + unsigned int hdr_len, data_len, status; + unsigned int len; + int err; + + if (unlikely(sk_no_receive(sk))) { + handle_excess_rx(sk, skb); + return; + } + + CXGB3_TCP_SKB_CB(skb)->seq = ntohl(hdr_cpl->seq); + CXGB3_TCP_SKB_CB(skb)->flags = 0; + +#if VALIDATE_SEQ + if (unlikely(CXGB3_TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) { + printk(KERN_ERR "%s: TID %u: Bad seq %u, expected %u\n", + C3CN_CDEV(sk)->name, C3CN_TID(sk), + CXGB3_TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + goto done; + } +#endif + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(struct cpl_iscsi_hdr)); + + len = hdr_len = ntohs(hdr_cpl->len); + /* msg coalesce is off or not enough data received */ + if (skb->len <= hdr_len) { + printk(KERN_ERR "%s: TID %u, ISCSI_HDR, skb len %u < %u.\n", + 
C3CN_CDEV(sk)->name, C3CN_TID(sk), skb->len, hdr_len); + goto abort_conn; + } + + err = skb_copy_bits(skb, skb->len - sizeof(ddp_cpl), &ddp_cpl, + sizeof(ddp_cpl)); + if (err < 0) + goto abort_conn; + + skb_ulp_mode(skb) = ULP2_FLAG_DATA_READY; + skb_ulp_pdulen(skb) = ntohs(ddp_cpl.len); + skb_ulp_ddigest(skb) = ntohl(ddp_cpl.ulp_crc); + status = ntohl(ddp_cpl.ddp_status); + + if (status & (1 << RX_DDP_STATUS_HCRC_SHIFT)) + skb_ulp_mode(skb) |= ULP2_FLAG_HCRC_ERROR; + if (status & (1 << RX_DDP_STATUS_DCRC_SHIFT)) + skb_ulp_mode(skb) |= ULP2_FLAG_DCRC_ERROR; + if (status & (1 << RX_DDP_STATUS_PAD_SHIFT)) + skb_ulp_mode(skb) |= ULP2_FLAG_PAD_ERROR; + + if (skb->len > (hdr_len + sizeof(ddp_cpl))) { + err = skb_copy_bits(skb, hdr_len, &data_cpl, sizeof(data_cpl)); + if (err < 0) + goto abort_conn; + data_len = ntohs(data_cpl.len); + len += sizeof(data_cpl) + data_len; + } else if (status & (1 << RX_DDP_STATUS_DDP_SHIFT)) + skb_ulp_mode(skb) |= ULP2_FLAG_DATA_DDPED; + + tcp_sk(sk)->rcv_nxt = ntohl(ddp_cpl.seq) + skb_ulp_pdulen(skb); + inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; + __pskb_trim(skb, len); + __skb_queue_tail(&sk->sk_receive_queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + return; + +abort_conn: + t3_send_reset(sk, CPL_ABORT_SEND_RST, NULL); +done: + __kfree_skb(skb); +} + +/* + * Process an acknowledgment of WR completion. Advance snd_una and send the + * next batch of work requests from the write queue. 
+ */ +static void wr_ack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct cpl_wr_ack *hdr = cplhdr(skb); + unsigned int credits = ntohs(hdr->credits); + u32 snd_una = ntohl(hdr->snd_una); + + C3CN_WR_AVAIL(sk) += credits; + if (C3CN_WR_UNACKED(sk) > C3CN_WR_MAX(sk) - C3CN_WR_AVAIL(sk)) + C3CN_WR_UNACKED(sk) = C3CN_WR_MAX(sk) - C3CN_WR_AVAIL(sk); + + while (credits) { + struct sk_buff *p = peek_wr(tp); + + if (unlikely(!p)) { + printk(KERN_ERR "%u WR_ACK credits for TID %u with " + "nothing pending, state %u\n", + credits, C3CN_TID(sk), sk->sk_state); + break; + } + if (unlikely(credits < p->csum)) { + p->csum -= credits; + break; + } else { + dequeue_wr(tp); + credits -= p->csum; + free_wr_skb(p); + } + } + + if (unlikely(before(snd_una, tp->snd_una))) { +#if VALIDATE_SEQ + struct t3cdev *cdev = C3CN_CDEV(sk); + + printk(KERN_ERR "%s: unexpected sequence # %u in WR_ACK " + "for TID %u, snd_una %u\n", cdev->name, snd_una, + C3CN_TID(sk), tp->snd_una); +#endif + goto out_free; + } + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + dst_confirm(sk->sk_dst_cache); + tp->rcv_tstamp = tcp_time_stamp; + if (tp->snd_una == tp->snd_nxt) + c3cn_reset_flag(sk, C3CN_TX_WAIT_IDLE); + } + + if (skb_queue_len(&sk->sk_write_queue) && t3_push_frames(sk, 0)) + sk->sk_write_space(sk); +out_free: + __kfree_skb(skb); +} + +/* + * Handle a peer FIN. 
+ */ +static void do_peer_fin(struct sock *sk, struct sk_buff *skb) +{ + int keep = 0, dead = sock_flag(sk, SOCK_DEAD); + + if (!is_t3a(C3CN_CDEV(sk)) && c3cn_flag(sk, C3CN_ABORT_RPL_PENDING)) + goto out; + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + tcp_set_state(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* + * If we've sent an abort_req we must have sent it too late, + * HW will send us a reply telling us so, and this peer_close + * is really the last message for this connection and needs to + * be treated as an abort_rpl, i.e., transition the connection + * to TCP_CLOSE (note that the host stack does this at the + * time of generating the RST but we must wait for HW). + * Otherwise we enter TIME_WAIT. + */ + t3_release_offload_resources(sk); + if (c3cn_flag(sk, C3CN_ABORT_RPL_PENDING)) + connection_done(sk); + else + enter_timewait(sk); + break; + default: + printk(KERN_ERR + "%s: TID %u received PEER_CLOSE in bad state %d\n", + C3CN_CDEV(sk)->name, C3CN_TID(sk), sk->sk_state); + } + + if (!dead) { + sk->sk_state_change(sk); + + /* Do not send POLL_HUP for half duplex close. */ + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, 1, POLL_HUP); + else + sk_wake_async(sk, 1, POLL_IN); + } +out: + if (!keep) + __kfree_skb(skb); +} + +/* + * Process abort requests. If we are waiting for an ABORT_RPL we ignore this + * request except that we need to reply to it. 
+ */ +static void process_abort_req(struct sock *sk, struct sk_buff *skb) +{ + int rst_status = CPL_ABORT_NO_RST; + const struct cpl_abort_req_rss *req = cplhdr(skb); + + if (!c3cn_flag(sk, C3CN_ABORT_REQ_RCVD)) { + c3cn_set_flag(sk, C3CN_ABORT_REQ_RCVD); + c3cn_set_flag(sk, C3CN_ABORT_SHUTDOWN); + __kfree_skb(skb); + return; + } + c3cn_reset_flag(sk, C3CN_ABORT_REQ_RCVD); + + /* + * Three cases to consider: + * a) We haven't sent an abort_req; close the connection. + * b) We have sent a post-close abort_req that will get to TP too late + * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will + * be ignored and the connection should be closed now. + * c) We have sent a regular abort_req that will get to TP too late. + * That will generate an abort_rpl with status 0, wait for it. + */ + if (!c3cn_flag(sk, C3CN_ABORT_RPL_PENDING) + || (is_t3a(C3CN_CDEV(sk)) + && c3cn_flag(sk, C3CN_CLOSE_CON_REQUESTED))) { + sk->sk_err = + abort_status_to_errno(sk, req->status, &rst_status); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + t3_release_offload_resources(sk); + connection_done(sk); + } + + send_abort_rpl(skb, BLOG_SKB_CB(skb)->cdev, rst_status); +} + +/* + * Process abort replies. We only process these messages if we anticipate + * them as the coordination between SW and HW in this area is somewhat lacking + * and sometimes we get ABORT_RPLs after we are done with the connection that + * originated the ABORT_REQ. 
+ */ +static void process_abort_rpl(struct sock *sk, struct sk_buff *skb) +{ + if (c3cn_flag(sk, C3CN_ABORT_RPL_PENDING)) { + if (!c3cn_flag(sk, C3CN_ABORT_RPL_RCVD) + && !is_t3a(C3CN_CDEV(sk))) + c3cn_set_flag(sk, C3CN_ABORT_RPL_RCVD); + else { + c3cn_reset_flag(sk, C3CN_ABORT_RPL_RCVD); + c3cn_reset_flag(sk, C3CN_ABORT_RPL_PENDING); + if (!c3cn_flag(sk, C3CN_ABORT_REQ_RCVD) || + !is_t3a(C3CN_CDEV(sk))) { + BUG_ON(c3cn_flag(sk, C3CN_ABORT_REQ_RCVD)); + t3_release_offload_resources(sk); + connection_done(sk); + } + } + } + __kfree_skb(skb); +} + +/* + * Process a peer ACK to our FIN. + */ +static void process_close_con_rpl(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct cpl_close_con_rpl *rpl = cplhdr(skb); + + tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ + + if (!is_t3a(C3CN_CDEV(sk)) && c3cn_flag(sk, C3CN_ABORT_RPL_PENDING)) + goto out; + + switch (sk->sk_state) { + case TCP_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ + t3_release_offload_resources(sk); + if (c3cn_flag(sk, C3CN_ABORT_RPL_PENDING)) + connection_done(sk); + else + enter_timewait(sk); + break; + case TCP_LAST_ACK: + /* + * In this state we don't care about pending abort_rpl. + * If we've sent abort_req it was post-close and was sent too + * late, this close_con_rpl is the actual last message. + */ + t3_release_offload_resources(sk); + connection_done(sk); + break; + case TCP_FIN_WAIT1: + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + dst_confirm(sk->sk_dst_cache); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else if (tcp_sk(sk)->linger2 < 0 && + !c3cn_flag(sk, C3CN_ABORT_SHUTDOWN)) + abort_conn(sk, skb, LINUX_MIB_TCPABORTONLINGER); + break; + default: + printk(KERN_ERR + "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", + C3CN_CDEV(sk)->name, C3CN_TID(sk), sk->sk_state); + } +out: + kfree_skb(skb); +} + +/* + * Random utility functions for CPL message processing ... 
+ * ======================================================= + */ + +/** + * find_best_mtu - find the entry in the MTU table closest to an MTU + * @d: TOM state + * @mtu: the target MTU + * + * Returns the index of the value in the MTU table that is closest to but + * does not exceed the target MTU. + */ +static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return i; +} + +static unsigned int select_mss(struct sock *sk, unsigned int pmtu) +{ + unsigned int idx; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + struct t3cdev *cdev = C3CN_CDEV(sk); + const struct t3c_data *td = T3C_DATA(cdev); + + tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->advmss > tp->rx_opt.user_mss) + tp->advmss = tp->rx_opt.user_mss; + if (tp->advmss > pmtu - 40) + tp->advmss = pmtu - 40; + if (tp->advmss < td->mtus[0] - 40) + tp->advmss = td->mtus[0] - 40; + idx = find_best_mtu(td, tp->advmss + 40); + tp->advmss = td->mtus[idx] - 40; + inet_csk(sk)->icsk_pmtu_cookie = pmtu; + return idx; +} + +/* + * Determine the receive window size for a socket. + */ +static unsigned int select_rcv_wnd(struct sock *sk) +{ + struct t3cdev *cdev = C3CN_CDEV(sk); + struct cxgb3i_tcp_data *cdata = CXGB3_TCP_DATA(cdev); + unsigned int wnd = tcp_full_space(sk); + unsigned int max_rcv_wnd; + + /* + * For receive coalescing to work effectively we need a receive window + * that can accomodate a coalesced segment. + */ + if (wnd < MIN_RCV_WND) + wnd = MIN_RCV_WND; + + max_rcv_wnd = (cdev->type < T3C + ? 
(u32) cdata->rx_page_size * 23 : MAX_RCV_WND); + + return min(wnd, max_rcv_wnd); +} + +static void fail_act_open(struct sock *sk, int errno) +{ + sk->sk_err = errno; + sk->sk_error_report(sk); + t3_release_offload_resources(sk); + connection_done(sk); + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); +} + +static void pivot_ca_ops(struct sock *sk, int cong) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + module_put(icsk->icsk_ca_ops->owner); + icsk->icsk_ca_ops = &t3_cong_ops[cong < 0 ? 2 : cong]; +} + +/* + * Assign offload parameters to some socket fields. This code is used by + * both active and passive opens. + */ +static void init_offload_sk(struct sock *sk, struct t3cdev *cdev, + struct dst_entry *dst) +{ + struct tcp_sock *tp = tcp_sk(sk); + + BUG_ON(C3CN_CDEV(sk) != cdev); + C3CN_WR_MAX(sk) = C3CN_WR_AVAIL(sk) = CXGB3_TCP_TUNABLE(cdev, max_wrs); + C3CN_WR_UNACKED(sk) = 0; + C3CN_DELAK_MODE(sk) = 0; + C3CN_MSS_IDX(sk) = select_mss(sk, dst_mtu(dst)); + tp->rcv_wnd = select_rcv_wnd(sk); + + C3CN_CTRL_SKB_CACHE(sk) = alloc_skb(CTRL_SKB_LEN, gfp_any()); + reset_wr_list(tp); + + if (!tp->window_clamp) + tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + pivot_ca_ops(sk, CXGB3_TCP_TUNABLE(cdev, cong_alg)); +} + +/* + * Returns whether a CPL message is not expected in the socket backlog of a + * closed connection. Most messages are illegal at that point except + * ABORT_RPL_RSS and GET_TCB_RPL sent by DDP. + */ +static int bad_backlog_msg(unsigned int opcode) +{ + return opcode != CPL_ABORT_RPL_RSS && opcode != CPL_GET_TCB_RPL; +} + +/* + * Called for each sk_buff in a socket's receive backlog during + * backlog processing. 
+ */ +static int t3_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ +#if VALIDATE_TID + unsigned int opcode = ntohl(skb->csum) >> 24; + + if (unlikely(sk->sk_state == TCP_CLOSE && bad_backlog_msg(opcode))) { + printk(KERN_ERR "unexpected CPL message with opcode %x for " + "closed TID %u\n", opcode, C3CN_TID(sk)); + kfree_skb(skb); + return 0; + } +#endif + + BLOG_SKB_CB(skb)->backlog_rcv(sk, skb); + return 0; +} + +/* + * TCP socket write_space callback. Follows sk_stream_write_space(). + */ +static void t3_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (chelsio_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + + if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, 2, POLL_OUT); + } +} + +static void act_open_retry_timer(unsigned long data) +{ + struct sk_buff *skb; + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) /* try in a bit */ + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + jiffies + HZ / 20); + else { + skb = alloc_skb(sizeof(struct cpl_act_open_req), GFP_ATOMIC); + if (!skb) + fail_act_open(sk, ENOMEM); + else { + skb->sk = sk; + set_arp_failure_handler(skb, act_open_req_arp_failure); + mk_act_open_req(sk, skb, C3CN_TID(sk), C3CN_L2T(sk)); + l2t_send(C3CN_CDEV(sk), skb, C3CN_L2T(sk)); + } + } + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Called when we receive the last message from HW for a connection. A + * connection cannot transition to TCP_CLOSE prior to this event. + * Resources related to the offload state of a connection (e.g., L2T entries) + * must have been relinquished prior to calling this. 
+ */ +static void connection_done(struct sock *sk) +{ + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_all(sk->sk_sleep); + + tcp_done(sk); +} + +/* + * Convert an ACT_OPEN_RPL status to a Linux errno. + */ +static int act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return ECONNREFUSED; + case CPL_ERR_ARP_MISS: + return EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return ENOMEM; + case CPL_ERR_CONN_EXIST: + printk(KERN_ERR "ACTIVE_OPEN_RPL: 4-tuple in use\n"); + return EADDRINUSE; + default: + return EIO; + } +} + +/* + * Adapted from tcp_minisocks.c + */ + +void tcp_time_wait(struct sock *sk, int state, int timeo) +{ + struct inet_timewait_sock *tw = NULL; + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + int recycle_ok = 0; + + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) + tw = inet_twsk_alloc(sk, state); + + if (tw != NULL) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + + tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + tcptw->tw_rcv_nxt = tp->rcv_nxt; + tcptw->tw_snd_nxt = tp->snd_nxt; + tcptw->tw_rcv_wnd = tcp_receive_window(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + if (recycle_ok) { + tw->tw_timeout = rto; + } else { + tw->tw_timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + inet_twsk_schedule(tw, &tcp_death_row, timeo, TCP_TIMEWAIT_LEN); + inet_twsk_put(tw); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. 
+ */ + if (net_ratelimit()) + printk(KERN_INFO + "TCP: time wait bucket table overflow\n"); + } + + tcp_done(sk); +} + +/* + * Move a socket to TIME_WAIT state. We need to make some adjustments to the + * socket state before calling tcp_time_wait to comply with its expectations. + */ +static void enter_timewait(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* + * Bump rcv_nxt for the peer FIN. We don't do this at the time we + * process peer_close because we don't want to carry the peer FIN in + * the socket's receive queue and if we increment rcv_nxt without + * having the FIN in the receive queue we'll confuse facilities such + * as SIOCINQ. + */ + tp->rcv_nxt++; + + tp->rx_opt.ts_recent_stamp = 0; /* defeat recycling */ + tp->srtt = 0; /* defeat tcp_update_metrics */ + tcp_time_wait(sk, TCP_TIME_WAIT, 0); /* calls tcp_done */ +} + +/* + * Convert the status code of an ABORT_REQ into a Linux error code. Also + * indicate whether RST should be sent in response. + */ +static int abort_status_to_errno(struct sock *sk, int abort_reason, + int *need_rst) +{ + switch (abort_reason) { + case CPL_ERR_BAD_SYN: + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); /* fall through */ + case CPL_ERR_CONN_RESET: + return sk->sk_state == TCP_CLOSE_WAIT ? EPIPE : ECONNRESET; + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); + return ETIMEDOUT; + default: + return EIO; + } +} + +static void send_abort_rpl(struct sk_buff *skb, struct t3cdev *cdev, + int rst_status) +{ + struct sk_buff *reply_skb; + struct cpl_abort_req_rss *req = cplhdr(skb); + + reply_skb = get_cpl_reply_skb(skb, sizeof(struct cpl_abort_rpl), + gfp_any()); + if (!reply_skb) { + /* Defer the reply. Stick rst_status into req->cmd. 
*/ + req->status = rst_status; + t3_defer_reply(skb, cdev, send_deferred_abort_rpl); + return; + } + + reply_skb->priority = CPL_PRIORITY_DATA; + set_abort_rpl_wr(reply_skb, GET_TID(req), rst_status); + kfree_skb(skb); + cxgb3_ofld_send(cdev, reply_skb); +} + +/* + * Returns an sk_buff for a reply CPL message of size len. If the input + * sk_buff has no other users it is trimmed and reused, otherwise a new buffer + * is allocated. The input skb must be of size at least len. Note that this + * operation does not destroy the original skb data even if it decides to reuse + * the buffer. + */ +static struct sk_buff *get_cpl_reply_skb(struct sk_buff *skb, size_t len, + gfp_t gfp) +{ + if (likely(!skb_cloned(skb))) { + BUG_ON(skb->len < len); + __skb_trim(skb, len); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + if (skb) + __skb_put(skb, len); + } + return skb; +} + +/* + * Add an skb to the deferred skb queue for processing from process context. + */ +static void t3_defer_reply(struct sk_buff *skb, struct t3cdev *cdev, + defer_handler_t handler) +{ + struct cxgb3i_tcp_data *cdata = CXGB3_TCP_DATA(cdev); + + DEFERRED_SKB_CB(skb)->handler = handler; + spin_lock_bh(&cdata->deferq.lock); + __skb_queue_tail(&cdata->deferq, skb); + if (skb_queue_len(&cdata->deferq) == 1) + schedule_work(&cdata->deferq_task); + spin_unlock_bh(&cdata->deferq.lock); +} + +/* + * Process the defer queue. 
 *
 * Work handler: drains cdata->deferq, calling each skb's stored handler.
 * The queue lock is released around every handler call and re-taken
 * before the next dequeue.
 */
static void process_deferq(struct work_struct *task_param)
{
	struct sk_buff *skb;
	struct cxgb3i_tcp_data *cdata = container_of(task_param,
						     struct cxgb3i_tcp_data,
						     deferq_task);

	spin_lock_bh(&cdata->deferq.lock);
	while ((skb = __skb_dequeue(&cdata->deferq)) != NULL) {
		spin_unlock_bh(&cdata->deferq.lock);
		DEFERRED_SKB_CB(skb)->handler(cdata->cdev, skb);
		spin_lock_bh(&cdata->deferq.lock);
	}
	spin_unlock_bh(&cdata->deferq.lock);
}

/*
 * Deferred counterpart of send_abort_rpl(): builds and sends the ABORT_RPL
 * for the queued request @skb, then frees the request.  The RST status is
 * read back from req->status, where send_abort_rpl() stored it.  The reply
 * is allocated with __GFP_NOFAIL, so its result is not checked.
 */
static void send_deferred_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb)
{
	struct sk_buff *reply_skb;
	struct cpl_abort_req_rss *req = cplhdr(skb);

	reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl),
			      GFP_KERNEL | __GFP_NOFAIL);
	reply_skb->priority = CPL_PRIORITY_DATA;
	__skb_put(reply_skb, sizeof(struct cpl_abort_rpl));
	set_abort_rpl_wr(reply_skb, GET_TID(req), req->status);
	cxgb3_ofld_send(cdev, reply_skb);
	kfree_skb(skb);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 *
 * Does nothing if the connection has no offload device attached; on
 * completion the device pointer is cleared, so a second call is a no-op.
 */
static void t3_release_offload_resources(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct t3cdev *cdev = C3CN_CDEV(sk);
	unsigned int tid = C3CN_TID(sk);

	if (!cdev)
		return;

	C3CN_QSET(sk) = 0;

	kfree_skb(C3CN_CTRL_SKB_CACHE(sk));
	C3CN_CTRL_SKB_CACHE(sk) = NULL;

	/* Fewer WR credits available than the maximum: WRs are outstanding. */
	if (C3CN_WR_AVAIL(sk) != C3CN_WR_MAX(sk)) {
		purge_wr_queue(tp);
		reset_wr_list(tp);
	}

	if (C3CN_L2T(sk)) {
		l2t_release(L2DATA(cdev), C3CN_L2T(sk));
		C3CN_L2T(sk) = NULL;
	}

	if (sk->sk_state == TCP_SYN_SENT) {	/* we have ATID */
		free_atid(cdev, tid);
		__skb_queue_purge(&tp->out_of_order_queue);
	} else {				/* we have TID */
		cxgb3_remove_tid(cdev, (void *)sk, tid);
		sock_put(sk);
	}

	t3_set_ca_ops(sk, &tcp_init_congestion_ops);
	C3CN_CDEV(sk) = NULL;
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 *
 * Aborts the connection when need_rst_on_excess_rx() says so and no abort
 * shutdown is already flagged; the skb is freed in every case.
 */
static void handle_excess_rx(struct sock *sk, struct sk_buff *skb)
{
	if (need_rst_on_excess_rx(sk) && !c3cn_flag(sk, C3CN_ABORT_SHUTDOWN))
		abort_conn(sk, skb, LINUX_MIB_TCPABORTONDATA);

	kfree_skb(skb);		/* can't use __kfree_skb here */
}

/*
 * Like get_cpl_reply_skb() but the returned buffer starts out empty.
 *
 * The input skb is reused (trimmed to length 0, with an extra reference
 * taken) only when it is neither cloned nor carrying paged data; otherwise
 * a fresh skb of capacity @len is allocated.  May return NULL if that
 * allocation fails.
 */
static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *skb, size_t len,
					   gfp_t gfp)
{
	if (likely(!skb_cloned(skb) && !skb->data_len)) {
		__skb_trim(skb, 0);
		skb_get(skb);
	} else
		skb = alloc_skb(len, gfp);
	return skb;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 * @opt carries the TCP options reported by HW and is passed on to
 * assign_rxopt().
 */
static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->pushed_seq = tp->write_seq = tp->snd_nxt = tp->snd_una = snd_isn;
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
	assign_rxopt(sk, opt);

	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		tp->rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dst_confirm(sk->sk_dst_cache);

	/*
	 * tcp_poll() does not lock socket, make sure initial values are
	 * committed before changing to ESTABLISHED.
	 */
	smp_mb();
	tcp_set_state(sk, TCP_ESTABLISHED);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.  The queue is empty when this returns.
 */
static void fixup_and_send_ofo(struct sock *sk)
{
	struct sk_buff *skb;
	struct t3cdev *cdev = C3CN_CDEV(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int tid = C3CN_TID(sk);

	while ((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(skb);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb3_ofld_send(cdev, skb);
	}
}

/*
 * Adjust buffers already in write queue after a SYN_SENT->ESTABLISHED
 * transition.  For TX_DATA we need to adjust the start sequence numbers, and
 * for other packets we need to adjust the TID.  TX_DATA packets don't have
 * headers yet and so not TIDs.
 *
 * tp->write_seq is advanced by the length (plus ULP extra length) of every
 * TX_DATA buffer walked.
 */
static void fixup_pending_writeq_buffers(struct sock *sk)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int tid = C3CN_TID(sk);

	skb_queue_walk(&sk->sk_write_queue, skb) {
		if (CXGB3_TCP_SKB_CB(skb)->flags & C3CB_FLAG_NEED_HDR) {
			CXGB3_TCP_SKB_CB(skb)->seq = tp->write_seq;
			tp->write_seq += skb->len + ulp_extra_len(skb);
		} else {
			struct cpl_close_con_req *p = cplhdr(skb);

			p->wr.wr_lo = htonl(V_WR_TID(tid));
			OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		}
	}
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to Linux's native format.
+ */ +static void assign_rxopt(struct sock *sk, unsigned int opt) +{ + const struct t3c_data *td = T3C_DATA(C3CN_CDEV(sk)); + struct tcp_sock *tp = tcp_sk(sk); + + tp->rx_opt.mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; + tp->mss_cache = tp->rx_opt.mss_clamp; + tp->tcp_header_len = sizeof(struct tcphdr); + tp->rx_opt.tstamp_ok = G_TCPOPT_TSTAMP(opt); + tp->rx_opt.sack_ok = G_TCPOPT_SACK(opt); + tp->rx_opt.wscale_ok = G_TCPOPT_WSCALE_OK(opt); + tp->rx_opt.snd_wscale = G_TCPOPT_SND_WSCALE(opt); + if (!tp->rx_opt.wscale_ok) + tp->rx_opt.rcv_wscale = 0; + if (tp->rx_opt.tstamp_ok) { + tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; + tp->mss_cache -= TCPOLEN_TSTAMP_ALIGNED; + } +} diff --git a/drivers/scsi/cxgb3i/cxgb3i_offload.h b/drivers/scsi/cxgb3i/cxgb3i_offload.h new file mode 100644 index 0000000..e4b34c7 --- /dev/null +++ b/drivers/scsi/cxgb3i/cxgb3i_offload.h @@ -0,0 +1,259 @@ +/* + * Copyright (C) 2003-2008 Chelsio Communications. All rights reserved. + * + * Written by Dimitris Michailidis (dm@xxxxxxxxxxx) + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this + * release for licensing terms and conditions. + */ + +#ifndef _CXGB3I_OFFLOAD_H +#define _CXGB3I_OFFLOAD_H + +#include <linux/skbuff.h> +#include <net/tcp.h> + +#include "t3cdev.h" +#include "cxgb3_offload.h" + +/* + * Data structure to keep track of cxgb3 connection. Linked off of the + * (struct sock *). + */ +struct cxgb3i_tcp_conn { + struct net_device *dev; + struct t3cdev *cdev; + unsigned long flags; + int tid; + int qset; + int mss_idx; + struct l2t_entry *l2t; + int ulp_mode; + int delack_mode; + int delack_seq; + int wr_max; + int wr_avail; + int wr_unacked; + struct sk_buff *ctrl_skb_cache; + + /* + * Upper Layer Protocol (ULP) state. 
These could probably go into a + * union based on mutually independent ULP modes but so far it's not + * worth the effort. + */ +}; +#define CXGB3_TCP_CONN(sk) \ + (*(struct cxgb3i_tcp_conn **)&(sk)->sk_protinfo) + +#define C3CN_CDEV(sk) (CXGB3_TCP_CONN(sk)->cdev) +#define C3CN_TID(sk) (CXGB3_TCP_CONN(sk)->tid) +#define C3CN_QSET(sk) (CXGB3_TCP_CONN(sk)->qset) +#define C3CN_MSS_IDX(sk) (CXGB3_TCP_CONN(sk)->mss_idx) +#define C3CN_L2T(sk) (CXGB3_TCP_CONN(sk)->l2t) +#define C3CN_ULP_MODE(sk) (CXGB3_TCP_CONN(sk)->ulp_mode) +#define C3CN_DELAK_MODE(sk) (CXGB3_TCP_CONN(sk)->delack_mode) +#define C3CN_DELAK_SEQ(sk) (CXGB3_TCP_CONN(sk)->delack_seq) +#define C3CN_WR_MAX(sk) (CXGB3_TCP_CONN(sk)->wr_max) +#define C3CN_WR_AVAIL(sk) (CXGB3_TCP_CONN(sk)->wr_avail) +#define C3CN_WR_UNACKED(sk) (CXGB3_TCP_CONN(sk)->wr_unacked) +#define C3CN_CTRL_SKB_CACHE(sk) (CXGB3_TCP_CONN(sk)->ctrl_skb_cache) + +/* + * Connection flags -- many to track some close related events. + */ +enum c3cn_flags { + C3CN_OFFLOADED, /* connection offloaded */ + C3CN_ABORT_RPL_RCVD, /* received one ABORT_RPL_RSS message */ + C3CN_ABORT_REQ_RCVD, /* received one ABORT_REQ_RSS message */ + C3CN_TX_MORE_DATA, /* don't set the SHOVE bit */ + C3CN_TX_WAIT_IDLE, /* suspend Tx until in-flight data is ACKed */ + C3CN_ABORT_SHUTDOWN, /* shouldn't send more abort requests */ + C3CN_ABORT_RPL_PENDING, /* expecting an abort reply */ + C3CN_CLOSE_CON_REQUESTED, /* we've sent a close_conn_req */ + C3CN_TX_DATA_SENT, /* already sent a TX_DATA WR */ + C3CN_TX_FAILOVER /* Tx traffic failing over */ +}; + +static inline void c3cn_set_flag(struct sock *sk, enum c3cn_flags flag) +{ + __set_bit(flag, &CXGB3_TCP_CONN(sk)->flags); +} + +static inline void c3cn_reset_flag(struct sock *sk, enum c3cn_flags flag) +{ + __clear_bit(flag, &CXGB3_TCP_CONN(sk)->flags); +} + +static inline int c3cn_flag(struct sock *sk, enum c3cn_flags flag) +{ + struct cxgb3i_tcp_conn *c3cn = CXGB3_TCP_CONN(sk); + + if (c3cn == NULL) + return 0; + return 
test_bit(flag, &CXGB3_TCP_CONN(sk)->flags); +} + +/* + * "Tunables" for each t3cdev. + */ +struct cxgb3i_tcp_tunables { + int max_host_sndbuf; /* max host RAM consumed by a sndbuf */ + int max_wrs; /* max # of outstanding WRs per connection */ + int rx_credit_thres; /* min # of RX credits needed for RX_DATA_ACK */ + int cong_alg; /* Congestion control algorithm */ + int delack; /* delayed ACK control */ + int tcp_window_scaling; +}; + +/* + * Per adapter data. Linked off of each Ethernet device port on the adapter. + * Also available via the t3cdev structure since we have pointers to our port + * net_device's there ... + */ +struct cxgb3i_tcp_data { + struct list_head list; + struct t3cdev *cdev; + struct cxgb3_client *client; + struct adap_ports *ports; + struct cxgb3i_tcp_tunables conf; + unsigned int rx_page_size; + struct sk_buff_head deferq; + struct work_struct deferq_task; +}; +#define NDEV2CDATA(ndev) (*(struct cxgb3i_tcp_data **)&(ndev)->ec_ptr) +#define CXGB3_TCP_DATA(cdev) NDEV2CDATA((cdev)->lldev) +#define CXGB3_TCP_TUNABLE(cdev, param) ((CXGB3_TCP_DATA(cdev))->conf.param) + +/* + * Primary API routines. + */ + +int cxgb3i_tcp_init(cxgb3_cpl_handler_func *); +void cxgb3i_tcp_add(struct t3cdev *, struct cxgb3_client *); +void cxgb3i_tcp_remove(struct t3cdev *); + +int cxgb3i_tcp_connect(struct socket *, struct sockaddr *, int, int); +void cxgb3i_tcp_cleanup_rbuf(struct sock *, int); +int cxgb3i_tcp_sendskb(struct sock *, struct sk_buff *, int); + +/* + * Offload type IDs. + */ +enum { + TOE_ID_CHELSIO_T1 = 1, + TOE_ID_CHELSIO_T1C, + TOE_ID_CHELSIO_T2, + TOE_ID_CHELSIO_T3, + TOE_ID_CHELSIO_T3B, + TOE_ID_CHELSIO_T3C, +}; + +/* + * Definitions for sk_buff state and ULP mode management. 
+ */ + +struct cxgb3_skb_cb { + __u8 flags; /* TCP-like flags */ + __u8 ulp_mode; /* ULP mode/submode of sk_buff */ + __u32 seq; /* TCP sequence number */ + union { /* ULP-specific fields */ + struct { + __u32 ddigest; /* ULP rx_data_ddp selected field */ + __u32 pdulen; /* ULP rx_data_ddp selected field */ + } iscsi; + } ulp; + __u8 ulp_data[16]; /* scratch area for ULP */ +}; + +#define CXGB3_TCP_SKB_CB(skb) ((struct cxgb3_skb_cb *)&((skb)->cb[0])) + +#define skb_ulp_mode(skb) (CXGB3_TCP_SKB_CB(skb)->ulp_mode) +#define skb_ulp_ddigest(skb) (CXGB3_TCP_SKB_CB(skb)->ulp.iscsi.ddigest) +#define skb_ulp_pdulen(skb) (CXGB3_TCP_SKB_CB(skb)->ulp.iscsi.pdulen) +#define skb_ulp_data(skb) (CXGB3_TCP_SKB_CB(skb)->ulp_data) + +#define skb_ulp_lhdr(sk) (C3CN_ULP_LHDR(sk)) + +enum { + C3CB_FLAG_NEED_HDR = 1 << 0, /* packet needs a TX_DATA_WR header */ + C3CB_FLAG_NO_APPEND = 1 << 1, /* don't grow this skb */ + C3CB_FLAG_BARRIER = 1 << 2, /* set TX_WAIT_IDLE after sending */ + C3CB_FLAG_HOLD = 1 << 3, /* skb not ready for Tx yet */ + C3CB_FLAG_COMPL = 1 << 4, /* request WR completion */ + C3CB_FLAG_URG = 1 << 5, /* TCP URG */ +}; + +/* + * Definitions for managing deferred CPL replies from process context. + */ + +typedef void (*defer_handler_t) (struct t3cdev *, struct sk_buff *); + +struct deferred_skb_cb { + defer_handler_t handler; + struct t3cdev *cdev; +}; + +#define DEFERRED_SKB_CB(skb) ((struct deferred_skb_cb *)(skb)->cb) + +/* + * Definitions for socket backlog processing ... + */ + +/* Per-skb backlog handler. Run when a socket's backlog is processed. */ +struct blog_skb_cb { + void (*backlog_rcv) (struct sock *, struct sk_buff *); + struct t3cdev *cdev; +}; + +#define BLOG_SKB_CB(skb) ((struct blog_skb_cb *)(skb)->cb) + +#define SET_BLOG_CPL_HANDLER(skb, hnd) BLOG_SKB_CB(skb)->backlog_rcv = (hnd) + +#define GL_SKB(skb) (skb)->sp + +/* + * Top-level CPL message processing used by most CPL messages that + * pertain to connections. 
+ */ +static inline void process_cpl_msg(void (*fn)(struct sock *, struct sk_buff *), + struct sock *sk, struct sk_buff *skb) +{ + bh_lock_sock(sk); + if (unlikely(sock_owned_by_user(sk))) { + SET_BLOG_CPL_HANDLER(skb, fn); + sk_add_backlog(sk, skb); + } else + fn(sk, skb); + bh_unlock_sock(sk); +} + +/* + * Opaque version of structure the SGE stores at skb->head of TX_DATA packets + * and for which we must reserve space. + */ +struct sge_opaque_hdr { + void *dev; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; +}; + +/* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */ +#define TX_HEADER_LEN \ + (sizeof(struct tx_data_wr) + sizeof(struct sge_opaque_hdr)) + +/* + * Useful utility functions and inlines. + */ + +/* + * Returns true if the socket is in one of the supplied states. + */ +static inline unsigned int sk_in_state(const struct sock *sk, + unsigned int states) +{ + return states & (1 << sk->sk_state); +} + +#endif /* _CXGB3_TCP_H */ diff --git a/drivers/scsi/cxgb3i/cxgb3i_ulp2.c b/drivers/scsi/cxgb3i/cxgb3i_ulp2.c new file mode 100644 index 0000000..2f52930 --- /dev/null +++ b/drivers/scsi/cxgb3i/cxgb3i_ulp2.c @@ -0,0 +1,722 @@ +/* + * cxgb3i_ddp.c: Chelsio S3xx iSCSI driver. + * + * Copyright (c) 2008 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ * + * Written by: Karen Xie (kxie@xxxxxxxxxxx) + */ + +#include <linux/skbuff.h> +#include "cxgb3i.h" +#include "cxgb3i_ulp2.h" + +static struct page *pad_page; + +#define ULP2_PGIDX_MAX 4 +#define ULP2_4K_PAGE_SHIFT 12 +#define ULP2_4K_PAGE_MASK (~((1UL << ULP2_4K_PAGE_SHIFT) - 1)) +static unsigned char ddp_page_order[ULP2_PGIDX_MAX]; +static unsigned long ddp_page_size[ULP2_PGIDX_MAX]; +static unsigned char ddp_page_shift[ULP2_PGIDX_MAX]; +static unsigned char sw_tag_idx_bits; +static unsigned char sw_tag_age_bits; + +static void cxgb3i_ddp_page_init(void) +{ + int i; + unsigned long n = PAGE_SIZE >> ULP2_4K_PAGE_SHIFT; + + if (PAGE_SIZE & (~ULP2_4K_PAGE_MASK)) { + cxgb3i_log_warn("PAGE_SIZE 0x%lx is not multiple of 4K, " + "ddp disabled.\n", PAGE_SIZE); + return; + } + n = __ilog2_u32(n); + for (i = 0; i < ULP2_PGIDX_MAX; i++, n++) { + ddp_page_order[i] = n; + ddp_page_shift[i] = ULP2_4K_PAGE_SHIFT + n; + ddp_page_size[i] = 1 << ddp_page_shift[i]; + cxgb3i_log_debug("%d, order %u, shift %u, size 0x%lx.\n", i, + ddp_page_order[i], ddp_page_shift[i], + ddp_page_size[i]); + } + + sw_tag_idx_bits = (__ilog2_u32(ISCSI_ITT_MASK)) + 1; + sw_tag_age_bits = (__ilog2_u32(ISCSI_AGE_MASK)) + 1; +} + +static inline void ulp_mem_io_set_hdr(struct sk_buff *skb, unsigned int addr) +{ + struct ulp_mem_io *req = (struct ulp_mem_io *)skb->head; + req->wr.wr_lo = 0; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE >> 5) | + V_ULPTX_NFLITS((PPOD_SIZE >> 3) + 1)); +} + +static int set_ddp_map(struct cxgb3i_adapter *snic, struct pagepod_hdr *hdr, + unsigned int idx, unsigned int npods, + struct scatterlist *sgl, unsigned int sgcnt) +{ + struct cxgb3i_ddp_info *ddp = &snic->ddp; + struct scatterlist *sg = sgl; + unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit; + int i; + + for (i = 0; i < npods; i++, pm_addr += PPOD_SIZE) { + 
struct sk_buff *skb; + struct pagepod *ppod; + int j, k; + skb = + alloc_skb(sizeof(struct ulp_mem_io) + PPOD_SIZE, + GFP_ATOMIC); + if (!skb) + return -ENOMEM; + skb_put(skb, sizeof(struct ulp_mem_io) + PPOD_SIZE); + + ulp_mem_io_set_hdr(skb, pm_addr); + ppod = + (struct pagepod *)(skb->head + sizeof(struct ulp_mem_io)); + memcpy(&(ppod->hdr), hdr, sizeof(struct pagepod)); + for (j = 0, k = i * 4; j < 5; j++, k++) { + if (k < sgcnt) { + ppod->addr[j] = cpu_to_be64(sg_dma_address(sg)); + if (j < 4) + sg = sg_next(sg); + } else + ppod->addr[j] = 0UL; + } + + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(snic->tdev, skb); + } + return 0; +} + +static int clear_ddp_map(struct cxgb3i_adapter *snic, unsigned int idx, + unsigned int npods) +{ + struct cxgb3i_ddp_info *ddp = &snic->ddp; + unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit; + int i; + + for (i = 0; i < npods; i++, pm_addr += PPOD_SIZE) { + struct sk_buff *skb; + skb = + alloc_skb(sizeof(struct ulp_mem_io) + PPOD_SIZE, + GFP_ATOMIC); + if (!skb) + return -ENOMEM; + skb_put(skb, sizeof(struct ulp_mem_io) + PPOD_SIZE); + memset((skb->head + sizeof(struct ulp_mem_io)), 0, PPOD_SIZE); + ulp_mem_io_set_hdr(skb, pm_addr); + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(snic->tdev, skb); + } + return 0; +} + +static int cxgb3i_ddp_sgl_check(struct scatterlist *sgl, unsigned int sgcnt) +{ + struct scatterlist *sg; + int i; + + /* make sure the sgl is fit for ddp: + * each has the same page size, and + * first & last page do not need to be used completely, and + * the rest of page must be used completely + */ + for_each_sg(sgl, sg, sgcnt, i) { + if ((i && sg->offset) || + ((i != sgcnt - 1) && + (sg->length + sg->offset) != PAGE_SIZE)) + return -EINVAL; + } + + return 0; +} + +static inline int ddp_find_unused_entries(struct cxgb3i_ddp_info *ddp, + int start, int max, int count) +{ + unsigned int i, j; + + spin_lock(&ddp->map_lock); + for (i = start; i <= max;) { + for (j = 0; j < 
count; j++) { + if (ddp->map[i + j]) + break; + } + if (j == count) { + memset(&ddp->map[i], 1, count); + spin_unlock(&ddp->map_lock); + return i; + } + i += j + 1; + } + spin_unlock(&ddp->map_lock); + return -EBUSY; +} + +static inline void ddp_unmark_entries(struct cxgb3i_ddp_info *ddp, + int start, int count) +{ + spin_lock(&ddp->map_lock); + memset(&ddp->map[start], 0, count); + spin_unlock(&ddp->map_lock); +} + +u32 cxgb3i_ddp_tag_reserve(struct cxgb3i_adapter *snic, unsigned int tid, + u32 sw_tag, unsigned int xferlen, + struct scatterlist *sgl, unsigned int sgcnt) +{ + struct cxgb3i_ddp_info *ddp = &snic->ddp; + struct pagepod_hdr hdr; + unsigned int npods; + int idx = -1, idx_max; + u32 tag; + int err; + + if (!ddp || !sgcnt || xferlen < PAGE_SIZE) + return RESERVED_ITT; + + err = cxgb3i_ddp_sgl_check(sgl, sgcnt); + if (err < 0) + return RESERVED_ITT; + + npods = (sgcnt + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT; + idx_max = ddp->nppods - npods + 1; + + if (ddp->idx_last == ddp->nppods) + idx = ddp_find_unused_entries(ddp, 0, idx_max, npods); + else { + idx = ddp_find_unused_entries(ddp, ddp->idx_last + 1, idx_max, + npods); + if ((idx < 0) && (ddp->idx_last >= npods)) + idx = ddp_find_unused_entries(ddp, 0, + ddp->idx_last - npods + 1, + npods); + } + if (idx < 0) + return RESERVED_ITT; + + if (pci_map_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE) <= 0) + goto unmark_entries; + + tag = sw_tag | (idx << snic->tag_format.rsvd_shift); + + hdr.rsvd = 0; + hdr.vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); + hdr.pgsz_tag_clr = htonl(tag); + hdr.maxoffset = htonl(xferlen); + hdr.pgoffset = htonl(sgl->offset); + + if (set_ddp_map(snic, &hdr, idx, npods, sgl, sgcnt) < 0) + goto unmap_sgl; + + ddp->idx_last = idx; + cxgb3i_log_debug("tid 0x%x, xfer %u, 0x%x -> ddp tag 0x%x (%u, %u).\n", + tid, xferlen, sw_tag, tag, idx, npods); + return tag; + +unmap_sgl: + pci_unmap_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE); + +unmark_entries: + ddp_unmark_entries(ddp, idx, 
npods); + return RESERVED_ITT; +} + +void cxgb3i_ddp_tag_release(struct cxgb3i_adapter *snic, u32 tag, + struct scatterlist *sgl, unsigned int sgcnt) +{ + u32 idx = (tag >> snic->tag_format.rsvd_shift) & + snic->tag_format.rsvd_mask; + unsigned int npods = (sgcnt + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT; + + if (idx < snic->tag_format.rsvd_mask) { + cxgb3i_log_debug("ddp tag 0x%x, release idx 0x%x, npods %u.\n", + tag, idx, npods); + clear_ddp_map(snic, idx, npods); + ddp_unmark_entries(&snic->ddp, idx, npods); + pci_unmap_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE); + } +} + +int cxgb3i_conn_ulp_setup(struct cxgb3i_conn *cconn, int hcrc, int dcrc) +{ + struct iscsi_tcp_conn *tcp_conn = cconn->conn->dd_data; + struct cxgb3i_tcp_conn *c3cn = CXGB3_TCP_CONN(tcp_conn->sock->sk); + struct sk_buff *skb = alloc_skb(sizeof(struct cpl_set_tcb_field), + GFP_KERNEL | __GFP_NOFAIL); + struct cpl_set_tcb_field *req; + u32 submode = (hcrc ? 1 : 0) | (dcrc ? 2 : 0); + + /* set up ulp submode and page size */ + req = (struct cpl_set_tcb_field *)skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, c3cn->tid)); + req->reply = V_NO_REPLY(1); + req->cpu_idx = 0; + req->word = htons(31); + req->mask = cpu_to_be64(0xFF000000); + /* the connection page size is always the same as ddp-pgsz0 */ + req->val = cpu_to_be64(submode << 24); + skb->priority = CPL_PRIORITY_CONTROL; + + cxgb3_ofld_send(c3cn->cdev, skb); + return 0; +} + +static int cxgb3i_conn_read_pdu_skb(struct iscsi_conn *conn, + struct sk_buff *skb) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct iscsi_segment *segment = &tcp_conn->in.segment; + struct iscsi_hdr *hdr = (struct iscsi_hdr *)tcp_conn->in.hdr_buf; + unsigned char *buf = (unsigned char *)hdr; + unsigned int offset = sizeof(struct iscsi_hdr); + int err; + + cxgb3i_log_debug("conn 0x%p, skb 0x%p, len %u, flag 0x%x.\n", + conn, skb, skb->len, skb_ulp_mode(skb)); 
+ + /* read bhs */ + err = skb_copy_bits(skb, 0, buf, sizeof(struct iscsi_hdr)); + if (err < 0) + return err; + segment->copied = sizeof(struct iscsi_hdr); + /* read ahs */ + if (hdr->hlength) { + unsigned int ahslen = hdr->hlength << 2; + /* Make sure we don't overflow */ + if (sizeof(*hdr) + ahslen > sizeof(tcp_conn->in.hdr_buf)) + return -ISCSI_ERR_AHSLEN; + err = skb_copy_bits(skb, offset, buf + offset, ahslen); + if (err < 0) + return err; + offset += ahslen; + } + /* header digest */ + if (conn->hdrdgst_en) + offset += ISCSI_DIGEST_SIZE; + + /* check header digest */ + segment->status = (conn->hdrdgst_en && + (skb_ulp_mode(skb) & ULP2_FLAG_HCRC_ERROR)) ? + ISCSI_SEGMENT_DGST_ERR : 0; + + hdr->itt = ntohl(hdr->itt); + segment->total_copied = segment->total_size; + tcp_conn->in.hdr = hdr; + err = iscsi_tcp_hdr_dissect(conn, hdr); + if (err) + return err; + + if (tcp_conn->in.datalen) { + segment = &tcp_conn->in.segment; + segment->status = (conn->datadgst_en && + (skb_ulp_mode(skb) & ULP2_FLAG_DCRC_ERROR)) ? 
+ ISCSI_SEGMENT_DGST_ERR : 0; + if (skb_ulp_mode(skb) & ULP2_FLAG_DATA_DDPED) { + cxgb3i_log_debug("opcode 0x%x, data %u, ddp'ed.\n", + hdr->opcode & ISCSI_OPCODE_MASK, + tcp_conn->in.datalen); + segment->total_copied = segment->total_size; + } else { + cxgb3i_log_debug("opcode 0x%x, data %u, not ddp'ed.\n", + hdr->opcode & ISCSI_OPCODE_MASK, + tcp_conn->in.datalen); + offset += sizeof(struct cpl_iscsi_hdr_norss); + } + while (segment->total_copied < segment->total_size) { + iscsi_tcp_segment_map(segment, 1); + err = skb_copy_bits(skb, offset, segment->data, + segment->size); + iscsi_tcp_segment_unmap(segment); + if (err) + return err; + segment->total_copied += segment->size; + offset += segment->size; + + if (segment->total_copied < segment->total_size) + iscsi_tcp_segment_init_sg(segment, + sg_next(segment->sg), + 0); + } + err = segment->done(tcp_conn, segment); + } + return err; +} + +static inline void tx_skb_setmode(struct sk_buff *skb, int hcrc, int dcrc) +{ + u8 submode = 0; + if (hcrc) + submode |= 1; + if (dcrc) + submode |= 2; + skb_ulp_mode(skb) = (ULP_MODE_ISCSI << 4) | submode; +} + +int cxgb3i_conn_ulp2_xmit(struct iscsi_conn *conn) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct iscsi_segment *hdr_seg = &tcp_conn->out.segment; + struct iscsi_segment *data_seg = &tcp_conn->out.data_segment; + struct sock *sk = tcp_conn->sock->sk; + unsigned int hdrlen = hdr_seg->total_size; + unsigned int datalen = data_seg->total_size; + unsigned int padlen = iscsi_padding(datalen); + unsigned int copymax = SKB_MAX_HEAD(TX_HEADER_LEN); + unsigned int copylen; + struct sk_buff *skb; + unsigned char *dst; + int err = -EAGAIN; + + if (conn->suspend_tx) + return 0; + + if (data_seg->data && ((datalen + padlen) < copymax)) + copylen = hdrlen + datalen + padlen; + else + copylen = hdrlen; + + /* supports max. 
16K pdus, so one skb is enough to hold all the data */ + skb = alloc_skb(TX_HEADER_LEN + copylen, sk->sk_allocation); + if (!skb) + return -EAGAIN; + + skb_reserve(skb, TX_HEADER_LEN); + skb_put(skb, copylen); + dst = skb->data; + + tx_skb_setmode(skb, conn->hdrdgst_en, datalen ? conn->datadgst_en : 0); + + memcpy(dst, hdr_seg->data, hdrlen); + dst += hdrlen; + + if (!datalen) + goto send_pdu; + + if (data_seg->data) { + /* data is in a linear buffer */ + if (copylen > hdrlen) { + /* data fits in the skb's headroom */ + memcpy(dst, data_seg->data, datalen); + dst += datalen; + if (padlen) + memset(dst, 0, padlen); + } else { + unsigned int offset = 0; + while (datalen) { + struct page *page = + alloc_pages(sk->sk_allocation, 0); + int idx = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[idx]; + + if (!page) + goto free_skb; + + frag->page = page; + frag->page_offset = 0; + if (datalen > PAGE_SIZE) + frag->size = PAGE_SIZE; + else + frag->size = datalen; + memcpy(page_address(page), + data_seg->data + offset, frag->size); + + skb_shinfo(skb)->nr_frags++; + datalen -= frag->size; + offset += frag->size; + } + } + } else { + struct scatterlist *sg = data_seg->sg; + unsigned int offset = data_seg->sg_offset; + while (datalen) { + int idx = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[idx]; + struct page *pg = sg_page(sg); + + get_page(pg); + frag->page = pg; + frag->page_offset = offset + sg->offset; + frag->size = min(sg->length, datalen); + + offset = 0; + skb_shinfo(skb)->nr_frags++; + datalen -= frag->size; + sg = sg_next(sg); + } + } + + if (skb_shinfo(skb)->nr_frags) { + if (padlen) { + int idx = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[idx]; + frag->page = pad_page; + frag->page_offset = 0; + frag->size = padlen; + skb_shinfo(skb)->nr_frags++; + } + datalen = data_seg->total_size + padlen; + skb->data_len += datalen; + skb->truesize += datalen; + skb->len += datalen; + } + 
+send_pdu:
+	err = cxgb3i_tcp_sendskb(tcp_conn->sock->sk, skb,
+				 MSG_DONTWAIT | MSG_NOSIGNAL);
+	if (err > 0) {
+		int pdulen = hdrlen + datalen + padlen;
+		if (conn->hdrdgst_en)
+			pdulen += ISCSI_DIGEST_SIZE;
+		if (datalen && conn->datadgst_en)
+			pdulen += ISCSI_DIGEST_SIZE;
+
+		hdr_seg->total_copied = hdr_seg->total_size;
+		if (datalen)
+			data_seg->total_copied = data_seg->total_size;
+		conn->txdata_octets += pdulen;
+		return pdulen;
+	}
+
+free_skb:	/* NOTE(review): also the goto target for the page allocation
+		 * failure earlier in this function; confirm err holds a
+		 * defined value on that path. */
+	kfree_skb(skb);
+	if (err != -EAGAIN) {
+		cxgb3i_log_error("conn 0x%p, xmit err %d.\n", conn, err);
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+		return err;
+	}
+	return -EAGAIN;
+}
+/**
+ * cxgb3i_ulp2_init - allocate the module-wide zeroed pad page
+ *
+ * Allocates one page into pad_page, zero-fills it, then calls
+ * cxgb3i_ddp_page_init().  Returns 0 on success or -ENOMEM if the page
+ * cannot be allocated.
+ */
+int cxgb3i_ulp2_init(void)
+{
+	pad_page = alloc_page(GFP_KERNEL);
+	if (!pad_page)
+		return -ENOMEM;
+	memset(page_address(pad_page), 0, PAGE_SIZE);
+	cxgb3i_ddp_page_init();
+	return 0;
+}
+/**
+ * cxgb3i_ulp2_cleanup - release the page taken by cxgb3i_ulp2_init()
+ *
+ * Does nothing when pad_page is NULL; otherwise frees the page and clears
+ * the pointer.
+ */
+void cxgb3i_ulp2_cleanup(void)
+{
+	if (pad_page) {
+		__free_page(pad_page);
+		pad_page = NULL;
+	}
+}
+/**
+ * cxgb3i_sk_data_ready - sk_data_ready callback installed on the socket
+ * @sk:   socket whose sk_user_data points at the owning iscsi_conn
+ * @flag: only used in the debug trace
+ *
+ * Returns immediately if the connection has rx suspended.  Otherwise every
+ * skb on sk_receive_queue is unlinked, handed to cxgb3i_conn_read_pdu_skb()
+ * and freed, stopping at the first error.  The skb_ulp_pdulen() total of the
+ * skbs taken off the queue is then added to copied_seq, reported through
+ * cxgb3i_tcp_cleanup_rbuf() and added to conn->rxdata_octets.  A read error
+ * fails the connection with ISCSI_ERR_CONN_FAILED.
+ *
+ * NOTE(review): sk_user_data is dereferenced before sk_callback_lock is
+ * taken, while cxgb3i_sk_restore_callbacks() clears it under the write
+ * lock -- confirm the two cannot race.
+ */
+static void cxgb3i_sk_data_ready(struct sock *sk, int flag)
+{
+	struct sk_buff *skb;
+	unsigned int read = 0;	/* pdu bytes consumed in this call */
+	struct iscsi_conn *conn = sk->sk_user_data;
+	int err = 0;
+
+	if (unlikely(conn->suspend_rx)) {
+		cxgb3i_log_debug("conn %d Rx suspended!\n", conn->id);
+		return;
+	}
+	cxgb3i_log_debug("sk 0x%p, flag %d\n", sk, flag);
+
+	read_lock(&sk->sk_callback_lock);
+	skb = skb_peek(&sk->sk_receive_queue);
+	while (!err && skb) {
+		__skb_unlink(skb, &sk->sk_receive_queue);
+		read += skb_ulp_pdulen(skb);
+		err = cxgb3i_conn_read_pdu_skb(conn, skb);
+		__kfree_skb(skb);
+		skb = skb_peek(&sk->sk_receive_queue);
+	}
+	read_unlock(&sk->sk_callback_lock);
+	if (tcp_sk(sk)) {
+		tcp_sk(sk)->copied_seq += read;
+		cxgb3i_tcp_cleanup_rbuf(sk, read);
+	}
+	conn->rxdata_octets += read;
+
+	if (err) {
+		cxgb3i_log_info("conn 0x%p rx failed err %d.\n", conn, err);
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+	}
+}
+/**
+ * cxgb3i_sk_write_space - sk_write_space callback installed on the socket
+ * @sk: socket whose sk_user_data points at the owning iscsi_conn
+ *
+ * Chains to the callback saved in tcp_conn->old_write_space, then queues
+ * conn->xmitwork on the session's host.
+ */
+static void cxgb3i_sk_write_space(struct sock *sk)
+{
+	struct iscsi_conn *conn = (struct iscsi_conn *)sk->sk_user_data;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+
+	tcp_conn->old_write_space(sk);
+	cxgb3i_log_debug("sk 0x%p, cid %d.\n", sk, conn->id);
+	scsi_queue_work(conn->session->host, &conn->xmitwork);
+}
+/**
+ * cxgb3i_sk_state_change - sk_state_change callback installed on the socket
+ * @sk: socket whose sk_user_data points at the owning iscsi_conn
+ *
+ * With sk_callback_lock read-held: if the socket is in TCP_CLOSE_WAIT or
+ * TCP_CLOSE and sk_rmem_alloc is zero, the connection is failed with
+ * ISCSI_ERR_CONN_FAILED.  The saved old_state_change callback is fetched
+ * under the lock and invoked after the lock has been dropped.
+ *
+ * The local 'session' is assigned but not otherwise used here.
+ */
+static void cxgb3i_sk_state_change(struct sock *sk)
+{
+	struct iscsi_tcp_conn *tcp_conn;
+	struct iscsi_conn *conn;
+	struct iscsi_session *session;
+	void (*old_state_change) (struct sock *);
+
+	cxgb3i_log_debug("sk 0x%p, state %d\n", sk, sk->sk_state);
+
+	read_lock(&sk->sk_callback_lock);
+	conn = (struct iscsi_conn *)sk->sk_user_data;
+	session = conn->session;
+	if ((sk->sk_state == TCP_CLOSE_WAIT || sk->sk_state == TCP_CLOSE) &&
+	    !atomic_read(&sk->sk_rmem_alloc))
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+	tcp_conn = conn->dd_data;
+	old_state_change = tcp_conn->old_state_change;
+	read_unlock(&sk->sk_callback_lock);
+
+	old_state_change(sk);
+}
+/**
+ * cxgb3i_sk_set_callbacks - hook the cxgb3i callbacks into a socket
+ * @sk:   socket to hook
+ * @conn: connection stored in sk->sk_user_data
+ *
+ * Inside a write_lock_bh() section on sk_callback_lock, saves the socket's
+ * current data_ready/state_change/write_space callbacks in the connection's
+ * iscsi_tcp_conn and replaces them with the cxgb3i versions.
+ */
+void cxgb3i_sk_set_callbacks(struct sock *sk, struct iscsi_conn *conn)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_user_data = conn;
+	tcp_conn->old_data_ready = sk->sk_data_ready;
+	tcp_conn->old_state_change = sk->sk_state_change;
+	tcp_conn->old_write_space = sk->sk_write_space;
+	sk->sk_data_ready = cxgb3i_sk_data_ready;
+	sk->sk_state_change = cxgb3i_sk_state_change;
+	sk->sk_write_space = cxgb3i_sk_write_space;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+/**
+ * cxgb3i_sk_restore_callbacks - undo cxgb3i_sk_set_callbacks()
+ * @sk:   socket to unhook
+ * @conn: connection whose iscsi_tcp_conn holds the saved callbacks
+ *
+ * Inside a write_lock_bh() section on sk_callback_lock, clears sk_user_data,
+ * puts the saved callbacks back and resets sk_no_check to 0.
+ */
+void cxgb3i_sk_restore_callbacks(struct sock *sk, struct iscsi_conn *conn)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_user_data = NULL;
+	sk->sk_data_ready = tcp_conn->old_data_ready;
+	sk->sk_state_change = tcp_conn->old_state_change;
+	sk->sk_write_space = tcp_conn->old_write_space;
+	sk->sk_no_check = 0;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/**
+ * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc.
+ * The allocated memory is cleared.
+ */ +static void *cxgb3i_alloc_big_mem(unsigned long size) +{ + void *p = kmalloc(size, GFP_KERNEL); + if (!p) + p = vmalloc(size); + memset(p, 0, size); + return p; +} + +/** + * Free memory allocated through cxgb3i_alloc_big_mem(). + */ +static void cxgb3i_free_big_mem(void *addr) +{ + unsigned long p = (unsigned long)addr; + if (p >= VMALLOC_START && p < VMALLOC_END) + vfree(addr); + else + kfree(addr); +} + +int cxgb3i_adapter_ulp_init(struct cxgb3i_adapter *snic) +{ + struct t3cdev *tdev = snic->tdev; + struct cxgb3i_ddp_info *ddp = &snic->ddp; + struct ulp_iscsi_info uinfo; + unsigned int ppmax, bits, max_bits; + int i, err; + + spin_lock_init(&ddp->map_lock); + + err = tdev->ctl(tdev, ULP_ISCSI_GET_PARAMS, &uinfo); + if (err < 0) { + cxgb3i_log_error("%s, failed to get iscsi param err=%d.\n", + tdev->name, err); + return err; + } + + ppmax = (uinfo.ulimit - uinfo.llimit + 1) >> PPOD_SIZE_SHIFT; + max_bits = min(PPOD_IDX_MAX_SIZE, + (32 - sw_tag_idx_bits - sw_tag_age_bits)); + bits = __ilog2_u32(ppmax) + 1; + if (bits > max_bits) + bits = max_bits; + ppmax = (1 << bits) - 1; + + snic->tx_max_size = uinfo.max_txsz; + snic->rx_max_size = uinfo.max_rxsz; + snic->tag_format.idx_bits = sw_tag_idx_bits; + snic->tag_format.age_bits = sw_tag_age_bits; + snic->tag_format.rsvd_bits = bits; + snic->tag_format.rsvd_shift = PPOD_IDX_SHIFT; + snic->tag_format.rsvd_mask = (1 << snic->tag_format.rsvd_bits) - 1; + + cxgb3i_log_debug("snic nppods %u, rsvd shift %u, bits %u, mask 0x%x.\n", + ppmax, snic->tag_format.rsvd_shift, + snic->tag_format.rsvd_bits, + snic->tag_format.rsvd_mask); + + ddp->map = cxgb3i_alloc_big_mem(ppmax); + if (!ddp->map) { + cxgb3i_log_warn("snic unable to alloc ddp ppod 0x%u, " + "ddp disabled.\n", ppmax); + return 0; + } + ddp->llimit = uinfo.llimit; + ddp->ulimit = uinfo.ulimit; + + uinfo.tagmask = + snic->tag_format.rsvd_mask << snic->tag_format.rsvd_shift; + for (i = 0; i < ULP2_PGIDX_MAX; i++) + uinfo.pgsz_factor[i] = ddp_page_order[i]; + + err 
= tdev->ctl(tdev, ULP_ISCSI_SET_PARAMS, &uinfo); + if (err < 0) { + cxgb3i_log_warn("snic unable to set iscsi param err=%d, " + "ddp disabled.\n", err); + goto free_ppod_map; + } + + ddp->nppods = ppmax; + ddp->idx_last = ppmax; + + tdev->ulp_iscsi = ddp; + + return 0; + +free_ppod_map: + cxgb3i_free_big_mem(ddp->map); + return 0; +} + +void cxgb3i_adapter_ulp_cleanup(struct cxgb3i_adapter *snic) +{ + u8 *map = snic->ddp.map; + if (map) { + snic->tdev->ulp_iscsi = NULL; + spin_lock(&snic->lock); + snic->ddp.map = NULL; + spin_unlock(&snic->lock); + cxgb3i_free_big_mem(map); + } +} diff --git a/drivers/scsi/cxgb3i/cxgb3i_ulp2.h b/drivers/scsi/cxgb3i/cxgb3i_ulp2.h new file mode 100644 index 0000000..b38df09 --- /dev/null +++ b/drivers/scsi/cxgb3i/cxgb3i_ulp2.h @@ -0,0 +1,102 @@ +/* + * cxgb3i_ulp2.h: Chelsio S3xx iSCSI driver. + * + * Copyright (c) 2008 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ *
+ * Written by: Karen Xie (kxie@xxxxxxxxxxx)
+ */
+
+#ifndef __CXGB3I_ULP2_H__
+#define __CXGB3I_ULP2_H__
+
+#define PPOD_PAGES_MAX		4
+#define PPOD_PAGES_SHIFT	2	/* 4 pages per pod */
+/* Leading 24 bytes of a pagepod (four u32 words plus one reserved u64). */
+struct pagepod_hdr {
+	u32 vld_tid;		/* presumably F_PPOD_VALID | V_PPOD_TID() */
+	u32 pgsz_tag_clr;	/* presumably V_PPOD_PGSZ/TAG/COLOR() packed */
+	u32 maxoffset;
+	u32 pgoffset;
+	u64 rsvd;
+};
+/* One pagepod: the header followed by PPOD_PAGES_MAX + 1 64-bit addresses. */
+struct pagepod {
+	struct pagepod_hdr hdr;
+	u64 addr[PPOD_PAGES_MAX + 1];
+};
+/* PPOD_SIZE_SHIFT is log2 of the 64-byte pagepod size */
+#define PPOD_SIZE		sizeof(struct pagepod)	/* 64 */
+#define PPOD_SIZE_SHIFT		6
+/* the low PPOD_COLOR_SIZE bits hold the color; the pod index starts above */
+#define PPOD_COLOR_SHIFT	0
+#define PPOD_COLOR_SIZE		6
+#define PPOD_COLOR_MASK		((1 << PPOD_COLOR_SIZE) - 1)
+
+#define PPOD_IDX_SHIFT		PPOD_COLOR_SIZE
+#define PPOD_IDX_MAX_SIZE	24
+/* S_* = bit offset, M_* = unshifted field mask, V_*(x) = x moved into place */
+#define S_PPOD_TID		0
+#define M_PPOD_TID		0xFFFFFF
+#define V_PPOD_TID(x)		((x) << S_PPOD_TID)
+
+#define S_PPOD_VALID		24
+#define V_PPOD_VALID(x)		((x) << S_PPOD_VALID)
+#define F_PPOD_VALID		V_PPOD_VALID(1U)
+
+#define S_PPOD_COLOR		0
+#define M_PPOD_COLOR		0x3F
+#define V_PPOD_COLOR(x)		((x) << S_PPOD_COLOR)
+
+#define S_PPOD_TAG		6
+#define M_PPOD_TAG		0xFFFFFF
+#define V_PPOD_TAG(x)		((x) << S_PPOD_TAG)
+
+#define S_PPOD_PGSZ		30
+#define M_PPOD_PGSZ		0x3
+#define V_PPOD_PGSZ(x)		((x) << S_PPOD_PGSZ)
+/* NOTE(review): presumably the iscsi-header CPL layout without RSS header */
+struct cpl_iscsi_hdr_norss {
+	union opcode_tid ot;
+	u16 pdu_len_ddp;
+	u16 len;
+	u32 seq;
+	u16 urg;
+	u8 rsvd;
+	u8 status;
+};
+/* NOTE(review): presumably the rx-data-ddp CPL layout without RSS header */
+struct cpl_rx_data_ddp_norss {
+	union opcode_tid ot;
+	u16 urg;
+	u16 len;
+	u32 seq;
+	u32 nxt_seq;
+	u32 ulp_crc;
+	u32 ddp_status;	/* presumably tested with RX_DDP_STATUS_*_SHIFT */
+};
+/* bit positions of the individual ddp status conditions */
+#define RX_DDP_STATUS_IPP_SHIFT		27	/* invalid pagepod */
+#define RX_DDP_STATUS_TID_SHIFT		26	/* tid mismatch */
+#define RX_DDP_STATUS_COLOR_SHIFT	25	/* color mismatch */
+#define RX_DDP_STATUS_OFFSET_SHIFT	24	/* offset mismatch */
+#define RX_DDP_STATUS_ULIMIT_SHIFT	23	/* ulimit error */
+#define RX_DDP_STATUS_TAG_SHIFT		22	/* tag mismatch */
+#define RX_DDP_STATUS_DCRC_SHIFT	21	/* dcrc error */
+#define RX_DDP_STATUS_HCRC_SHIFT	20	/* hcrc error */
+#define RX_DDP_STATUS_PAD_SHIFT		19	/* pad error */
+#define RX_DDP_STATUS_PPP_SHIFT		18	/* pagepod parity error */
+#define RX_DDP_STATUS_LLIMIT_SHIFT	17	/* llimit error */
+#define RX_DDP_STATUS_DDP_SHIFT		16	/* ddp'able */
+#define RX_DDP_STATUS_PMM_SHIFT		15	/* pagepod mismatch */
+/* ULP2_FLAG_*: driver-side flag bits (data state and crc/pad errors) */
+#define ULP2_FLAG_DATA_READY		0x1
+#define ULP2_FLAG_DATA_DDPED		0x2
+#define ULP2_FLAG_HCRC_ERROR		0x10
+#define ULP2_FLAG_DCRC_ERROR		0x20
+#define ULP2_FLAG_PAD_ERROR		0x40
+
+#endif
diff --git a/security/security.c b/security/security.c
index 59838a9..bf27d33 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1055,6 +1055,7 @@ void security_inet_conn_established(struct sock *sk,
 {
 	security_ops->inet_conn_established(sk, skb);
 }
+EXPORT_SYMBOL(security_inet_conn_established);
 #endif	/* CONFIG_SECURITY_NETWORK */
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html