Hi All, In a research project, we've developed a crypto accelerator based on the Xilinx Virtex5 FPGA family which is connected to the PC through a PCI-Express slot and is used by IPSec to offload crypto processing from the CPU. The accelerator only provides the AES and DES3_EDE algorithms, and I am responsible for providing the driver for it. I based much of the driver work on geode_aes.c, which is located in the "drivers/crypto" subdir of the kernel source directory. Both algorithms are registered as blkcipher providing the cbc wrapper "cbc(aes)", just like the one registered in geode_aes. Now, after months of work, the accelerator is ready to work (correctness of the hardware operation is assured by direct crypto tests, not by IPSec) and it is time for the driver to provide IPSec access to the accelerator. On the first try I could get "ping" through the IPsec tunnel. One end of the IPSec tunnel is equipped with our accelerator and the other end is using kernel-native IPSec and the built-in AES and DES3_EDE algorithms. Now I am faced with 2 problems: 1. Ping stops getting replies with packet sizes greater than 1426 bytes (ping dest_ip -s 1427). I guessed that it might be an MTU problem, but reducing the MTU with "ifconfig eth1 mtu xxx" or "echo 1 > /proc/sys/net/ipv4/ip_no_pmtu_disc" does not solve the problem. Also, when I ping each of the tunnel ends from the other end simultaneously with "ping other_node_ip -i 0.001", the kernel hangs completely. 2. Iperf problem. When I try to measure the throughput of the IPSec gateway equipped with our accelerator (AES-MD5), using iperf in tcp mode, the kernel hangs such that sometimes even the "Magic SysRq key" does not respond! So I could not trace the problem at all. Using iperf in udp mode works, but I get "UDP bad checksum" in the 'dmesg' output of the other end of the tunnel (native IPSec and built-in kernel algorithms). The two gateways are connected by a cross cable and no router/switch is located between them that could cause MTU problems.
In my test pcrypt is not used by now and booting the kernel with nosmp (so no fear of thread contention) does not change the situation. So I request you to help me solve the problem. I bring some parts of driver that is changed from geode_aes.c and might give useful information. If it is required, I'll post all driver text. ------------------------------ ---------------------------- static struct crypto_alg mydriver_cbc_alg = {    Â.cra_name        =    "cbc(aes)",    Â.cra_driver_name    Â=    "cbc-aes-mydriver",    Â.cra_priority      =    400,    Â.cra_flags           Â=    CRYPTO_ALG_TYPE_BLKCIPHER | CRYPTO_ALG_NEED_FALLBACK,    Â.cra_init            =    fallback_init_blk,    Â.cra_exit            =    fallback_exit_blk,    Â.cra_blocksize     Â=    AES_MIN_BLOCK_SIZE,    Â.cra_ctxsize      Â=    sizeof(struct mydriver_aes_op),    Â.cra_alignmask     Â=    15,    Â.cra_type            =    &crypto_blkcipher_type,    Â.cra_module           =    THIS_MODULE,    Â.cra_list            = LIST_HEAD_INIT(mydriver_cbc_alg.cra_list),    Â.cra_u             Â=    {        Â.blkcipher   Â=    {            Â.min_keysize  Â=    AES_MIN_KEY_SIZE,            Â.max_keysize  Â=    AES_MIN_KEY_SIZE,            Â.setkey         =    mydriver_setkey_blk,            Â.encrypt        Â=    mydriver_cbc_encrypt,            Â.decrypt        Â=    mydriver_cbc_decrypt,            Â.ivsize         =    AES_IV_LENGTH,        Â}    Â} }; //--------------- static int mydriver_cbc_encrypt(struct blkcipher_desc *desc,         Âstruct scatterlist *dst, struct scatterlist *src,         Âunsigned int nbytes) {    Âstruct mydriver_aes_op *op = crypto_blkcipher_ctx(desc->tfm);    Âstruct blkcipher_walk walk;    Âint err, ret;    Âif (unlikely(op->keylen != AES_KEYSIZE_128))        Âreturn fallback_blk_enc(desc, dst, src, nbytes);    Âblkcipher_walk_init(&walk, dst, src, nbytes);    Âerr = blkcipher_walk_virt(desc, &walk);    Âop->iv = walk.iv;    Âwhile((nbytes = walk.nbytes)) {        Âop->src 
= walk.src.virt.addr,        Âop->dst = walk.dst.virt.addr;        Âop->mode = AES_MODE_CBC;        Âop->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);        Âop->dir = AES_DIR_ENCRYPT;            Â//ret = mydriver_aes_crypt(op);        Âret = mydriver_transform(op, 0);        Ânbytes -= ret;        Âerr = blkcipher_walk_done(desc, &walk, nbytes);    Â}    Âreturn err; } /*--------- mydriver_transform which makes a buffer containing key, iv, data with some additional header that is required by our accelerator, writes the buffer to accelerator by DMA and then reads response from hardware.*/ static inline int mydriver_transform(struct mydriver_aes_op *op, int alg) {        Âint Âreq_len, err;        Âu8 *req_buf = NULL, *res_buf = NULL;        Âalg_operation operation;        Âu32 my_req_id;        Âif (op->len == 0)            Âreturn 0;        Âif ((op->dir == AES_DIR_ENCRYPT) ||(op->dir == DES3_DIR_ENCRYPT)){             operation = SH_ENCRYPT;             my_req_id = smp_processor_id();// This ID is put into our packet and is checked by each thread when the hardware response is ready to see if the packet is its?        Â}        Âelse {            Âoperation = SH_DECRYPT;            Âmy_req_id = smp_processor_id() + 64; //uniqueness of ID does not solve problem described in mail :( .        
Â}        Âerr = create_request(alg, op->mode, operation, htonl(my_req_id), op->key, op->iv, op->src, op->len, &req_buf, &req_len);        Âif (err){            Âprintk(KERN_EMERG"mydriver_transform : Error CreateReuest : errcode = %d\n", err);            Â//goto error;        Â}        Âerr = write_request(req_buf, req_len);        Âif (err){            Âprintk(KERN_EMERG"mydriver_transform : Error WriteReuest : errcode = %d\n", err);            Â//goto error;        Â}        Âkfree(req_buf);        Âreq_buf = NULL;        Âerr = read_response(&res_buf, /*local_hdr.Length*/my_req_id);        Âmemcpy(op->dst, (res_buf + sizeof(struct response_hdr)), op->len);        Âkfree(res_buf);        Âres_buf = NULL;        Âreturn op->len; } //----------- /* create_request wich builds packet for mydriver_transform */ static inline int create_request(int alg, char mode, char enc_dec, u32 request_id,          char *key, char *iv, char *data, int datalen,          u8 **outbuf, int *outlen) {    Âint req_len, n_padding, keylen, blocklen, algid;    Âstruct request_hdr *p_hdr;    Âchar *ptr;    Âif (alg == 0){ //AES Algorithm        Âkeylen = 16;        Âblocklen = 16;        Âalgid = 4;    Â} else if (alg == 1){ //DES3 Algorithm        Âkeylen = 24;        Âblocklen = 8;        Âalgid = 3;    Â}    Âreq_len = sizeof(struct request_hdr) + keylen;    Âif (keylen != 0 && keylen % 16 == 0)        Âreq_len += 8; //For request packet to be 128bit aligned    Âif (mode == SHAMS_CBC)        Âreq_len += blocklen; // for IV len    Ân_padding = (blocklen - (datalen % blocklen)) % blocklen; //padding data to be multiple of 128 bits.    
Âreq_len += (n_padding + datalen);    Â*outbuf = kmalloc(req_len, GFP_ATOMIC);    Âp_hdr = (struct request_hdr *) *outbuf;    Â*outlen = p_hdr->Length = req_len;    Âp_hdr->request_id = request_id;    Âp_hdr->AlgID_Mode_EncDec = (enc_dec << 15) | (mode << 12) | algid;    Â// Filling key    Âptr = *outbuf + sizeof(struct request_hdr);    Âmemcpy(ptr, key, keylen);    Âptr += keylen;    Âif (keylen != 0 && keylen % 16 == 0){        Âmemset(ptr, 0, 8);        Âptr += 8;    Â}    Â// Filling IV    Âif (mode == SHAMS_CBC){        Âmemcpy(ptr, iv, blocklen);        Âptr += blocklen;    Â}    Â// Copy data    Âmemcpy(ptr, data, datalen);    Âptr += datalen;    Â// Zeroing padd bits    Âmemset(ptr, 0, n_padding);    Âreturn 0; } //-------------------------------- /* write_request that writes the provided buffer to device */ static inline int write_request(u8 *buff, unsigned int count) { unsigned long Âiflags; u32 tlp_count, tlp_size; dma_addr_t dma_addr; struct x5pcie_dma_desc *desc_table = (struct x5pcie_dma_desc *)global_bar[0]; /** DMA operations:*/    Âdma_addr = pci_map_single(global_dev, buff, count, PCI_DMA_TODEVICE);    Âif (0 == dma_addr) {        Âprintk(KERN_EMERG"XPCIe_Read: Map error.\n");        Âreturn -1;    Â} // Do DMA transfer here....    
Âcount = count /4;//    Âfor (tlp_size = 32; tlp_size > 0; tlp_size--)        Âif ((count % tlp_size) == 0){            Âtlp_count = count / tlp_size;            Âbreak;        Â}    Âtlp_size = tlp_count | (tlp_size << 16);    Âspin_lock_irqsave(&wlock, iflags);    Â//down(&my_sem); //   Âif (down_interruptible(&my_sem)){ //       Âprintk(KERN_EMERG "\nwrite_request: Error Acquire Semaphore!!"); //       Âreturn -ERESTARTSYS; //   Â}    Âwritel(cpu_to_le32(tlp_size),&desc_table->rdmatlpc);       // read DMA TLP count: ÂTLPs to transfer    Âwritel(cpu_to_le32(dma_addr),&desc_table->rdmatlpa); Â// physical bus address of DMA able buffer    Âwmb();    Âwritew(cpu_to_le16(0x0001),(global_bar[0]+6));        Â// read dma start bit[16] to ddmacr    Âwmb();    Âwhile(readw((global_bar[0]+6)) != 0x0101);    Âspin_unlock_irqrestore(&wlock, iflags);    Â//up(&my_sem);   Â // Unmap the DMA buffer so it is safe for normal access again.    Âpci_unmap_single(global_dev, dma_addr, count, PCI_DMA_TODEVICE);    Â/** End of dma section*/    Âreturn 0; } //-------------- /* read_response that reads the en/decrypted buffer from device */ static inline int read_response(u8 **buff, Âu16 my_req_id) {    Âdma_addr_t dma_addr;    Âu16 count, tmp_req_id;    Âunsigned long Âiflags1;//, iflags2;    Âu32 tlp_count, tlp_size;    Âstruct x5pcie_dma_desc *desc_table = (struct x5pcie_dma_desc *)global_bar[0];    Âfor(;;){        Âspin_lock_irqsave(&alock, iflags1);        Âtmp_req_id = readw((global_bar[0] + 82 + (fifo_entry * 4)));        Âspin_unlock_irqrestore(&alock, iflags1);        Âif(my_req_id == tmp_req_id) // Is the provided packet mine?            Âbreak;    Â}            Âcount = readw(global_bar[0] + 80 + (fifo_entry * 4));//What is the size of my packet?            
Âprintk(KERN_EMERG "read_response : my_req_id = %d has count = %d\n", my_req_id, count);            Â*buff = kmalloc(count, GFP_ATOMIC);            Âdma_addr = pci_map_single(global_dev, *buff, count, PCI_DMA_FROMDEVICE);            Âif (0 == dma_addr){                Âprintk(KERN_EMERG"XPCIe_Read: Map error.\n");                Âreturn -1;            Â}            Âcount = count /4;//            Âfor (tlp_size = 32; tlp_size > 0; tlp_size--)                Âif ((count % tlp_size) == 0){                    Âtlp_count = count / tlp_size;                    Âbreak;                Â}            Âtlp_size = tlp_count | (tlp_size << 16);    Â//       Âdown(&my_sem); //           Âif (down_interruptible(&my_sem)){ //               Âprintk(KERN_EMERG "\nread_response: Error Acquire Semaphore!!"); //               Âreturn -ERESTARTSYS; //           Â}            Âwritel(cpu_to_le32(tlp_size),&desc_table->wdmatlpc);    Â// read DMA TLP count: ÂTLPs to transfer            Âwritel(cpu_to_le32(dma_addr),&desc_table->wdmatlpa); Â// physical bus address of DMA able buffer            Âwmb();            Âwritew(cpu_to_le16(0x0001),(global_bar[0]+4));   // read dma start bit[16] to ddmacr            Âwmb();            Âwhile(readw(global_bar[0]+4) != 0x0101);            Âfifo_entry = (fifo_entry + 1) % 9; // 9 : Number of registers holding request_id and len of FiFo's elements .            Â//spin_unlock_irqrestore(&rlock, iflags2);            Â//up(&my_sem);            Âpci_unmap_single(global_dev, dma_addr, count, PCI_DMA_FROMDEVICE);            Âreturn count; } Thanks in advance, Hamid. ÿô.nÇ·®+%˱é¥wÿº{.nÇ·¥{±ý{ayºÊÚë¢f£¢·hïÿê_è(éÝj"ú§ÿÿ¾«þG«é¸?¨è&£ø