crypto accelerator driver problems

Hi All,

In a research project, we've developed a crypto accelerator based on the Xilinx
Virtex-5 FPGA family. It is connected to the PC through a PCI-Express slot and is
used by IPsec to offload crypto processing from the CPU. The accelerator only
provides the AES and DES3_EDE algorithms, and I am responsible for its driver. I
based much of the driver work on geode_aes.c, which is located in the
"drivers/crypto" subdirectory of the kernel source tree. Both algorithms are
registered as blkciphers providing a CBC wrapper ("cbc(aes)"), just like the one
registered in geode_aes. Now, after months of work, the accelerator is ready
(correctness of the hardware operation has been verified by direct crypto tests,
not through IPsec), and it is time for the driver to give IPsec access to the
accelerator. On the first try I could get "ping" through the IPsec tunnel. One
end of the tunnel is equipped with our accelerator; the other end uses the
kernel's native IPsec with the built-in AES and DES3_EDE algorithms. Now I am
faced with two problems:

1. Ping stops getting replies for packet sizes greater than 1426 bytes
(ping dest_ip -s 1427). I guessed it might be an MTU problem, but reducing the
MTU with "ifconfig eth1 mtu xxx" or
"echo 1 > /proc/sys/net/ipv4/ip_no_pmtu_disc"
does not solve the problem. Also, when I ping each tunnel end from the other end
simultaneously with "ping other_node_ip -i 0.001", the kernel hangs completely.

2. Iperf problem. When I try to measure the throughput of the IPsec gateway
equipped with our accelerator (AES-MD5) using iperf in TCP mode, the kernel
hangs, such that sometimes even the Magic SysRq key does not respond, so I could
not trace the problem that way. Using iperf in UDP mode works, but I get "UDP bad
checksum" in the 'dmesg' output at the other end of the tunnel (native IPsec with
the built-in kernel algorithms).

The two gateways are connected by a crossover cable, with no router/switch
between them to cause MTU problems. pcrypt is not used in my tests for now, and
booting the kernel with nosmp (so there is no fear of thread contention) does not
change the situation.

So I am asking for your help in solving the problem. Below are the parts of the
driver that differ from geode_aes.c and might give useful information. If
required, I'll post the full driver source.
------------------------------------------------------------

static struct crypto_alg mydriver_cbc_alg = {
	.cra_name		=	"cbc(aes)",
	.cra_driver_name	=	"cbc-aes-mydriver",
	.cra_priority		=	400,
	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER |
					CRYPTO_ALG_NEED_FALLBACK,
	.cra_init		=	fallback_init_blk,
	.cra_exit		=	fallback_exit_blk,
	.cra_blocksize		=	AES_MIN_BLOCK_SIZE,
	.cra_ctxsize		=	sizeof(struct mydriver_aes_op),
	.cra_alignmask		=	15,
	.cra_type		=	&crypto_blkcipher_type,
	.cra_module		=	THIS_MODULE,
	.cra_list		=	LIST_HEAD_INIT(mydriver_cbc_alg.cra_list),
	.cra_u			=	{
		.blkcipher	=	{
			.min_keysize	=	AES_MIN_KEY_SIZE,
			.max_keysize	=	AES_MIN_KEY_SIZE,
			.setkey		=	mydriver_setkey_blk,
			.encrypt	=	mydriver_cbc_encrypt,
			.decrypt	=	mydriver_cbc_decrypt,
			.ivsize		=	AES_IV_LENGTH,
		}
	}
};
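
The fallback_init_blk()/fallback_exit_blk() callbacks are not included in this
excerpt; here is a minimal sketch of what they look like, assuming they mirror
geode_aes.c as the rest of the driver does (the op->fallback.blk field name is
taken from geode_aes_op and is an assumption here):

static int fallback_init_blk(struct crypto_tfm *tfm)
{
	const char *name = crypto_tfm_alg_name(tfm);
	struct mydriver_aes_op *op = crypto_tfm_ctx(tfm);

	/* Allocate a software cbc(aes) to fall back to for key sizes
	 * the hardware cannot handle. */
	op->fallback.blk = crypto_alloc_blkcipher(name, 0,
			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
	if (IS_ERR(op->fallback.blk)) {
		printk(KERN_ERR "Error allocating fallback algo %s\n", name);
		return PTR_ERR(op->fallback.blk);
	}

	return 0;
}

static void fallback_exit_blk(struct crypto_tfm *tfm)
{
	struct mydriver_aes_op *op = crypto_tfm_ctx(tfm);

	crypto_free_blkcipher(op->fallback.blk);
	op->fallback.blk = NULL;
}
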
//---------------
static int
mydriver_cbc_encrypt(struct blkcipher_desc *desc,
		struct scatterlist *dst, struct scatterlist *src,
		unsigned int nbytes)
{
	struct mydriver_aes_op *op = crypto_blkcipher_ctx(desc->tfm);
	struct blkcipher_walk walk;
	int err, ret;

	if (unlikely(op->keylen != AES_KEYSIZE_128))
		return fallback_blk_enc(desc, dst, src, nbytes);

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);
	op->iv = walk.iv;

	while ((nbytes = walk.nbytes)) {
		op->src = walk.src.virt.addr;
		op->dst = walk.dst.virt.addr;
		op->mode = AES_MODE_CBC;
		op->len = nbytes - (nbytes % AES_MIN_BLOCK_SIZE);
		op->dir = AES_DIR_ENCRYPT;
		/* ret = mydriver_aes_crypt(op); */
		ret = mydriver_transform(op, 0);
		nbytes -= ret;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}

	return err;
}
/*
 * mydriver_transform builds a buffer containing the key, IV and data plus an
 * additional header required by our accelerator, writes that buffer to the
 * accelerator by DMA, and then reads the response back from the hardware.
 */

static inline int mydriver_transform(struct mydriver_aes_op *op, int alg)
{
	int req_len, err;
	u8 *req_buf = NULL, *res_buf = NULL;
	alg_operation operation;
	u32 my_req_id;

	if (op->len == 0)
		return 0;

	if ((op->dir == AES_DIR_ENCRYPT) || (op->dir == DES3_DIR_ENCRYPT)) {
		operation = SH_ENCRYPT;
		/* This ID is put into our packet and is checked by each
		 * thread when the hardware response is ready, to see
		 * whether the packet is its own. */
		my_req_id = smp_processor_id();
	} else {
		operation = SH_DECRYPT;
		/* Making the ID unique does not solve the problem
		 * described in this mail :( */
		my_req_id = smp_processor_id() + 64;
	}

	err = create_request(alg, op->mode, operation, htonl(my_req_id),
			op->key, op->iv, op->src, op->len,
			&req_buf, &req_len);
	if (err) {
		printk(KERN_EMERG "mydriver_transform: Error CreateRequest: errcode = %d\n",
				err);
		/* goto error; */
	}

	err = write_request(req_buf, req_len);
	if (err) {
		printk(KERN_EMERG "mydriver_transform: Error WriteRequest: errcode = %d\n",
				err);
		/* goto error; */
	}
	kfree(req_buf);
	req_buf = NULL;

	err = read_response(&res_buf, /*local_hdr.Length*/my_req_id);

	memcpy(op->dst, res_buf + sizeof(struct response_hdr), op->len);

	kfree(res_buf);
	res_buf = NULL;
	return op->len;
}
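
struct mydriver_aes_op itself is not shown above; here is a guess at its layout,
modeled on struct geode_aes_op and on the field accesses in the two functions
above (the field names, types and ordering are assumptions, not the real
definition):

struct mydriver_aes_op {
	void *src;			/* source buffer from the blkcipher walk */
	void *dst;			/* destination buffer from the walk */
	u8 *iv;				/* points at walk.iv */
	u8 key[AES_KEYSIZE_128];	/* key cached by setkey */
	u32 keylen;
	u32 mode;			/* e.g. AES_MODE_CBC */
	u32 dir;			/* AES_DIR_ENCRYPT / AES_DIR_DECRYPT */
	int len;			/* bytes to process in this step */
	union {
		struct crypto_blkcipher *blk;	/* software fallback */
	} fallback;
};
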
//-----------
/* create_request, which builds the request packet for mydriver_transform */
static inline int create_request(int alg, char mode, char enc_dec,
		u32 request_id, char *key, char *iv, char *data, int datalen,
		u8 **outbuf, int *outlen)
{
	int req_len, n_padding, keylen, blocklen, algid;
	struct request_hdr *p_hdr;
	char *ptr;

	if (alg == 0) {			/* AES algorithm */
		keylen = 16;
		blocklen = 16;
		algid = 4;
	} else if (alg == 1) {		/* DES3 algorithm */
		keylen = 24;
		blocklen = 8;
		algid = 3;
	}

	req_len = sizeof(struct request_hdr) + keylen;
	if (keylen != 0 && keylen % 16 == 0)
		req_len += 8;		/* so the request packet is 128-bit aligned */
	if (mode == SHAMS_CBC)
		req_len += blocklen;	/* for the IV */

	/* Pad the data to a multiple of the cipher block size. */
	n_padding = (blocklen - (datalen % blocklen)) % blocklen;

	req_len += (n_padding + datalen);
	*outbuf = kmalloc(req_len, GFP_ATOMIC);
	p_hdr = (struct request_hdr *) *outbuf;
	*outlen = p_hdr->Length = req_len;

	p_hdr->request_id = request_id;
	p_hdr->AlgID_Mode_EncDec = (enc_dec << 15) | (mode << 12) | algid;

	/* Fill in the key. */
	ptr = *outbuf + sizeof(struct request_hdr);
	memcpy(ptr, key, keylen);
	ptr += keylen;
	if (keylen != 0 && keylen % 16 == 0) {
		memset(ptr, 0, 8);
		ptr += 8;
	}
	/* Fill in the IV. */
	if (mode == SHAMS_CBC) {
		memcpy(ptr, iv, blocklen);
		ptr += blocklen;
	}
	/* Copy the data. */
	memcpy(ptr, data, datalen);
	ptr += datalen;
	/* Zero the padding bytes. */
	memset(ptr, 0, n_padding);

	return 0;
}
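
struct request_hdr is not included either; judging only from the accesses in
create_request() above, it carries at least the following fields (the widths and
ordering here are guesses for illustration, the real layout is fixed by the
hardware):

struct request_hdr {
	u32 Length;		/* total request size in bytes */
	u32 request_id;		/* tag echoed back with the response */
	u32 AlgID_Mode_EncDec;	/* (enc_dec << 15) | (mode << 12) | algid */
} __attribute__((packed));
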
//--------------------------------
/* write_request, which writes the provided buffer to the device */

static inline int write_request(u8 *buff, unsigned int count)
{
	unsigned long iflags;
	u32 tlp_count, tlp_size;
	dma_addr_t dma_addr;
	struct x5pcie_dma_desc *desc_table =
			(struct x5pcie_dma_desc *)global_bar[0];

	/* DMA operations: */
	dma_addr = pci_map_single(global_dev, buff, count, PCI_DMA_TODEVICE);
	if (0 == dma_addr) {
		printk(KERN_EMERG "XPCIe_Read: Map error.\n");
		return -1;
	}

	/* Do the DMA transfer here... */
	count = count / 4;	/* bytes -> dwords */
	for (tlp_size = 32; tlp_size > 0; tlp_size--)
		if ((count % tlp_size) == 0) {
			tlp_count = count / tlp_size;
			break;
		}

	tlp_size = tlp_count | (tlp_size << 16);
	spin_lock_irqsave(&wlock, iflags);
	/* down(&my_sem); */
	/*
	 * if (down_interruptible(&my_sem)) {
	 *	printk(KERN_EMERG "\nwrite_request: Error Acquire Semaphore!!");
	 *	return -ERESTARTSYS;
	 * }
	 */
	/* read DMA TLP count: TLPs to transfer */
	writel(cpu_to_le32(tlp_size), &desc_table->rdmatlpc);
	/* physical bus address of the DMA-able buffer */
	writel(cpu_to_le32(dma_addr), &desc_table->rdmatlpa);
	wmb();
	/* read DMA start: bit[16] of ddmacr */
	writew(cpu_to_le16(0x0001), (global_bar[0] + 6));
	wmb();
	while (readw(global_bar[0] + 6) != 0x0101)
		;
	spin_unlock_irqrestore(&wlock, iflags);
	/* up(&my_sem); */

	/* Unmap the DMA buffer so it is safe for normal access again. */
	pci_unmap_single(global_dev, dma_addr, count, PCI_DMA_TODEVICE);

	/* End of DMA section */
	return 0;
}
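
The TLP bookkeeping in the middle of write_request() (and repeated in
read_response() below) may be easier to read pulled out as a helper; this is
only an illustrative rewrite of the same logic, not code from the driver:

/*
 * "dwords" is the buffer length in 32-bit words. The largest divisor <= 32
 * becomes the TLP payload size and the quotient the TLP count; the pair is
 * packed as count | (size << 16), the format the rdmatlpc/wdmatlpc registers
 * expect. Example: a 128-byte request -> 32 dwords -> 1 TLP of 32 dwords
 * -> 0x00200001.
 */
static u32 pack_tlp_desc(u32 dwords)
{
	u32 size;

	for (size = 32; size > 1; size--)
		if (dwords % size == 0)
			break;

	return (dwords / size) | (size << 16);
}
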
//--------------
/* read_response, which reads the en/decrypted buffer back from the device */

static inline int read_response(u8 **buff, u16 my_req_id)
{
	dma_addr_t dma_addr;
	u16 count, tmp_req_id;
	unsigned long iflags1;	/* , iflags2; */
	u32 tlp_count, tlp_size;
	struct x5pcie_dma_desc *desc_table =
			(struct x5pcie_dma_desc *)global_bar[0];

	for (;;) {
		spin_lock_irqsave(&alock, iflags1);
		tmp_req_id = readw(global_bar[0] + 82 + (fifo_entry * 4));
		spin_unlock_irqrestore(&alock, iflags1);
		/* Is the presented packet mine? */
		if (my_req_id == tmp_req_id)
			break;
	}

	/* What is the size of my packet? */
	count = readw(global_bar[0] + 80 + (fifo_entry * 4));
	printk(KERN_EMERG "read_response: my_req_id = %d has count = %d\n",
			my_req_id, count);

	*buff = kmalloc(count, GFP_ATOMIC);
	dma_addr = pci_map_single(global_dev, *buff, count, PCI_DMA_FROMDEVICE);
	if (0 == dma_addr) {
		printk(KERN_EMERG "XPCIe_Read: Map error.\n");
		return -1;
	}

	count = count / 4;	/* bytes -> dwords */
	for (tlp_size = 32; tlp_size > 0; tlp_size--)
		if ((count % tlp_size) == 0) {
			tlp_count = count / tlp_size;
			break;
		}

	tlp_size = tlp_count | (tlp_size << 16);
	/* down(&my_sem); */
	/*
	 * if (down_interruptible(&my_sem)) {
	 *	printk(KERN_EMERG "\nread_response: Error Acquire Semaphore!!");
	 *	return -ERESTARTSYS;
	 * }
	 */
	/* write DMA TLP count: TLPs to transfer */
	writel(cpu_to_le32(tlp_size), &desc_table->wdmatlpc);
	/* physical bus address of the DMA-able buffer */
	writel(cpu_to_le32(dma_addr), &desc_table->wdmatlpa);
	wmb();
	/* write DMA start: bit[16] of ddmacr */
	writew(cpu_to_le16(0x0001), (global_bar[0] + 4));
	wmb();
	while (readw(global_bar[0] + 4) != 0x0101)
		;

	/* 9: number of registers holding request_id and length of the
	 * FIFO's elements. */
	fifo_entry = (fifo_entry + 1) % 9;
	/* spin_unlock_irqrestore(&rlock, iflags2); */
	/* up(&my_sem); */
	pci_unmap_single(global_dev, dma_addr, count, PCI_DMA_FROMDEVICE);

	return count;
}


Thanks in advance,
Hamid.