Hey Heiko,

Thanks for the patch, it generally looks good. A few comments.

On Wed, Mar 29, 2023 at 7:08 AM Heiko Stuebner <heiko@xxxxxxxxx> wrote:
>
> From: Heiko Stuebner <heiko.stuebner@xxxxxxxx>
>
> With different sets of available extensions a number of different
> implementation variants are possible. Quite a number of them are already
> implemented in OpenSSL or are in the process of being implemented, so pick
> the relevant OpenSSL code and add suitable glue code similar to arm64 and
> powerpc to use it for kernel-specific cryptography.
>
> The prioritization of the algorithms follows the ifdef chain for the
> assembly callbacks done in OpenSSL, but here the algorithms get registered
> separately so that all of them can be part of the crypto selftests.
>
> The crypto subsystem will select the most performant of all registered
> algorithms on the running system, but will selftest all registered ones.
>
> As a first step this adds scalar variants using the Zbc, Zbb and
> possibly Zbkb (bitmanip crypto) extensions; the perl implementation
> stems from the OpenSSL pull request at
> https://github.com/openssl/openssl/pull/20078
>
> Co-developed-by: Christoph Müllner <christoph.muellner@xxxxxxxx>
> Signed-off-by: Christoph Müllner <christoph.muellner@xxxxxxxx>
> Signed-off-by: Heiko Stuebner <heiko.stuebner@xxxxxxxx>
> ---
>  arch/riscv/crypto/Kconfig              |  13 +
>  arch/riscv/crypto/Makefile             |  14 +
>  arch/riscv/crypto/ghash-riscv64-glue.c | 258 ++++++++++++++++
>  arch/riscv/crypto/ghash-riscv64-zbc.pl | 400 +++++++++++++++++++++++++
>  arch/riscv/crypto/riscv.pm             | 231 ++++++++++++++
>  5 files changed, 916 insertions(+)
>  create mode 100644 arch/riscv/crypto/ghash-riscv64-glue.c
>  create mode 100644 arch/riscv/crypto/ghash-riscv64-zbc.pl
>  create mode 100644 arch/riscv/crypto/riscv.pm
>
> diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
> index 10d60edc0110..cd2237923e68 100644
> --- a/arch/riscv/crypto/Kconfig
> +++ b/arch/riscv/crypto/Kconfig
> @@ -2,4 +2,17 @@
>
>  menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
>
> +config CRYPTO_GHASH_RISCV64
> +       tristate "Hash functions: GHASH"
> +       depends on 64BIT && RISCV_ISA_ZBC
> +       select CRYPTO_HASH
> +       select CRYPTO_LIB_GF128MUL
> +       help
> +         GCM GHASH function (NIST SP800-38D)
> +
> +         Architecture: riscv64 using one of:
> +         - Zbc extension
> +         - Zbc + Zbb extensions
> +         - Zbc + Zbkb extensions
> +
>  endmenu
> diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
> index b3b6332c9f6d..0a158919e9da 100644
> --- a/arch/riscv/crypto/Makefile
> +++ b/arch/riscv/crypto/Makefile
> @@ -2,3 +2,17 @@
>  #
>  # linux/arch/riscv/crypto/Makefile
>  #
> +
> +obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
> +ghash-riscv64-y := ghash-riscv64-glue.o
> +ifdef CONFIG_RISCV_ISA_ZBC
> +ghash-riscv64-y += ghash-riscv64-zbc.o
> +endif
> +
> +quiet_cmd_perlasm = PERLASM $@
> +      cmd_perlasm = $(PERL) $(<) void $(@)
> +
> +$(obj)/ghash-riscv64-zbc.S: $(src)/ghash-riscv64-zbc.pl
> +       $(call cmd,perlasm)
> +
> +clean-files += ghash-riscv64-zbc.S
> diff --git a/arch/riscv/crypto/ghash-riscv64-glue.c b/arch/riscv/crypto/ghash-riscv64-glue.c
> new file mode 100644
> index 000000000000..5ab704c49539
> --- /dev/null
> +++ b/arch/riscv/crypto/ghash-riscv64-glue.c
> @@ -0,0 +1,258 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * RISC-V optimized GHASH routines
> + *
> + * Copyright (C) 2023 VRULL GmbH
> + * Author: Heiko Stuebner <heiko.stuebner@xxxxxxxx>
> + */
> +
> +#include <linux/types.h>
> +#include <linux/err.h>
> +#include <linux/crypto.h>
> +#include <linux/module.h>
> +#include <asm/simd.h>
> +#include <crypto/ghash.h>
> +#include <crypto/internal/hash.h>
> +#include <crypto/internal/simd.h>
> +
> +/* Zbc (optionally with Zbkb improvements) */
> +void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
> +                         const u8 *inp, size_t len);
> +void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
> +                               const u8 *inp, size_t len);
> +
> +struct riscv64_ghash_ctx {
> +       void (*ghash_func)(u64 Xi[2], const u128 Htable[16],
> +                          const u8 *inp, size_t len);
> +
> +       /* key used by vector asm */
> +       u128 htable[16];

This field looks too big. The assembly only loads the first 128-bit
value from this table. Is this copied from another implementation?
There's an optimization where you precompute the first N powers of H
so that you can perform one finite field reduction for every N
multiplications, but it doesn't look like that's being used here.

> +       /* key used by software fallback */
> +       be128 key;
> +};
> +
> +struct riscv64_ghash_desc_ctx {
> +       u64 shash[2];
> +       u8 buffer[GHASH_DIGEST_SIZE];
> +       int bytes;
> +};
> +
> +static int riscv64_ghash_init(struct shash_desc *desc)
> +{
> +       struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> +       dctx->bytes = 0;
> +       memset(dctx->shash, 0, GHASH_DIGEST_SIZE);
> +       return 0;
> +}
> +
> +#ifdef CONFIG_RISCV_ISA_ZBC
> +
> +#define RISCV64_ZBC_SETKEY(VARIANT, GHASH)                              \
> +void gcm_init_rv64i_ ## VARIANT(u128 Htable[16], const u64 Xi[2]);      \
> +static int riscv64_zbc_ghash_setkey_ ## VARIANT(struct crypto_shash *tfm, \
> +                                                const u8 *key,          \
> +                                                unsigned int keylen)    \
> +{                                                                       \
> +       struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm)); \
> +       const u64 k[2] = { cpu_to_be64(((const u64 *)key)[0]),           \
> +                          cpu_to_be64(((const u64 *)key)[1]) };         \
> +                                                                        \
> +       if (keylen != GHASH_BLOCK_SIZE)                                  \
> +               return -EINVAL;                                          \
> +                                                                        \
> +       memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);                        \
> +       gcm_init_rv64i_ ## VARIANT(ctx->htable, k);                      \
> +                                                                        \
> +       ctx->ghash_func = gcm_ghash_rv64i_ ## GHASH;                     \
> +                                                                        \
> +       return 0;                                                        \
> +}

I'd prefer three separate functions over a macro here. Code-searching
tools and compiler warnings are significantly worse with macros.
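For example, the zbc variant written out by hand; this is exactly what
the macro expands to, so it should be a drop-in replacement, and the
other two variants would differ only in the gcm_init_rv64i_* and
gcm_ghash_rv64i_* callees:

	void gcm_init_rv64i_zbc(u128 Htable[16], const u64 Xi[2]);

	static int riscv64_zbc_ghash_setkey_zbc(struct crypto_shash *tfm,
						const u8 *key,
						unsigned int keylen)
	{
		struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm));
		const u64 k[2] = { cpu_to_be64(((const u64 *)key)[0]),
				   cpu_to_be64(((const u64 *)key)[1]) };

		if (keylen != GHASH_BLOCK_SIZE)
			return -EINVAL;

		memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);
		gcm_init_rv64i_zbc(ctx->htable, k);

		ctx->ghash_func = gcm_ghash_rv64i_zbc;

		return 0;
	}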
> +
> +static int riscv64_zbc_ghash_update(struct shash_desc *desc,
> +                                    const u8 *src, unsigned int srclen)
> +{
> +       unsigned int len;
> +       struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
> +       struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> +       if (dctx->bytes) {
> +               if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
> +                       memcpy(dctx->buffer + dctx->bytes, src, srclen);
> +                       dctx->bytes += srclen;
> +                       return 0;
> +               }
> +               memcpy(dctx->buffer + dctx->bytes, src,
> +                      GHASH_DIGEST_SIZE - dctx->bytes);
> +
> +               ctx->ghash_func(dctx->shash, ctx->htable,
> +                               dctx->buffer, GHASH_DIGEST_SIZE);
> +
> +               src += GHASH_DIGEST_SIZE - dctx->bytes;
> +               srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
> +               dctx->bytes = 0;
> +       }
> +       len = srclen & ~(GHASH_DIGEST_SIZE - 1);
> +
> +       if (len) {
> +               gcm_ghash_rv64i_zbc(dctx->shash, ctx->htable,
> +                                   src, len);
> +               src += len;
> +               srclen -= len;
> +       }
> +
> +       if (srclen) {
> +               memcpy(dctx->buffer, src, srclen);
> +               dctx->bytes = srclen;
> +       }
> +       return 0;
> +}
> +
> +static int riscv64_zbc_ghash_final(struct shash_desc *desc, u8 *out)
> +{
> +       int i;
> +       struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
> +       struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> +       if (dctx->bytes) {
> +               for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
> +                       dctx->buffer[i] = 0;
> +               ctx->ghash_func(dctx->shash, ctx->htable,
> +                               dctx->buffer, GHASH_DIGEST_SIZE);

Can we do this without an indirect call?
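One option, just a sketch and untested: since only two ghash entry
points exist (zbc and zbc__zbkb), setkey could record which variant
applies in a hypothetical `variant` field replacing the `ghash_func`
pointer, and a small helper could branch on it, giving the compiler
direct calls:

	/* Hypothetical replacement for the ghash_func pointer. */
	enum riscv64_ghash_variant {
		RISCV64_GHASH_ZBC,
		RISCV64_GHASH_ZBC_ZBKB,
	};

	static void riscv64_ghash_blocks(struct riscv64_ghash_ctx *ctx,
					 u64 Xi[2], const u8 *src,
					 size_t len)
	{
		if (ctx->variant == RISCV64_GHASH_ZBC_ZBKB)
			gcm_ghash_rv64i_zbc__zbkb(Xi, ctx->htable, src, len);
		else
			gcm_ghash_rv64i_zbc(Xi, ctx->htable, src, len);
	}
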
> +               dctx->bytes = 0;
> +       }
> +       memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
> +       return 0;
> +}
> +
> +RISCV64_ZBC_SETKEY(zbc, zbc);
> +struct shash_alg riscv64_zbc_ghash_alg = {
> +       .digestsize = GHASH_DIGEST_SIZE,
> +       .init = riscv64_ghash_init,
> +       .update = riscv64_zbc_ghash_update,
> +       .final = riscv64_zbc_ghash_final,
> +       .setkey = riscv64_zbc_ghash_setkey_zbc,
> +       .descsize = sizeof(struct riscv64_ghash_desc_ctx)
> +                   + sizeof(struct ghash_desc_ctx),
> +       .base = {
> +                .cra_name = "ghash",
> +                .cra_driver_name = "riscv64_zbc_ghash",
> +                .cra_priority = 250,
> +                .cra_blocksize = GHASH_BLOCK_SIZE,
> +                .cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
> +                .cra_module = THIS_MODULE,
> +       },
> +};
> +
> +RISCV64_ZBC_SETKEY(zbc__zbb, zbc);
> +struct shash_alg riscv64_zbc_zbb_ghash_alg = {
> +       .digestsize = GHASH_DIGEST_SIZE,
> +       .init = riscv64_ghash_init,
> +       .update = riscv64_zbc_ghash_update,
> +       .final = riscv64_zbc_ghash_final,
> +       .setkey = riscv64_zbc_ghash_setkey_zbc__zbb,
> +       .descsize = sizeof(struct riscv64_ghash_desc_ctx)
> +                   + sizeof(struct ghash_desc_ctx),
> +       .base = {
> +                .cra_name = "ghash",
> +                .cra_driver_name = "riscv64_zbc_zbb_ghash",
> +                .cra_priority = 251,
> +                .cra_blocksize = GHASH_BLOCK_SIZE,
> +                .cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
> +                .cra_module = THIS_MODULE,
> +       },
> +};
> +
> +RISCV64_ZBC_SETKEY(zbc__zbkb, zbc__zbkb);
> +struct shash_alg riscv64_zbc_zbkb_ghash_alg = {
> +       .digestsize = GHASH_DIGEST_SIZE,
> +       .init = riscv64_ghash_init,
> +       .update = riscv64_zbc_ghash_update,
> +       .final = riscv64_zbc_ghash_final,
> +       .setkey = riscv64_zbc_ghash_setkey_zbc__zbkb,
> +       .descsize = sizeof(struct riscv64_ghash_desc_ctx)
> +                   + sizeof(struct ghash_desc_ctx),
> +       .base = {
> +                .cra_name = "ghash",
> +                .cra_driver_name = "riscv64_zbc_zbkb_ghash",
> +                .cra_priority = 252,
> +                .cra_blocksize = GHASH_BLOCK_SIZE,
> +                .cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
> +                .cra_module = THIS_MODULE,
> +       },
> +};
> +
> +#endif /* CONFIG_RISCV_ISA_ZBC */
> +
> +#define RISCV64_DEFINED_GHASHES 7
> +
> +static struct shash_alg *riscv64_ghashes[RISCV64_DEFINED_GHASHES];
> +static int num_riscv64_ghashes;
> +
> +static int __init riscv64_ghash_register(struct shash_alg *ghash)
> +{
> +       int ret;
> +
> +       ret = crypto_register_shash(ghash);
> +       if (ret < 0) {
> +               int i;
> +
> +               for (i = num_riscv64_ghashes - 1; i >= 0; i--)
> +                       crypto_unregister_shash(riscv64_ghashes[i]);
> +
> +               num_riscv64_ghashes = 0;
> +
> +               return ret;
> +       }
> +
> +       pr_debug("Registered RISC-V ghash %s\n", ghash->base.cra_driver_name);
> +       riscv64_ghashes[num_riscv64_ghashes] = ghash;
> +       num_riscv64_ghashes++;
> +       return 0;
> +}
> +
> +static int __init riscv64_ghash_mod_init(void)
> +{
> +       int ret = 0;
> +
> +#ifdef CONFIG_RISCV_ISA_ZBC
> +       if (riscv_isa_extension_available(NULL, ZBC)) {
> +               ret = riscv64_ghash_register(&riscv64_zbc_ghash_alg);
> +               if (ret < 0)
> +                       return ret;
> +
> +               if (riscv_isa_extension_available(NULL, ZBB)) {
> +                       ret = riscv64_ghash_register(&riscv64_zbc_zbb_ghash_alg);
> +                       if (ret < 0)
> +                               return ret;
> +               }
> +
> +               if (riscv_isa_extension_available(NULL, ZBKB)) {
> +                       ret = riscv64_ghash_register(&riscv64_zbc_zbkb_ghash_alg);
> +                       if (ret < 0)
> +                               return ret;
> +               }
> +       }
> +#endif
> +
> +       return 0;
> +}
> +
> +static void __exit riscv64_ghash_mod_fini(void)
> +{
> +       int i;
> +
> +       for (i = num_riscv64_ghashes - 1; i >= 0; i--)
> +               crypto_unregister_shash(riscv64_ghashes[i]);
> +
> +       num_riscv64_ghashes = 0;
> +}
> +
> +module_init(riscv64_ghash_mod_init);
> +module_exit(riscv64_ghash_mod_fini);
> +
> +MODULE_DESCRIPTION("GCM GHASH (accelerated)");
> +MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@xxxxxxxx>");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS_CRYPTO("ghash");
> diff --git a/arch/riscv/crypto/ghash-riscv64-zbc.pl b/arch/riscv/crypto/ghash-riscv64-zbc.pl
> new file mode 100644
> index 000000000000..691231ffa11c
> --- /dev/null
> +++ b/arch/riscv/crypto/ghash-riscv64-zbc.pl
> @@ -0,0 +1,400 @@
> +#! /usr/bin/env perl
> +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
> +#
> +# Licensed under the Apache License 2.0 (the "License"). You may not use
> +# this file except in compliance with the License. You can obtain a copy
> +# in the file LICENSE in the source distribution or at
> +# https://www.openssl.org/source/license.html
> +
> +use strict;
> +use warnings;
> +
> +use FindBin qw($Bin);
> +use lib "$Bin";
> +use lib "$Bin/../../perlasm";
> +use riscv;
> +
> +# $output is the last argument if it looks like a file (it has an extension)
> +# $flavour is the first argument if it doesn't look like a file
> +my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
> +my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
> +
> +$output and open STDOUT,">$output";
> +
> +my $code=<<___;
> +.text
> +___
> +
> +################################################################################
> +# void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
> +# void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 H[2]);
> +# void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 H[2]);
> +#
> +# input:  H: 128-bit H - secret parameter E(K, 0^128)
> +# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zbc* and
> +#                 gcm_ghash_rv64i_zbc*
> +#
> +# All callers of this function revert the byte-order unconditionally
> +# on little-endian machines. So we need to revert the byte-order back.
> +# Additionally we reverse the bits of each byte.
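As an aside for other readers: the bit reversal here is per byte, not
across the whole doubleword. A standalone C model of what the
brev8_rv64i helper used below computes (same three mask-and-shift
steps; not part of the patch):

	#include <stdint.h>

	/* Reverse the bit order inside each byte of a 64-bit word. */
	static uint64_t brev8_model(uint64_t x)
	{
		x = ((x << 1) & 0xAAAAAAAAAAAAAAAAULL) | ((x >> 1) & 0x5555555555555555ULL);
		x = ((x << 2) & 0xCCCCCCCCCCCCCCCCULL) | ((x >> 2) & 0x3333333333333333ULL);
		x = ((x << 4) & 0xF0F0F0F0F0F0F0F0ULL) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
		return x;
	}

	/* e.g. brev8_model(0x01) == 0x80 and brev8_model(0x0102) == 0x8040 */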
> +
> +{
> +my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_init_rv64i_zbc
> +.type gcm_init_rv64i_zbc,\@function
> +gcm_init_rv64i_zbc:
> +    ld $VAL0,0($H)
> +    ld $VAL1,8($H)
> +    @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
> +    @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
> +    @{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
> +    @{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
> +    ret
> +.size gcm_init_rv64i_zbc,.-gcm_init_rv64i_zbc
> +___
> +}
> +
> +{
> +my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_init_rv64i_zbc__zbb
> +.type gcm_init_rv64i_zbc__zbb,\@function
> +gcm_init_rv64i_zbc__zbb:
> +    ld $VAL0,0($H)
> +    ld $VAL1,8($H)
> +    @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
> +    @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
> +    @{[rev8 $VAL0, $VAL0]}
> +    @{[rev8 $VAL1, $VAL1]}
> +    sd $VAL0,0($Htable)
> +    sd $VAL1,8($Htable)
> +    ret
> +.size gcm_init_rv64i_zbc__zbb,.-gcm_init_rv64i_zbc__zbb
> +___
> +}
> +
> +{
> +my ($Htable,$H,$TMP0,$TMP1) = ("a0","a1","t0","t1");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_init_rv64i_zbc__zbkb
> +.type gcm_init_rv64i_zbc__zbkb,\@function
> +gcm_init_rv64i_zbc__zbkb:
> +    ld $TMP0,0($H)
> +    ld $TMP1,8($H)
> +    @{[brev8 $TMP0, $TMP0]}
> +    @{[brev8 $TMP1, $TMP1]}
> +    @{[rev8 $TMP0, $TMP0]}
> +    @{[rev8 $TMP1, $TMP1]}
> +    sd $TMP0,0($Htable)
> +    sd $TMP1,8($Htable)
> +    ret
> +.size gcm_init_rv64i_zbc__zbkb,.-gcm_init_rv64i_zbc__zbkb
> +___
> +}
> +
> +################################################################################
> +# void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
> +# void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
> +#
> +# input:  Xi: current hash value
> +#         Htable: copy of H
> +# output: Xi: next hash value Xi
> +#
> +# Compute GMULT (Xi*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
> +# extensions. Using the no-Karatsuba approach and clmul for the final reduction.
> +# This results in an implementation with a minimized number of instructions.
> +# HW with clmul latencies higher than 2 cycles might observe a performance
> +# improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
> +# might observe a performance improvement with additionally converting the
> +# reduction to shift&xor. For a full discussion of these estimates see
> +# https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
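For anyone reviewing the math below without the Zbc spec at hand:
clmul/clmulh return the low and high halves of the 128-bit carry-less
product. Here is a plain-C model of the two instructions and of the
exact multiply+reduce sequence the functions below use (a 1:1
transcription of the asm, standalone, not part of the patch):

	#include <stdint.h>

	/* Low 64 bits of the carry-less (XOR) product of a and b. */
	static uint64_t clmul64(uint64_t a, uint64_t b)
	{
		uint64_t r = 0;

		for (int i = 0; i < 64; i++)
			if (b & (1ULL << i))
				r ^= a << i;
		return r;
	}

	/* High 64 bits of the same 128-bit carry-less product. */
	static uint64_t clmulh64(uint64_t a, uint64_t b)
	{
		uint64_t r = 0;

		for (int i = 1; i < 64; i++)
			if (b & (1ULL << i))
				r ^= a >> (64 - i);
		return r;
	}

	/* Schoolbook 128x128 carry-less multiply into z3:z2:z1:z0, then the
	 * two-step fold with the 0x87 constant, in the same order as the
	 * "Multiplication (without Karatsuba)" and "Reduction with clmul"
	 * sections of the functions below. */
	static void gf128_mul_model(uint64_t x[2], const uint64_t y[2])
	{
		uint64_t z3 = clmulh64(x[1], y[1]);
		uint64_t z2 = clmul64(x[1], y[1]) ^ clmulh64(x[0], y[1]) ^ clmulh64(x[1], y[0]);
		uint64_t z1 = clmul64(x[0], y[1]) ^ clmul64(x[1], y[0]) ^ clmulh64(x[0], y[0]);
		uint64_t z0 = clmul64(x[0], y[0]);

		z2 ^= clmulh64(z3, 0x87);
		z1 ^= clmul64(z3, 0x87);
		x[1] = z1 ^ clmulh64(z2, 0x87);
		x[0] = z0 ^ clmul64(z2, 0x87);
	}

gcm_gmult_rv64i_zbc below is then this sequence with the bit reversal
applied to Xi on load and store.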
> +{
> +my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_gmult_rv64i_zbc
> +.type gcm_gmult_rv64i_zbc,\@function
> +gcm_gmult_rv64i_zbc:
> +    # Load Xi and bit-reverse it
> +    ld $x0, 0($Xi)
> +    ld $x1, 8($Xi)
> +    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> +    @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> +
> +    # Load the key (already bit-reversed)
> +    ld $y0, 0($Htable)
> +    ld $y1, 8($Htable)
> +
> +    # Load the reduction constant
> +    la $polymod, Lpolymod
> +    lbu $polymod, 0($polymod)
> +
> +    # Multiplication (without Karatsuba)
> +    @{[clmulh $z3, $x1, $y1]}
> +    @{[clmul $z2, $x1, $y1]}
> +    @{[clmulh $t1, $x0, $y1]}
> +    @{[clmul $z1, $x0, $y1]}
> +    xor $z2, $z2, $t1
> +    @{[clmulh $t1, $x1, $y0]}
> +    @{[clmul $t0, $x1, $y0]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $x0, $y0]}
> +    @{[clmul $z0, $x0, $y0]}
> +    xor $z1, $z1, $t1
> +
> +    # Reduction with clmul
> +    @{[clmulh $t1, $z3, $polymod]}
> +    @{[clmul $t0, $z3, $polymod]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $z2, $polymod]}
> +    @{[clmul $t0, $z2, $polymod]}
> +    xor $x1, $z1, $t1
> +    xor $x0, $z0, $t0
> +
> +    # Bit-reverse Xi back and store it
> +    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> +    @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> +    sd $x0, 0($Xi)
> +    sd $x1, 8($Xi)
> +    ret
> +.size gcm_gmult_rv64i_zbc,.-gcm_gmult_rv64i_zbc
> +___
> +}
> +
> +{
> +my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_gmult_rv64i_zbc__zbkb
> +.type gcm_gmult_rv64i_zbc__zbkb,\@function
> +gcm_gmult_rv64i_zbc__zbkb:
> +    # Load Xi and bit-reverse it
> +    ld $x0, 0($Xi)
> +    ld $x1, 8($Xi)
> +    @{[brev8 $x0, $x0]}
> +    @{[brev8 $x1, $x1]}
> +
> +    # Load the key (already bit-reversed)
> +    ld $y0, 0($Htable)
> +    ld $y1, 8($Htable)
> +
> +    # Load the reduction constant
> +    la $polymod, Lpolymod
> +    lbu $polymod, 0($polymod)
> +
> +    # Multiplication (without Karatsuba)
> +    @{[clmulh $z3, $x1, $y1]}
> +    @{[clmul $z2, $x1, $y1]}
> +    @{[clmulh $t1, $x0, $y1]}
> +    @{[clmul $z1, $x0, $y1]}
> +    xor $z2, $z2, $t1
> +    @{[clmulh $t1, $x1, $y0]}
> +    @{[clmul $t0, $x1, $y0]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $x0, $y0]}
> +    @{[clmul $z0, $x0, $y0]}
> +    xor $z1, $z1, $t1
> +
> +    # Reduction with clmul
> +    @{[clmulh $t1, $z3, $polymod]}
> +    @{[clmul $t0, $z3, $polymod]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $z2, $polymod]}
> +    @{[clmul $t0, $z2, $polymod]}
> +    xor $x1, $z1, $t1
> +    xor $x0, $z0, $t0
> +
> +    # Bit-reverse Xi back and store it
> +    @{[brev8 $x0, $x0]}
> +    @{[brev8 $x1, $x1]}
> +    sd $x0, 0($Xi)
> +    sd $x1, 8($Xi)
> +    ret
> +.size gcm_gmult_rv64i_zbc__zbkb,.-gcm_gmult_rv64i_zbc__zbkb
> +___
> +}
> +
> +################################################################################
> +# void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
> +#                          const u8 *inp, size_t len);
> +# void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
> +#                                const u8 *inp, size_t len);
> +#
> +# input:  Xi: current hash value
> +#         Htable: copy of H
> +#         inp: pointer to input data
> +#         len: length of input data in bytes (multiple of block size)
> +# output: Xi: Xi+1 (next hash value Xi)
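So per 16-byte block the ghash variants compute Xi = (Xi ^ block) * H,
i.e. the gmult core with the input folded in first. The loop shape, as
a reference model reusing gf128_mul_model() from my earlier sketch
(the real code additionally bit-reverses the input on load):

	#include <stddef.h>
	#include <stdint.h>

	/* Fold each block into Xi, then multiply by H.  Xi, H and inp are
	 * in the implementation's internal (bit-reversed) representation. */
	static void ghash_blocks_model(uint64_t Xi[2], const uint64_t H[2],
				       const uint64_t *inp, size_t nblocks)
	{
		while (nblocks--) {
			Xi[0] ^= *inp++;
			Xi[1] ^= *inp++;
			gf128_mul_model(Xi, H);	/* from the sketch above */
		}
	}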
> +{
> +my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_ghash_rv64i_zbc
> +.type gcm_ghash_rv64i_zbc,\@function
> +gcm_ghash_rv64i_zbc:
> +    # Load Xi and bit-reverse it
> +    ld $x0, 0($Xi)
> +    ld $x1, 8($Xi)
> +    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> +    @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> +
> +    # Load the key (already bit-reversed)
> +    ld $y0, 0($Htable)
> +    ld $y1, 8($Htable)
> +
> +    # Load the reduction constant
> +    la $polymod, Lpolymod
> +    lbu $polymod, 0($polymod)
> +
> +Lstep:
> +    # Load the input data, bit-reverse them, and XOR them with Xi
> +    ld $t0, 0($inp)
> +    ld $t1, 8($inp)
> +    add $inp, $inp, 16
> +    add $len, $len, -16
> +    @{[brev8_rv64i $t0, $z0, $z1, $z2]}
> +    @{[brev8_rv64i $t1, $z0, $z1, $z2]}
> +    xor $x0, $x0, $t0
> +    xor $x1, $x1, $t1
> +
> +    # Multiplication (without Karatsuba)
> +    @{[clmulh $z3, $x1, $y1]}
> +    @{[clmul $z2, $x1, $y1]}
> +    @{[clmulh $t1, $x0, $y1]}
> +    @{[clmul $z1, $x0, $y1]}
> +    xor $z2, $z2, $t1
> +    @{[clmulh $t1, $x1, $y0]}
> +    @{[clmul $t0, $x1, $y0]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $x0, $y0]}
> +    @{[clmul $z0, $x0, $y0]}
> +    xor $z1, $z1, $t1
> +
> +    # Reduction with clmul
> +    @{[clmulh $t1, $z3, $polymod]}
> +    @{[clmul $t0, $z3, $polymod]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $z2, $polymod]}
> +    @{[clmul $t0, $z2, $polymod]}
> +    xor $x1, $z1, $t1
> +    xor $x0, $z0, $t0
> +
> +    # Iterate over all blocks
> +    bnez $len, Lstep
> +
> +    # Bit-reverse final Xi back and store it
> +    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> +    @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> +    sd $x0, 0($Xi)
> +    sd $x1, 8($Xi)
> +    ret
> +.size gcm_ghash_rv64i_zbc,.-gcm_ghash_rv64i_zbc
> +___
> +}
> +
> +{
> +my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_ghash_rv64i_zbc__zbkb
> +.type gcm_ghash_rv64i_zbc__zbkb,\@function
> +gcm_ghash_rv64i_zbc__zbkb:
> +    # Load Xi and bit-reverse it
> +    ld $x0, 0($Xi)
> +    ld $x1, 8($Xi)
> +    @{[brev8 $x0, $x0]}
> +    @{[brev8 $x1, $x1]}
> +
> +    # Load the key (already bit-reversed)
> +    ld $y0, 0($Htable)
> +    ld $y1, 8($Htable)
> +
> +    # Load the reduction constant
> +    la $polymod, Lpolymod
> +    lbu $polymod, 0($polymod)
> +
> +Lstep_zkbk:
> +    # Load the input data, bit-reverse them, and XOR them with Xi
> +    ld $t0, 0($inp)
> +    ld $t1, 8($inp)
> +    add $inp, $inp, 16
> +    add $len, $len, -16
> +    @{[brev8 $t0, $t0]}
> +    @{[brev8 $t1, $t1]}
> +    xor $x0, $x0, $t0
> +    xor $x1, $x1, $t1
> +
> +    # Multiplication (without Karatsuba)
> +    @{[clmulh $z3, $x1, $y1]}
> +    @{[clmul $z2, $x1, $y1]}
> +    @{[clmulh $t1, $x0, $y1]}
> +    @{[clmul $z1, $x0, $y1]}
> +    xor $z2, $z2, $t1
> +    @{[clmulh $t1, $x1, $y0]}
> +    @{[clmul $t0, $x1, $y0]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $x0, $y0]}
> +    @{[clmul $z0, $x0, $y0]}
> +    xor $z1, $z1, $t1
> +
> +    # Reduction with clmul
> +    @{[clmulh $t1, $z3, $polymod]}
> +    @{[clmul $t0, $z3, $polymod]}
> +    xor $z2, $z2, $t1
> +    xor $z1, $z1, $t0
> +    @{[clmulh $t1, $z2, $polymod]}
> +    @{[clmul $t0, $z2, $polymod]}
> +    xor $x1, $z1, $t1
> +    xor $x0, $z0, $t0
> +
> +    # Iterate over all blocks
> +    bnez $len, Lstep_zkbk
> +
> +    # Bit-reverse final Xi back and store it
> +    @{[brev8 $x0, $x0]}
> +    @{[brev8 $x1, $x1]}
> +    sd $x0, 0($Xi)
> +    sd $x1, 8($Xi)
> +    ret
> +.size gcm_ghash_rv64i_zbc__zbkb,.-gcm_ghash_rv64i_zbc__zbkb
> +___
> +}
> +
> +$code .= <<___;
> +.p2align 3
> +Lbrev8_const:
> +    .dword 0xAAAAAAAAAAAAAAAA
> +    .dword 0xCCCCCCCCCCCCCCCC
> +    .dword 0xF0F0F0F0F0F0F0F0
> +.size Lbrev8_const,.-Lbrev8_const
> +
> +Lpolymod:
> +    .byte 0x87
> +.size Lpolymod,.-Lpolymod
> +___
> +
> +print $code;
> +
> +close STDOUT or die "error closing STDOUT: $!";
> diff --git a/arch/riscv/crypto/riscv.pm b/arch/riscv/crypto/riscv.pm
> new file mode 100644
> index 000000000000..b0c786a13ca0
> --- /dev/null
> +++ b/arch/riscv/crypto/riscv.pm
> @@ -0,0 +1,231 @@
> +#! /usr/bin/env perl
> +# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
> +#
> +# Licensed under the Apache License 2.0 (the "License"). You may not use
> +# this file except in compliance with the License. You can obtain a copy
> +# in the file LICENSE in the source distribution or at
> +# https://www.openssl.org/source/license.html
> +
> +use strict;
> +use warnings;
> +
> +# Set $have_stacktrace to 1 if we have Devel::StackTrace
> +my $have_stacktrace = 0;
> +if (eval {require Devel::StackTrace;1;}) {
> +    $have_stacktrace = 1;
> +}
> +
> +my @regs = map("x$_",(0..31));
> +# Mapping from the RISC-V psABI mnemonic names to register numbers.
> +my @regaliases = ('zero','ra','sp','gp','tp','t0','t1','t2','s0','s1',
> +    map("a$_",(0..7)),
> +    map("s$_",(2..11)),
> +    map("t$_",(3..6))
> +);
> +
> +my %reglookup;
> +@reglookup{@regs} = @regs;
> +@reglookup{@regaliases} = @regs;
> +
> +# Takes a register name, possibly an alias, and converts it to a register index
> +# from 0 to 31
> +sub read_reg {
> +    my $reg = lc shift;
> +    if (!exists($reglookup{$reg})) {
> +        my $trace = "";
> +        if ($have_stacktrace) {
> +            $trace = Devel::StackTrace->new->as_string;
> +        }
> +        die("Unknown register ".$reg."\n".$trace);
> +    }
> +    my $regstr = $reglookup{$reg};
> +    if (!($regstr =~ /^x([0-9]+)$/)) {
> +        my $trace = "";
> +        if ($have_stacktrace) {
> +            $trace = Devel::StackTrace->new->as_string;
> +        }
> +        die("Could not process register ".$reg."\n".$trace);
> +    }
> +    return $1;
> +}
> +
> +# Helper functions
> +
> +sub brev8_rv64i {
> +    # brev8 without the `brev8` instruction (only available with Zbkb)
> +    # Reverses the bits in each byte of the first argument and needs
> +    # three scratch registers
> +    my $val = shift;
> +    my $t0 = shift;
> +    my $t1 = shift;
> +    my $brev8_const = shift;
> +    my $seq = <<___;
> +    la $brev8_const, Lbrev8_const
> +
> +    ld $t0, 0($brev8_const)  # 0xAAAAAAAAAAAAAAAA
> +    slli $t1, $val, 1
> +    and $t1, $t1, $t0
> +    and $val, $val, $t0
> +    srli $val, $val, 1
> +    or $val, $t1, $val
> +
> +    ld $t0, 8($brev8_const)  # 0xCCCCCCCCCCCCCCCC
> +    slli $t1, $val, 2
> +    and $t1, $t1, $t0
> +    and $val, $val, $t0
> +    srli $val, $val, 2
> +    or $val, $t1, $val
> +
> +    ld $t0, 16($brev8_const) # 0xF0F0F0F0F0F0F0F0
> +    slli $t1, $val, 4
> +    and $t1, $t1, $t0
> +    and $val, $val, $t0
> +    srli $val, $val, 4
> +    or $val, $t1, $val
> +___
> +    return $seq;
> +}
> +
> +sub sd_rev8_rv64i {
> +    # rev8 without the `rev8` instruction (only available with Zbb or Zbkb)
> +    # Stores the given value byte-reversed and needs one scratch register
> +    my $val = shift;
> +    my $addr = shift;
> +    my $off = shift;
> +    my $tmp = shift;
> +    my $off0 = ($off + 0);
> +    my $off1 = ($off + 1);
> +    my $off2 = ($off + 2);
> +    my $off3 = ($off + 3);
> +    my $off4 = ($off + 4);
> +    my $off5 = ($off + 5);
> +    my $off6 = ($off + 6);
> +    my $off7 = ($off + 7);
> +    my $seq = <<___;
> +    sb $val, $off7($addr)
> +    srli $tmp, $val, 8
> +    sb $tmp, $off6($addr)
> +    srli $tmp, $val, 16
> +    sb $tmp, $off5($addr)
> +    srli $tmp, $val, 24
> +    sb $tmp, $off4($addr)
> +    srli $tmp, $val, 32
> +    sb $tmp, $off3($addr)
> +    srli $tmp, $val, 40
> +    sb $tmp, $off2($addr)
> +    srli $tmp, $val, 48
> +    sb $tmp, $off1($addr)
> +    srli $tmp, $val, 56
> +    sb $tmp, $off0($addr)
> +___
> +    return $seq;
> +}
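One note for readers on the encoding subs that follow: they emit fixed
.word values so the generated .S still assembles with toolchains whose
assembler lacks Zbc/Zbkb support. Each template is the instruction
with rd/rs1/rs2 zeroed, and the register indices are OR-ed into the
standard R-type fields. A standalone C check of the arithmetic
(example operands are mine, not from the patch):

	#include <stdint.h>
	#include <stdio.h>

	/* Same field packing as the perl subs below:
	 * template | rs2 << 20 | rs1 << 15 | rd << 7 */
	static uint32_t rtype(uint32_t template, uint32_t rd, uint32_t rs1,
			      uint32_t rs2)
	{
		return template | (rs2 << 20) | (rs1 << 15) | (rd << 7);
	}

	int main(void)
	{
		/* clmul template: funct7=0000101, funct3=001, opcode=0110011,
		 * i.e. 0x0A001033.  "clmul t0,a4,a6" is rd=x5, rs1=x14, rs2=x16. */
		printf(".word 0x%08x\n", rtype(0x0A001033, 5, 14, 16)); /* 0x0b0712b3 */
		return 0;
	}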
> +
> +# Scalar crypto instructions
> +
> +sub aes64ds {
> +    # Encoding for aes64ds rd, rs1, rs2 instruction on RV64
> +    # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b0011101_00000_00000_000_00000_0110011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rs2 = read_reg shift;
> +    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64dsm {
> +    # Encoding for aes64dsm rd, rs1, rs2 instruction on RV64
> +    # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b0011111_00000_00000_000_00000_0110011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rs2 = read_reg shift;
> +    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64es {
> +    # Encoding for aes64es rd, rs1, rs2 instruction on RV64
> +    # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b0011001_00000_00000_000_00000_0110011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rs2 = read_reg shift;
> +    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64esm {
> +    # Encoding for aes64esm rd, rs1, rs2 instruction on RV64
> +    # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b0011011_00000_00000_000_00000_0110011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rs2 = read_reg shift;
> +    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64im {
> +    # Encoding for aes64im rd, rs1 instruction on RV64
> +    # XXXXXXXXXXXX_ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b001100000000_00000_001_00000_0010011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    return ".word ".($template | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64ks1i {
> +    # Encoding for aes64ks1i rd, rs1, rnum instruction on RV64
> +    # XXXXXXXX_rnum_ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b00110001_0000_00000_001_00000_0010011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rnum = shift;
> +    return ".word ".($template | ($rnum << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64ks2 {
> +    # Encoding for aes64ks2 rd, rs1, rs2 instruction on RV64
> +    # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b0111111_00000_00000_000_00000_0110011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rs2 = read_reg shift;
> +    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub brev8 {
> +    # Encoding for brev8 rd, rs instruction on RV64
> +    # XXXXXXXXXXXX_ rs _XXX_ rd _XXXXXXX
> +    my $template = 0b011010000111_00000_101_00000_0010011;
> +    my $rd = read_reg shift;
> +    my $rs = read_reg shift;
> +    return ".word ".($template | ($rs << 15) | ($rd << 7));
> +}
> +
> +sub clmul {
> +    # Encoding for clmul rd, rs1, rs2 instruction on RV64
> +    # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b0000101_00000_00000_001_00000_0110011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rs2 = read_reg shift;
> +    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub clmulh {
> +    # Encoding for clmulh rd, rs1, rs2 instruction on RV64
> +    # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> +    my $template = 0b0000101_00000_00000_011_00000_0110011;
> +    my $rd = read_reg shift;
> +    my $rs1 = read_reg shift;
> +    my $rs2 = read_reg shift;
> +    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub rev8 {
> +    # Encoding for rev8 rd, rs instruction on RV64
> +    # XXXXXXXXXXXXX_ rs _XXX_ rd _XXXXXXX
> +    my $template = 0b011010111000_00000_101_00000_0010011;
> +    my $rd = read_reg shift;
> +    my $rs = read_reg shift;
> +    return ".word ".($template | ($rs << 15) | ($rd << 7));
> +}
> +
> +1;
> --
> 2.39.0
>

Thanks,
Huck