lib/crypto: x86/sha1: Migrate optimized code into library

Instead of exposing the x86-optimized SHA-1 code via x86-specific
crypto_shash algorithms, just implement the sha1_blocks() library
function.  This is much simpler, it makes the SHA-1 library functions
x86-optimized, and it fixes the longstanding issue where the
x86-optimized SHA-1 code was disabled by default.  SHA-1 still remains
available through crypto_shash, but individual architectures no longer
need to handle it.

To match sha1_blocks(), change the type of the nblocks parameter of the
assembly functions from int to size_t.  The assembly functions actually
already treated it as size_t.
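
For illustration, a hedged sketch of the block-only contract that
sha1_blocks() implements: callers pass whole 64-byte blocks and buffer
any partial tail themselves.  struct sha1_demo_ctx and sha1_demo_feed()
are hypothetical names for this sketch only; sha1_blocks(),
struct sha1_block_state, and SHA1_BLOCK_SIZE are the real names used in
this diff:

	#include <crypto/sha1.h>
	#include <linux/string.h>
	#include <linux/types.h>

	/* Hypothetical buffering context, for this sketch only. */
	struct sha1_demo_ctx {
		struct sha1_block_state state;
		u8 buf[SHA1_BLOCK_SIZE];
		size_t buflen;
	};

	/* Assumes ctx->buf is empty on entry. */
	static void sha1_demo_feed(struct sha1_demo_ctx *ctx,
				   const u8 *data, size_t len)
	{
		size_t nblocks = len / SHA1_BLOCK_SIZE;

		/* Whole blocks go straight to the (possibly SIMD) code. */
		if (nblocks)
			sha1_blocks(&ctx->state, data, nblocks);

		/* The partial tail is buffered; padding and finalization
		 * are handled later, outside the block function. */
		data += nblocks * SHA1_BLOCK_SIZE;
		ctx->buflen = len % SHA1_BLOCK_SIZE;
		memcpy(ctx->buf, data, ctx->buflen);
	}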

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-14-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Eric Biggers <ebiggers@kernel.org>  2025-07-12 16:23:04 -07:00
parent c751059985
commit f3d6cb3dc0
9 changed files with 94 additions and 368 deletions

arch/x86/crypto/Kconfig

@@ -376,20 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI
 	  Architecture: x86_64 using:
 	  - CLMUL-NI (carry-less multiplication new instructions)
 
-config CRYPTO_SHA1_SSSE3
-	tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
-	depends on 64BIT
-	select CRYPTO_SHA1
-	select CRYPTO_HASH
-	help
-	  SHA-1 secure hash algorithm (FIPS 180)
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX (Advanced Vector Extensions)
-	  - AVX2 (Advanced Vector Extensions 2)
-	  - SHA-NI (SHA Extensions New Instructions)
-
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
 	depends on 64BIT

arch/x86/crypto/Makefile

@@ -51,9 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
 aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
 endif
 
-obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
-
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o

arch/x86/crypto/sha1_ssse3_glue.c (deleted)

@@ -1,324 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Cryptographic API.
*
* Glue code for the SHA1 Secure Hash Algorithm assembler implementations
* using SSSE3, AVX, AVX2, and SHA-NI instructions.
*
* This file is based on sha1_generic.c
*
* Copyright (c) Alan Smithee.
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
* Copyright (c) Mathias Krause <minipli@googlemail.com>
* Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
#include <crypto/internal/hash.h>
#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/module.h>
static const struct x86_cpu_id module_cpu_ids[] = {
X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
static inline int sha1_update_x86(struct shash_desc *desc, const u8 *data,
unsigned int len, sha1_block_fn *sha1_xform)
{
int remain;
/*
* Make sure struct sha1_state begins directly with the SHA1
* 160-bit internal state, as this is what the asm functions expect.
*/
BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
kernel_fpu_begin();
remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform);
kernel_fpu_end();
return remain;
}
static inline int sha1_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out,
sha1_block_fn *sha1_xform)
{
kernel_fpu_begin();
sha1_base_do_finup(desc, data, len, sha1_xform);
kernel_fpu_end();
return sha1_base_finish(desc, out);
}
asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
const u8 *data, int blocks);
static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update_x86(desc, data, len, sha1_transform_ssse3);
}
static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
}
static struct shash_alg sha1_ssse3_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ssse3_update,
.finup = sha1_ssse3_finup,
.descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int register_sha1_ssse3(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
return crypto_register_shash(&sha1_ssse3_alg);
return 0;
}
static void unregister_sha1_ssse3(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shash(&sha1_ssse3_alg);
}
asmlinkage void sha1_transform_avx(struct sha1_state *state,
const u8 *data, int blocks);
static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update_x86(desc, data, len, sha1_transform_avx);
}
static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out, sha1_transform_avx);
}
static struct shash_alg sha1_avx_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_avx_update,
.finup = sha1_avx_finup,
.descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static bool avx_usable(void)
{
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (boot_cpu_has(X86_FEATURE_AVX))
pr_info("AVX detected but unusable.\n");
return false;
}
return true;
}
static int register_sha1_avx(void)
{
if (avx_usable())
return crypto_register_shash(&sha1_avx_alg);
return 0;
}
static void unregister_sha1_avx(void)
{
if (avx_usable())
crypto_unregister_shash(&sha1_avx_alg);
}
#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
asmlinkage void sha1_transform_avx2(struct sha1_state *state,
const u8 *data, int blocks);
static bool avx2_usable(void)
{
if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
&& boot_cpu_has(X86_FEATURE_BMI1)
&& boot_cpu_has(X86_FEATURE_BMI2))
return true;
return false;
}
static inline void sha1_apply_transform_avx2(struct sha1_state *state,
const u8 *data, int blocks)
{
/* Select the optimal transform based on data block size */
if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
sha1_transform_avx2(state, data, blocks);
else
sha1_transform_avx(state, data, blocks);
}
static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update_x86(desc, data, len, sha1_apply_transform_avx2);
}
static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
}
static struct shash_alg sha1_avx2_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_avx2_update,
.finup = sha1_avx2_finup,
.descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int register_sha1_avx2(void)
{
if (avx2_usable())
return crypto_register_shash(&sha1_avx2_alg);
return 0;
}
static void unregister_sha1_avx2(void)
{
if (avx2_usable())
crypto_unregister_shash(&sha1_avx2_alg);
}
asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
int rounds);
static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha1_update_x86(desc, data, len, sha1_ni_transform);
}
static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return sha1_finup(desc, data, len, out, sha1_ni_transform);
}
static struct shash_alg sha1_ni_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ni_update,
.finup = sha1_ni_finup,
.descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ni",
.cra_priority = 250,
.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int register_sha1_ni(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI))
return crypto_register_shash(&sha1_ni_alg);
return 0;
}
static void unregister_sha1_ni(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI))
crypto_unregister_shash(&sha1_ni_alg);
}
static int __init sha1_ssse3_mod_init(void)
{
if (!x86_match_cpu(module_cpu_ids))
return -ENODEV;
if (register_sha1_ssse3())
goto fail;
if (register_sha1_avx()) {
unregister_sha1_ssse3();
goto fail;
}
if (register_sha1_avx2()) {
unregister_sha1_avx();
unregister_sha1_ssse3();
goto fail;
}
if (register_sha1_ni()) {
unregister_sha1_avx2();
unregister_sha1_avx();
unregister_sha1_ssse3();
goto fail;
}
return 0;
fail:
return -ENODEV;
}
static void __exit sha1_ssse3_mod_fini(void)
{
unregister_sha1_ni();
unregister_sha1_avx2();
unregister_sha1_avx();
unregister_sha1_ssse3();
}
module_init(sha1_ssse3_mod_init);
module_exit(sha1_ssse3_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
MODULE_ALIAS_CRYPTO("sha1");
MODULE_ALIAS_CRYPTO("sha1-ssse3");
MODULE_ALIAS_CRYPTO("sha1-avx");
MODULE_ALIAS_CRYPTO("sha1-avx2");
MODULE_ALIAS_CRYPTO("sha1-ni");

lib/crypto/Kconfig

@@ -152,6 +152,7 @@ config CRYPTO_LIB_SHA1_ARCH
 	default y if PPC
 	default y if S390
 	default y if SPARC64
+	default y if X86_64
 
 config CRYPTO_LIB_SHA256
 	tristate

lib/crypto/Makefile

@@ -82,6 +82,9 @@ libsha1-y += powerpc/sha1-powerpc-asm.o
 libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o
 endif
 libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o
+libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \
+			 x86/sha1-avx2-asm.o \
+			 x86/sha1-ni-asm.o
 endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
 
 ################################################################################

arch/x86/crypto/sha1_avx2_x86_64_asm.S → lib/crypto/x86/sha1-avx2-asm.S

@@ -62,11 +62,8 @@
 *Visit http://software.intel.com/en-us/articles/
 *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
-*Updates 20-byte SHA-1 record at start of 'state', from 'input', for
-*even number of 'blocks' consecutive 64-byte blocks.
-*
-*extern "C" void sha1_transform_avx2(
-*	struct sha1_state *state, const u8* input, int blocks );
+* void sha1_transform_avx2(struct sha1_block_state *state,
+*			   const u8 *data, size_t nblocks);
 */
 
 #include <linux/linkage.h>

arch/x86/crypto/sha1_ni_asm.S → lib/crypto/x86/sha1-ni-asm.S

@@ -54,7 +54,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/cfi_types.h>
 
 #define DIGEST_PTR	%rdi	/* 1st arg */
 #define DATA_PTR	%rsi	/* 2nd arg */
@@ -74,26 +73,22 @@
 /*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ * Intel SHA Extensions optimized implementation of a SHA-1 block function
  *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64 byte blocks to process. Once all blocks have
- * been processed, the digest pointer is updated with the resulting hash value.
- * The function only processes complete blocks, there is no functionality to
- * store partial blocks. All message padding and hash value initialization must
- * be done outside the update function.
+ * This function takes a pointer to the current SHA-1 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process. Once all blocks
+ * have been processed, the state is updated with the new state. This function
+ * only processes complete blocks. State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
  *
  * The indented lines in the loop are instructions related to rounds processing.
  * The non-indented lines are instructions related to the message schedule.
  *
- * void sha1_ni_transform(uint32_t *digest, const void *data,
- *			  uint32_t numBlocks)
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
+ * void sha1_ni_transform(struct sha1_block_state *state,
+ *			  const u8 *data, size_t nblocks)
  */
 .text
-SYM_TYPED_FUNC_START(sha1_ni_transform)
+SYM_FUNC_START(sha1_ni_transform)
 	push	%rbp
 	mov	%rsp, %rbp
 	sub	$FRAME_SIZE, %rsp

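The nblocks type change called out in the commit message, as it lands
for the SHA-NI function (both declarations are taken verbatim from this
diff; the C-side declaration moves into lib/crypto/x86/sha1.h below):

	/* Before: int block count, and the shash-layer struct sha1_state. */
	asmlinkage void sha1_ni_transform(struct sha1_state *digest,
					  const u8 *data, int rounds);

	/* After: size_t block count, and the bare struct sha1_block_state. */
	asmlinkage void sha1_ni_transform(struct sha1_block_state *state,
					  const u8 *data, size_t nblocks);
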
arch/x86/crypto/sha1_ssse3_asm.S → lib/crypto/x86/sha1-ssse3-and-avx.S

@@ -25,7 +25,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/cfi_types.h>
 
 #define CTX	%rdi	// arg1
 #define BUF	%rsi	// arg2
@@ -68,7 +67,7 @@
  * param: function's name
  */
 .macro SHA1_VECTOR_ASM	name
-SYM_TYPED_FUNC_START(\name)
+SYM_FUNC_START(\name)
 
 	push	%rbx
 	push	%r12
@@ -461,10 +460,8 @@ W_PRECALC_SSSE3
 /*
  * SSSE3 optimized implementation:
  *
- * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
- *					const u8 *data, int blocks);
- *
- * Note that struct sha1_state is assumed to begin with u32 state[5].
+ * void sha1_transform_ssse3(struct sha1_block_state *state,
+ *			     const u8 *data, size_t nblocks);
  */
 SHA1_VECTOR_ASM	sha1_transform_ssse3
@@ -548,7 +545,7 @@ W_PRECALC_AVX
 /* AVX optimized implementation:
- * extern "C" void sha1_transform_avx(struct sha1_state *state,
- *				      const u8 *data, int blocks);
+ * void sha1_transform_avx(struct sha1_block_state *state,
+ *			   const u8 *data, size_t nblocks);
  */
 SHA1_VECTOR_ASM	sha1_transform_avx

lib/crypto/x86/sha1.h (new file, 74 lines)

@@ -0,0 +1,74 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SHA-1 optimized for x86_64
*
* Copyright 2025 Google LLC
*/
#include <asm/fpu/api.h>
#include <linux/static_call.h>
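/*
 * Default to the portable implementation; sha1_mod_init_arch() below
 * re-points this static call to the best SIMD variant at boot.
 */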
DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic);
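/*
 * Each wrapper defined below uses the SIMD registers only when
 * irq_fpu_usable() permits, and otherwise falls back to
 * sha1_blocks_generic(), so the library API stays usable in any context.
 */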
#define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \
asmlinkage void asm_fn(struct sha1_block_state *state, \
const u8 *data, size_t nblocks); \
static void c_fn(struct sha1_block_state *state, \
const u8 *data, size_t nblocks) \
{ \
if (likely(irq_fpu_usable())) { \
kernel_fpu_begin(); \
asm_fn(state, data, nblocks); \
kernel_fpu_end(); \
} else { \
sha1_blocks_generic(state, data, nblocks); \
} \
}
DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3);
DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx);
DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform);
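/*
 * Dispatch heuristic: sha1_transform_avx2() is only used once at least
 * SHA1_AVX2_BLOCK_OPTSIZE blocks are available; below that threshold the
 * plain AVX routine is used instead.
 */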
#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
asmlinkage void sha1_transform_avx2(struct sha1_block_state *state,
const u8 *data, size_t nblocks);
static void sha1_blocks_avx2(struct sha1_block_state *state,
const u8 *data, size_t nblocks)
{
if (likely(irq_fpu_usable())) {
kernel_fpu_begin();
/* Select the optimal transform based on the number of blocks */
if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE)
sha1_transform_avx2(state, data, nblocks);
else
sha1_transform_avx(state, data, nblocks);
kernel_fpu_end();
} else {
sha1_blocks_generic(state, data, nblocks);
}
}
static void sha1_blocks(struct sha1_block_state *state,
const u8 *data, size_t nblocks)
{
static_call(sha1_blocks_x86)(state, data, nblocks);
}
#define sha1_mod_init_arch sha1_mod_init_arch
static inline void sha1_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
static_call_update(sha1_blocks_x86, sha1_blocks_ni);
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
NULL) &&
boot_cpu_has(X86_FEATURE_AVX)) {
if (boot_cpu_has(X86_FEATURE_AVX2) &&
boot_cpu_has(X86_FEATURE_BMI1) &&
boot_cpu_has(X86_FEATURE_BMI2))
static_call_update(sha1_blocks_x86, sha1_blocks_avx2);
else
static_call_update(sha1_blocks_x86, sha1_blocks_avx);
} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
static_call_update(sha1_blocks_x86, sha1_blocks_ssse3);
}
}
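
With this in place, SHA-1 library users on x86_64 pick up the SIMD
implementations transparently.  A minimal caller sketch, assuming the
one-shot sha1() helper declared in <crypto/sha1.h> (demo_digest() is a
hypothetical name for this sketch):

	#include <crypto/sha1.h>

	static void demo_digest(const u8 *buf, size_t len)
	{
		u8 digest[SHA1_DIGEST_SIZE];

		/*
		 * sha1() splits the input into 64-byte blocks and calls
		 * sha1_blocks(), which static-calls the variant selected
		 * by sha1_mod_init_arch() above.
		 */
		sha1(buf, len, digest);
	}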