Add AVX512BW variant of fletcher

It is much faster than AVX512F when byteswapping on Skylake-SP and newer, as we can do the byteswap in a single vpshufb instead of many instructions. Reviewed-by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Chunwei Chen <tuxoko@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #9517
2019-10-30 20:26:14 +01:00 · 2019-10-30 20:26:14 +01:00 · 0b2a642351
parent bae11ba8dc
commit 0b2a642351
4 changed files with 57 additions and 1 deletions
--- a/include/zfs_fletcher.h
+++ b/include/zfs_fletcher.h
@ -143,6 +143,10 @@ extern const fletcher_4_ops_t fletcher_4_avx2_ops;
 extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512BW)
 extern const fletcher_4_ops_t fletcher_4_avx512bw_ops;
 #endif
 #if defined(__aarch64__)
 extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
 #endif
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@ -1507,7 +1507,7 @@ Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
 Select a fletcher 4 implementation.
 .sp
 Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
-\fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR.
+\fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR.
 All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
 set extensions to be available and will only appear if ZFS detects that they are
 present at runtime. If multiple implementations of fletcher 4 are available,
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@ -184,6 +184,9 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = {
 #if defined(__x86_64) && defined(HAVE_AVX512F)
 	&fletcher_4_avx512f_ops,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512BW)
 	&fletcher_4_avx512bw_ops,
 #endif
 #if defined(__aarch64__)
 	&fletcher_4_aarch64_neon_ops,
 #endif
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@ -171,4 +171,53 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
 	.name = "avx512f"
 };
 #if defined(HAVE_AVX512BW)
 /*
  * Fletcher-4 accumulation over byte-swapped input, AVX512BW flavour.
  *
  * Each loop iteration loads eight 32-bit words from buf, zero-extends
  * them to eight 64-bit lanes (vpmovzxdq), reverses the byte order of
  * each word with a single vpshufb, and folds the result into the four
  * running sums held in zmm0..zmm3 (a += w; b += a; c += b; d += c).
  *
  * ctx  - checksum context; read by FLETCHER_4_AVX512_RESTORE_CTX and
  *        written by FLETCHER_4_AVX512_SAVE_CTX (macros defined outside
  *        this view; presumably they move the accumulators between ctx
  *        and zmm0..zmm3 -- confirm against their definition).
  * buf  - input data, consumed as 32-bit words.
  * size - length of buf in bytes.
  *
  * NOTE(review): the loop advances 8 words (32 bytes) at a time, so this
  * assumes size is a multiple of 32; otherwise the final load would read
  * past the end of buf. Confirm the callers guarantee this.
  */
 static void
 fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
     uint64_t size)
 {
 	/*
 	 * vpshufb control vector. vpshufb indexes bytes within each
 	 * 128-bit lane, so the two 64-bit elements of a lane use source
 	 * indices 3..0 and 11..8 respectively to reverse the low four
 	 * bytes of their own element. The 0xFF selectors have the top
 	 * bit set, which makes vpshufb write zero, keeping the upper
 	 * four bytes of every 64-bit element clear.
 	 */
 	static const zfs_fletcher_avx512_t mask = {
 		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
 	};
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
 	/* All zmm register use below is bracketed by kfpu_begin/kfpu_end. */
 	kfpu_begin();
 	FLETCHER_4_AVX512_RESTORE_CTX(ctx);
 	/* zmm5 holds the shuffle mask for the whole loop. */
 	__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
 	for (; ip < ipend; ip += 8) {
 		/* Load 8 x u32 and zero-extend to 8 x u64 in zmm4. */
 		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
 		/* Byte-swap each 32-bit word in place. */
 		__asm("vpshufb %zmm5, %zmm4, %zmm4");
 		/* Cascade the word into the four running sums. */
 		__asm("vpaddq %zmm4, %zmm0, %zmm0");
 		__asm("vpaddq %zmm0, %zmm1, %zmm1");
 		__asm("vpaddq %zmm1, %zmm2, %zmm2");
 		__asm("vpaddq %zmm2, %zmm3, %zmm3");
 	}
 	/*
 	 * Terminate the macro invocation like any other statement, as is
 	 * done for FLETCHER_4_AVX512_RESTORE_CTX above, rather than
 	 * depending on how the macro happens to expand.
 	 */
 	FLETCHER_4_AVX512_SAVE_CTX(ctx);
 	kfpu_end();
 }
 STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
 const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
 	.init_native = fletcher_4_avx512f_init,
 	.fini_native = fletcher_4_avx512f_fini,
 	.compute_native = fletcher_4_avx512f_native,
 	.init_byteswap = fletcher_4_avx512f_init,
 	.fini_byteswap = fletcher_4_avx512f_fini,
 	.compute_byteswap = fletcher_4_avx512bw_byteswap,
 	.valid = fletcher_4_avx512f_valid,
 	.name = "avx512bw"
 };
 #endif
 #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */