Fletcher4 implementation using avx512f instruction set

The algorithm runs 8 parallel sums, consuming 8 uint32_t elements per
loop iteration. The size alignment check of the main fletcher4 methods is
adjusted accordingly. The new implementation is called 'avx512f'.

Note: the byteswap method can be implemented more efficiently once avx512bw
hardware becomes available. Currently, it is ~2x slower than the native method.

The table shows the results of the full (native) fletcher4 calculation for different buffer sizes:

fletcher4   4KB     16KB    64KB    128KB   256KB   1MB     16MB
--------------------------------------------------------------------
[scalar]    1213    1228    1231    1231    1225    1200    1160
[sse2]      2374    2442    2459    2456    2462    2250    2220
[avx2]      4288    4753    4871    4893    4900    4050    3882
[avx512f]   5975    8445    9196    9221    9262    6307    5620

Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4952
This commit is contained in:
Gvozden Neskovic 2016-07-06 13:42:04 +02:00 committed by Brian Behlendorf
parent 32ffaa3de5
commit 70b258fc96
6 changed files with 182 additions and 10 deletions

View File

@ -73,6 +73,10 @@ extern const fletcher_4_ops_t fletcher_4_ssse3_ops;
extern const fletcher_4_ops_t fletcher_4_avx2_ops;
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
#endif
#ifdef __cplusplus
}
#endif

View File

@ -24,6 +24,7 @@ KERNEL_C = \
zfs_fletcher.c \
zfs_fletcher_intel.c \
zfs_fletcher_sse.c \
zfs_fletcher_avx512.c \
zfs_namecheck.c \
zfs_prop.c \
zfs_uio.c \

View File

@ -883,14 +883,14 @@ Default value: \fB67,108,864\fR.
Select a fletcher 4 implementation.
.sp
Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
and \fBavx2\fR. All of the selectors except \fBfastest\fR and \fBscalar\fR
require instruction set extensions to be available and will only appear if ZFS
detects that they are present at runtime. If multiple implementations of
fletcher 4 are available, the \fBfastest\fR will be chosen using a micro
benchmark. Selecting \fBscalar\fR results in the original CPU based calculation
being used. Selecting any option other than \fBfastest\fR and \fBscalar\fR
results in vector instructions from the respective CPU instruction set being
used.
\fBavx2\fR, and \fBavx512f\fR.
All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
set extensions to be available and will only appear if ZFS detects that they are
present at runtime. If multiple implementations of fletcher 4 are available,
the \fBfastest\fR will be chosen using a micro benchmark. Selecting \fBscalar\fR
results in the original, CPU based calculation, being used. Selecting any option
other than \fBfastest\fR and \fBscalar\fR results in vector instructions from
the respective CPU instruction set being used.
.sp
Default value: \fBfastest\fR.
.RE

View File

@ -18,3 +18,4 @@ $(MODULE)-objs += zpool_prop.o
$(MODULE)-$(CONFIG_X86) += zfs_fletcher_intel.o
$(MODULE)-$(CONFIG_X86) += zfs_fletcher_sse.o
$(MODULE)-$(CONFIG_X86) += zfs_fletcher_avx512.o

View File

@ -158,6 +158,9 @@ static const fletcher_4_ops_t *fletcher_4_algos[] = {
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
&fletcher_4_avx2_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
&fletcher_4_avx512f_ops,
#endif
};
static enum fletcher_selector {
@ -171,6 +174,9 @@ static enum fletcher_selector {
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
FLETCHER_AVX2,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
FLETCHER_AVX512F,
#endif
FLETCHER_CYCLE
} fletcher_4_impl_chosen = FLETCHER_SCALAR;
@ -190,6 +196,9 @@ static struct fletcher_4_impl_selector {
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
[ FLETCHER_AVX2 ] = { "avx2", &fletcher_4_avx2_ops },
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
[ FLETCHER_AVX512F ] = { "avx512f", &fletcher_4_avx512f_ops },
#endif
#if !defined(_KERNEL)
[ FLETCHER_CYCLE ] = { "cycle", &fletcher_4_scalar_ops }
#endif
@ -354,7 +363,7 @@ fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
const fletcher_4_ops_t *ops;
if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t)))
if (IS_P2ALIGNED(size, 8 * sizeof (uint32_t)))
ops = fletcher_4_impl_get();
else
ops = &fletcher_4_scalar_ops;
@ -370,7 +379,7 @@ fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
const fletcher_4_ops_t *ops;
if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t)))
if (IS_P2ALIGNED(size, 8 * sizeof (uint32_t)))
ops = fletcher_4_impl_get();
else
ops = &fletcher_4_scalar_ops;

View File

@ -0,0 +1,157 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
#if defined(__x86_64) && defined(HAVE_AVX512F)
#include <linux/simd_x86.h>
#include <sys/byteorder.h>
#include <sys/spa_checksum.h>
#include <zfs_fletcher.h>
#define __asm __asm__ __volatile__
/*
 * Eight 64-bit values (64 bytes in total) with 64-byte alignment, which is
 * the size of one zmm register. fletcher_4_avx512f_fini() stores zmm
 * registers into objects of this type so the lanes can be read from C.
 */
typedef struct {
uint64_t v[8] __attribute__((aligned(64)));
} zfs_avx512_t;
/*
 * Begin a checksum computation.
 *
 * Calls kfpu_begin() and zeroes zmm0-zmm3. The compute routines in this
 * file accumulate into those four registers, and
 * fletcher_4_avx512f_fini() reads them back and calls kfpu_end().
 *
 * zcp is not referenced by this function.
 */
static void
fletcher_4_avx512f_init(zio_cksum_t *zcp)
{
kfpu_begin();
/* clear registers */
__asm("vpxorq %zmm0, %zmm0, %zmm0");
__asm("vpxorq %zmm1, %zmm1, %zmm1");
__asm("vpxorq %zmm2, %zmm2, %zmm2");
__asm("vpxorq %zmm3, %zmm3, %zmm3");
}
/*
 * Accumulate size bytes starting at buf into the running sums held in
 * zmm0-zmm3 (native byte order).
 *
 * Each iteration consumes eight uint32_t elements (32 bytes). The loop
 * has no handling for a partial tail, so size is expected to be a
 * multiple of 8 * sizeof (uint32_t).
 *
 * The third parameter is not referenced. Nothing is written to memory
 * here; the result stays in the zmm registers until
 * fletcher_4_avx512f_fini() stores it.
 */
static void
fletcher_4_avx512f(const void *buf, uint64_t size, zio_cksum_t *unused)
{
const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
for (; ip < ipend; ip += 8) {
/* zero-extend eight 32-bit elements into the eight 64-bit lanes */
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
/* cascade the four running sums: zmm0 += input, zmm1 += zmm0, ... */
__asm("vpaddq %zmm4, %zmm0, %zmm0");
__asm("vpaddq %zmm0, %zmm1, %zmm1");
__asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3");
}
}
/*
 * Same accumulation as fletcher_4_avx512f(), but every 32-bit element is
 * byte-swapped before it is added to the running sums in zmm0-zmm3.
 *
 * The swap is built from shifts, ANDs and ORs on the 64-bit lanes:
 * zmm8-zmm11 hold the per-lane masks 0xFF, 0xFF00, 0xFF0000 and
 * 0xFF000000, and each source byte is shifted to its mirrored position
 * and masked before being merged into zmm4.
 *
 * Each iteration consumes eight uint32_t elements (32 bytes). The loop
 * has no handling for a partial tail, so size is expected to be a
 * multiple of 8 * sizeof (uint32_t). The third parameter is not
 * referenced.
 */
static void
fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
{
static const uint64_t byteswap_mask = 0xFFULL;
const uint32_t *ip = buf;
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
/* zmm8 = 0xFF in every lane; zmm9..zmm11 = that mask shifted by 8/16/24 */
__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
__asm("vpsllq $8, %zmm8, %zmm9");
__asm("vpsllq $16, %zmm8, %zmm10");
__asm("vpsllq $24, %zmm8, %zmm11");
for (; ip < ipend; ip += 8) {
/* zmm5 = eight 32-bit elements zero-extended to 64-bit lanes */
__asm("vpmovzxdq %0, %%zmm5"::"m" (*ip));
/* byte 3 -> byte 0 */
__asm("vpsrlq $24, %zmm5, %zmm6");
__asm("vpandd %zmm8, %zmm6, %zmm6");
/* byte 2 -> byte 1 */
__asm("vpsrlq $8, %zmm5, %zmm7");
__asm("vpandd %zmm9, %zmm7, %zmm7");
__asm("vpord %zmm6, %zmm7, %zmm4");
/* byte 1 -> byte 2 */
__asm("vpsllq $8, %zmm5, %zmm6");
__asm("vpandd %zmm10, %zmm6, %zmm6");
__asm("vpord %zmm6, %zmm4, %zmm4");
/* byte 0 -> byte 3; zmm4 now holds the swapped elements */
__asm("vpsllq $24, %zmm5, %zmm5");
__asm("vpandd %zmm11, %zmm5, %zmm5");
__asm("vpord %zmm5, %zmm4, %zmm4");
/* cascade the four running sums: zmm0 += input, zmm1 += zmm0, ... */
__asm("vpaddq %zmm4, %zmm0, %zmm0");
__asm("vpaddq %zmm0, %zmm1, %zmm1");
__asm("vpaddq %zmm1, %zmm2, %zmm2");
__asm("vpaddq %zmm2, %zmm3, %zmm3");
}
}
/*
 * Finish a checksum computation: fold the eight per-lane running sums
 * held in zmm0-zmm3 into the four checksum words, store them in *zcp via
 * ZIO_SET_CHECKSUM(), and call kfpu_end() (paired with the kfpu_begin()
 * in fletcher_4_avx512f_init()).
 *
 * Lane i of each register has only seen the elements at positions
 * i, i + 8, i + 16, ... of the buffer, so the per-lane sums must be
 * scaled (by 8, 64 and 512 for the 2nd, 3rd and 4th sums) and corrected
 * with lane-dependent multiples of the lower-order sums. The tables hold
 * those correction coefficients for lane i = 0..7:
 *
 *	CcA[i] = i * (i - 1) / 2
 *	CcB[i] = 28 + 8 * i
 *	DcA[i] = i * (i - 1) * (i - 2) / 6
 *	DcB[i] = 4 * i * i + 24 * i + 56
 *	DcC[i] = 448 + 64 * i
 */
static void
fletcher_4_avx512f_fini(zio_cksum_t *zcp)
{
static const uint64_t
CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
zfs_avx512_t a, b, c, b8, c64, d512;
uint64_t A, B, C, D;
uint64_t i;
/* save the unscaled 1st, 2nd and 3rd sums before they are shifted */
__asm("vmovdqu64 %%zmm0, %0":"=m" (a));
__asm("vmovdqu64 %%zmm1, %0":"=m" (b));
__asm("vmovdqu64 %%zmm2, %0":"=m" (c));
/* scale the 2nd, 3rd and 4th sums in place by 8, 64 and 512 */
__asm("vpsllq $3, %zmm1, %zmm1");
__asm("vpsllq $6, %zmm2, %zmm2");
__asm("vpsllq $9, %zmm3, %zmm3");
__asm("vmovdqu64 %%zmm1, %0":"=m" (b8));
__asm("vmovdqu64 %%zmm2, %0":"=m" (c64));
__asm("vmovdqu64 %%zmm3, %0":"=m" (d512));
/* all register contents are in memory now; the rest is plain C */
kfpu_end();
/* lane 0: terms whose coefficient is zero for i == 0 are left out */
A = a.v[0];
B = b8.v[0];
C = c64.v[0] - CcB[0] * b.v[0];
D = d512.v[0] - DcC[0] * c.v[0] + DcB[0] * b.v[0];
/* lanes 1..7: add each lane's scaled sum with its corrections */
for (i = 1; i < 8; i++) {
A += a.v[i];
B += b8.v[i] - i * a.v[i];
C += c64.v[i] - CcB[i] * b.v[i] + CcA[i] * a.v[i];
D += d512.v[i] - DcC[i] * c.v[i] + DcB[i] * b.v[i] -
DcA[i] * a.v[i];
}
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
}
/*
 * Report whether this implementation may be used: returns the result of
 * zfs_avx512f_available().
 */
static boolean_t
fletcher_4_avx512f_valid(void)
{
	boolean_t usable = zfs_avx512f_available();

	return (usable);
}
const fletcher_4_ops_t fletcher_4_avx512f_ops = {
.init = fletcher_4_avx512f_init,
.fini = fletcher_4_avx512f_fini,
.compute = fletcher_4_avx512f,
.compute_byteswap = fletcher_4_avx512f_byteswap,
.valid = fletcher_4_avx512f_valid,
.name = "avx512f"
};
#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */