diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index 83f92a0963..85c2b5a7e8 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -62,12 +62,43 @@ void fletcher_4_init(void); void fletcher_4_fini(void); + +/* Internal fletcher ctx */ + +typedef struct zfs_fletcher_sse { + uint64_t v[2] __attribute__((aligned(16))); +} zfs_fletcher_sse_t; + +typedef struct zfs_fletcher_avx { + uint64_t v[4] __attribute__((aligned(32))); +} zfs_fletcher_avx_t; + +typedef struct zfs_fletcher_avx512 { + uint64_t v[8] __attribute__((aligned(64))); +} zfs_fletcher_avx512_t; + + +typedef union fletcher_4_ctx { + zio_cksum_t scalar; + +#if defined(HAVE_SSE2) || (defined(HAVE_SSE2) && defined(HAVE_SSSE3)) + zfs_fletcher_sse_t sse[4]; +#endif +#if defined(HAVE_AVX) && defined(HAVE_AVX2) + zfs_fletcher_avx_t avx[4]; +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) + zfs_fletcher_avx512_t avx512[4]; +#endif +} fletcher_4_ctx_t; + /* * fletcher checksum struct */ -typedef void (*fletcher_4_init_f)(zio_cksum_t *); -typedef void (*fletcher_4_fini_f)(zio_cksum_t *); -typedef void (*fletcher_4_compute_f)(const void *, uint64_t, zio_cksum_t *); +typedef void (*fletcher_4_init_f)(fletcher_4_ctx_t *); +typedef void (*fletcher_4_fini_f)(fletcher_4_ctx_t *, zio_cksum_t *); +typedef void (*fletcher_4_compute_f)(fletcher_4_ctx_t *, + const void *, uint64_t); typedef struct fletcher_4_func { fletcher_4_init_f init_native; @@ -80,6 +111,7 @@ typedef struct fletcher_4_func { const char *name; } fletcher_4_ops_t; + #if defined(HAVE_SSE2) extern const fletcher_4_ops_t fletcher_4_sse2_ops; #endif diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 3ca70db13e..355384f50b 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -138,17 +138,20 @@ #include -static void fletcher_4_scalar_init(zio_cksum_t *zcp); -static void fletcher_4_scalar_native(const void *buf, uint64_t size, - zio_cksum_t *zcp); -static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp); +static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx); +static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp); +static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size); +static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size); static boolean_t fletcher_4_scalar_valid(void); static const fletcher_4_ops_t fletcher_4_scalar_ops = { .init_native = fletcher_4_scalar_init, + .fini_native = fletcher_4_scalar_fini, .compute_native = fletcher_4_scalar_native, .init_byteswap = fletcher_4_scalar_init, + .fini_byteswap = fletcher_4_scalar_fini, .compute_byteswap = fletcher_4_scalar_byteswap, .valid = fletcher_4_scalar_valid, .name = "scalar" @@ -248,22 +251,29 @@ fletcher_2_byteswap(const void *buf, uint64_t size, } static void -fletcher_4_scalar_init(zio_cksum_t *zcp) +fletcher_4_scalar_init(fletcher_4_ctx_t *ctx) { - ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); + ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0); } static void -fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t)); +} + +static void +fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; + a = ctx->scalar.zc_word[0]; + b = ctx->scalar.zc_word[1]; + c = ctx->scalar.zc_word[2]; + d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += ip[0]; @@ -272,20 +282,21 @@ fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp) d += c; } - ZIO_SET_CHECKSUM(zcp, a, b, c, d); + ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } static void -fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; + a = ctx->scalar.zc_word[0]; + b = ctx->scalar.zc_word[1]; + c = ctx->scalar.zc_word[2]; + d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += BSWAP_32(ip[0]); @@ -294,7 +305,7 @@ fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) d += c; } - ZIO_SET_CHECKSUM(zcp, a, b, c, d); + ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } static boolean_t @@ -384,13 +395,14 @@ fletcher_4_impl_get(void) } static inline void -fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf, - uint64_t size, zio_cksum_t *zcp) +fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { - ops->init_native(zcp); - ops->compute_native(buf, size, zcp); - if (ops->fini_native != NULL) - ops->fini_native(zcp); + fletcher_4_ctx_t ctx; + const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + + ops->init_native(&ctx); + ops->compute_native(&ctx, buf, size); + ops->fini_native(&ctx, zcp); } /*ARGSUSED*/ @@ -398,40 +410,41 @@ void fletcher_4_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { - const fletcher_4_ops_t *ops; - uint64_t p2size = P2ALIGN(size, 64); + const uint64_t p2size = P2ALIGN(size, 64); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); - if (size == 0) { + if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); - } else if (p2size == 0) { - ops = &fletcher_4_scalar_ops; - fletcher_4_native_impl(ops, buf, size, zcp); + + if (size > 0) + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, + buf, size); } else { - ops = fletcher_4_impl_get(); - fletcher_4_native_impl(ops, buf, p2size, zcp); + fletcher_4_native_impl(buf, p2size, zcp); if (p2size < size) - fletcher_4_incremental_native((char *)buf + p2size, - size - p2size, zcp); + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, + (char *)buf + p2size, size - p2size); } } void fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) { - fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp); + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); } static inline void -fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf, - uint64_t size, zio_cksum_t *zcp) +fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { - ops->init_byteswap(zcp); - ops->compute_byteswap(buf, size, zcp); - if (ops->fini_byteswap != NULL) - ops->fini_byteswap(zcp); + fletcher_4_ctx_t ctx; + const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + + ops->init_byteswap(&ctx); + ops->compute_byteswap(&ctx, buf, size); + ops->fini_byteswap(&ctx, zcp); } /*ARGSUSED*/ @@ -439,28 +452,29 @@ void fletcher_4_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { - const fletcher_4_ops_t *ops; - uint64_t p2size = P2ALIGN(size, 64); + const uint64_t p2size = P2ALIGN(size, 64); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); - if (size == 0) { + if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); - } else if (p2size == 0) { - ops = &fletcher_4_scalar_ops; - fletcher_4_byteswap_impl(ops, buf, size, zcp); + + if (size > 0) + fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, + buf, size); } else { - ops = fletcher_4_impl_get(); - fletcher_4_byteswap_impl(ops, buf, p2size, zcp); + fletcher_4_byteswap_impl(buf, p2size, zcp); if (p2size < size) - fletcher_4_incremental_byteswap((char *)buf + p2size, - size - p2size, zcp); + fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, + (char *)buf + p2size, size - p2size); } } /* Incremental Fletcher 4 */ +#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20) + static inline void fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size, const zio_cksum_t *nzcp) @@ -469,6 +483,13 @@ fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size, const uint64_t c2 = c1 * (c1 + 1) / 2; const uint64_t c3 = c2 * (c1 + 2) / 3; + /* + * Value of 'c3' overflows on buffer sizes close to 16MiB. For that + * reason we split incremental fletcher4 computation of large buffers + * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size. + */ + ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE); + zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] + c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0]; zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] + @@ -481,13 +502,9 @@ static inline void fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, zio_cksum_t *zcp) { - static const uint64_t FLETCHER_4_INC_MAX = 8ULL << 20; - uint64_t len; - while (size > 0) { zio_cksum_t nzc; - - len = MIN(size, FLETCHER_4_INC_MAX); + uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE); if (native) fletcher_4_native(buf, len, NULL, &nzc); @@ -504,14 +521,22 @@ fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, void fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp) { - fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); + /* Use scalar impl to directly update cksum of small blocks */ + if (size < SPA_MINBLOCKSIZE) + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); + else + fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); } void fletcher_4_incremental_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) { - fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); + /* Use scalar impl to directly update cksum of small blocks */ + if (size < SPA_MINBLOCKSIZE) + fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); + else + fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); } @@ -662,9 +687,6 @@ fletcher_4_init(void) membar_producer(); fletcher_4_initialized = B_TRUE; - - /* Use 'cycle' math selection method for userspace */ - VERIFY0(fletcher_4_impl_set("cycle")); return; #endif /* Benchmark all supported implementations */ diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c index 22e1f410f7..2d28ffb112 100644 --- a/module/zcommon/zfs_fletcher_avx512.c +++ b/module/zcommon/zfs_fletcher_avx512.c @@ -28,31 +28,73 @@ #include #include #include +#include #define __asm __asm__ __volatile__ -typedef struct { - uint64_t v[8] __attribute__((aligned(64))); -} zfs_avx512_t; - static void -fletcher_4_avx512f_init(zio_cksum_t *zcp) +fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) { - kfpu_begin(); - - /* clear registers */ - __asm("vpxorq %zmm0, %zmm0, %zmm0"); - __asm("vpxorq %zmm1, %zmm1, %zmm1"); - __asm("vpxorq %zmm2, %zmm2, %zmm2"); - __asm("vpxorq %zmm3, %zmm3, %zmm3"); + bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t)); } static void -fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused) +fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + static const uint64_t + CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 }, + CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 }, + DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 }, + DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 }, + DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 }; + + uint64_t A, B, C, D; + uint64_t i; + + A = ctx->avx512[0].v[0]; + B = 8 * ctx->avx512[1].v[0]; + C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0]; + D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] + + DcB[0] * ctx->avx512[1].v[0]; + + for (i = 1; i < 8; i++) { + A += ctx->avx512[0].v[i]; + B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i]; + C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] + + CcA[i] * ctx->avx512[0].v[i]; + D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] + + DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i]; + } + + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \ +{ \ + __asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0])); \ + __asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1])); \ + __asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2])); \ + __asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3])); \ +} + +#define FLETCHER_4_AVX512_SAVE_CTX(ctx) \ +{ \ + __asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0])); \ + __asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1])); \ + __asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2])); \ + __asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3])); \ +} + +static void +fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); + kfpu_begin(); + + FLETCHER_4_AVX512_RESTORE_CTX(ctx); + for (; ip < ipend; ip += 8) { __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip)); __asm("vpaddq %zmm4, %zmm0, %zmm0"); @@ -60,15 +102,24 @@ fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused) __asm("vpaddq %zmm1, %zmm2, %zmm2"); __asm("vpaddq %zmm2, %zmm3, %zmm3"); } + + FLETCHER_4_AVX512_SAVE_CTX(ctx); + + kfpu_end(); } static void -fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) +fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) { static const uint64_t byteswap_mask = 0xFFULL; const uint32_t *ip = buf; const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); + kfpu_begin(); + + FLETCHER_4_AVX512_RESTORE_CTX(ctx); + __asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask)); __asm("vpsllq $8, %zmm8, %zmm9"); __asm("vpsllq $16, %zmm8, %zmm10"); @@ -94,49 +145,10 @@ fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) __asm("vpaddq %zmm1, %zmm2, %zmm2"); __asm("vpaddq %zmm2, %zmm3, %zmm3"); } -} -static void -fletcher_4_avx512f_fini(zio_cksum_t *zcp) -{ - static const uint64_t - CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 }, - CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 }, - DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 }, - DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 }, - DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 }; - - zfs_avx512_t a, b, c, b8, c64, d512; - uint64_t A, B, C, D; - uint64_t i; - - __asm("vmovdqu64 %%zmm0, %0":"=m" (a)); - __asm("vmovdqu64 %%zmm1, %0":"=m" (b)); - __asm("vmovdqu64 %%zmm2, %0":"=m" (c)); - __asm("vpsllq $3, %zmm1, %zmm1"); - __asm("vpsllq $6, %zmm2, %zmm2"); - __asm("vpsllq $9, %zmm3, %zmm3"); - - __asm("vmovdqu64 %%zmm1, %0":"=m" (b8)); - __asm("vmovdqu64 %%zmm2, %0":"=m" (c64)); - __asm("vmovdqu64 %%zmm3, %0":"=m" (d512)); + FLETCHER_4_AVX512_SAVE_CTX(ctx) kfpu_end(); - - A = a.v[0]; - B = b8.v[0]; - C = c64.v[0] - CcB[0] * b.v[0]; - D = d512.v[0] - DcC[0] * c.v[0] + DcB[0] * b.v[0]; - - for (i = 1; i < 8; i++) { - A += a.v[i]; - B += b8.v[i] - i * a.v[i]; - C += c64.v[i] - CcB[i] * b.v[i] + CcA[i] * a.v[i]; - D += d512.v[i] - DcC[i] * c.v[i] + DcB[i] * b.v[i] - - DcA[i] * a.v[i]; - } - - ZIO_SET_CHECKSUM(zcp, A, B, C, D); } static boolean_t diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c index adc4151c51..a479b9d569 100644 --- a/module/zcommon/zfs_fletcher_intel.c +++ b/module/zcommon/zfs_fletcher_intel.c @@ -45,58 +45,69 @@ #include #include #include +#include static void -fletcher_4_avx2_init(zio_cksum_t *zcp) +fletcher_4_avx2_init(fletcher_4_ctx_t *ctx) { - kfpu_begin(); - - /* clear avx2 registers */ - asm volatile("vpxor %ymm0, %ymm0, %ymm0"); - asm volatile("vpxor %ymm1, %ymm1, %ymm1"); - asm volatile("vpxor %ymm2, %ymm2, %ymm2"); - asm volatile("vpxor %ymm3, %ymm3, %ymm3"); + bzero(ctx->avx, 4 * sizeof (zfs_fletcher_avx_t)); } static void -fletcher_4_avx2_fini(zio_cksum_t *zcp) +fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { - uint64_t __attribute__((aligned(32))) a[4]; - uint64_t __attribute__((aligned(32))) b[4]; - uint64_t __attribute__((aligned(32))) c[4]; - uint64_t __attribute__((aligned(32))) d[4]; uint64_t A, B, C, D; - asm volatile("vmovdqu %%ymm0, %0":"=m" (a)); - asm volatile("vmovdqu %%ymm1, %0":"=m" (b)); - asm volatile("vmovdqu %%ymm2, %0":"=m" (c)); - asm volatile("vmovdqu %%ymm3, %0":"=m" (d)); - asm volatile("vzeroupper"); + A = ctx->avx[0].v[0] + ctx->avx[0].v[1] + + ctx->avx[0].v[2] + ctx->avx[0].v[3]; + B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] + + 4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] + + 4 * ctx->avx[1].v[3]; - kfpu_end(); + C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] - + 10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] - + 18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] + + 16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] + + 16 * ctx->avx[2].v[3]; - A = a[0] + a[1] + a[2] + a[3]; - B = 0 - a[1] - 2*a[2] - 3*a[3] - + 4*b[0] + 4*b[1] + 4*b[2] + 4*b[3]; - - C = a[2] + 3*a[3] - - 6*b[0] - 10*b[1] - 14*b[2] - 18*b[3] - + 16*c[0] + 16*c[1] + 16*c[2] + 16*c[3]; - - D = 0 - a[3] - + 4*b[0] + 10*b[1] + 20*b[2] + 34*b[3] - - 48*c[0] - 64*c[1] - 80*c[2] - 96*c[3] - + 64*d[0] + 64*d[1] + 64*d[2] + 64*d[3]; + D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] + + 10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] + + 34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] - + 64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] - + 96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] + + 64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] + + 64 * ctx->avx[3].v[3]; ZIO_SET_CHECKSUM(zcp, A, B, C, D); } +#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \ +{ \ + asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0])); \ + asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1])); \ + asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2])); \ + asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3])); \ +} + +#define FLETCHER_4_AVX2_SAVE_CTX(ctx) \ +{ \ + asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0])); \ + asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1])); \ + asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2])); \ + asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3])); \ +} + + static void -fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused) +fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint64_t *ip = buf; const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); + kfpu_begin(); + + FLETCHER_4_AVX2_RESTORE_CTX(ctx); + for (; ip < ipend; ip += 2) { asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip)); asm volatile("vpaddq %ymm4, %ymm0, %ymm0"); @@ -104,21 +115,28 @@ fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused) asm volatile("vpaddq %ymm1, %ymm2, %ymm2"); asm volatile("vpaddq %ymm2, %ymm3, %ymm3"); } + + FLETCHER_4_AVX2_SAVE_CTX(ctx); + asm volatile("vzeroupper"); + + kfpu_end(); } static void -fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) +fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { - static const struct { - uint64_t v[4] __attribute__((aligned(32))); - } mask = { + static const zfs_fletcher_avx_t mask = { .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B } }; const uint64_t *ip = buf; const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); - asm volatile("vmovdqa %0, %%ymm5"::"m"(mask)); + kfpu_begin(); + + FLETCHER_4_AVX2_RESTORE_CTX(ctx); + + asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask)); for (; ip < ipend; ip += 2) { asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip)); @@ -129,6 +147,11 @@ fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) asm volatile("vpaddq %ymm1, %ymm2, %ymm2"); asm volatile("vpaddq %ymm2, %ymm3, %ymm3"); } + + FLETCHER_4_AVX2_SAVE_CTX(ctx); + asm volatile("vzeroupper"); + + kfpu_end(); } static boolean_t fletcher_4_avx2_valid(void) diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c index 9bc5f7ab6f..ae03f42173 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -45,39 +45,19 @@ #include #include +#include #include - -struct zfs_fletcher_sse_array { - uint64_t v[2] __attribute__((aligned(16))); -}; +#include static void -fletcher_4_sse2_init(zio_cksum_t *zcp) -{ - kfpu_begin(); - - /* clear sse registers */ - asm volatile("pxor %xmm0, %xmm0"); - asm volatile("pxor %xmm1, %xmm1"); - asm volatile("pxor %xmm2, %xmm2"); - asm volatile("pxor %xmm3, %xmm3"); +fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) { + bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t)); } static void -fletcher_4_sse2_fini(zio_cksum_t *zcp) -{ - struct zfs_fletcher_sse_array a, b, c, d; +fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { uint64_t A, B, C, D; - asm volatile("movdqu %%xmm0, %0":"=m" (a.v)); - asm volatile("movdqu %%xmm1, %0":"=m" (b.v)); - asm volatile("psllq $0x2, %xmm2"); - asm volatile("movdqu %%xmm2, %0":"=m" (c.v)); - asm volatile("psllq $0x3, %xmm3"); - asm volatile("movdqu %%xmm3, %0":"=m" (d.v)); - - kfpu_end(); - /* * The mixing matrix for checksum calculation is: * a = a0 + a1 @@ -88,20 +68,42 @@ fletcher_4_sse2_fini(zio_cksum_t *zcp) * c and d are multiplied by 4 and 8, respectively, * before spilling the vectors out to memory. */ - A = a.v[0] + a.v[1]; - B = 2*b.v[0] + 2*b.v[1] - a.v[1]; - C = c.v[0] - b.v[0] + c.v[1] - 3*b.v[1]; - D = d.v[0] - c.v[0] + d.v[1] - 2*c.v[1] + b.v[1]; + A = ctx->sse[0].v[0] + ctx->sse[0].v[1]; + B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1]; + C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] - + 3 * ctx->sse[1].v[1]; + D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] - + 8 * ctx->sse[2].v[1] + ctx->sse[1].v[1]; ZIO_SET_CHECKSUM(zcp, A, B, C, D); } +#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \ +{ \ + asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0])); \ + asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1])); \ + asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2])); \ + asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3])); \ +} + +#define FLETCHER_4_SSE_SAVE_CTX(ctx) \ +{ \ + asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0])); \ + asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1])); \ + asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2])); \ + asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3])); \ +} + static void -fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused) +fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint64_t *ip = buf; const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); + kfpu_begin(); + + FLETCHER_4_SSE_RESTORE_CTX(ctx); + asm volatile("pxor %xmm4, %xmm4"); for (; ip < ipend; ip += 2) { @@ -118,27 +120,37 @@ fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused) asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); } + + FLETCHER_4_SSE_SAVE_CTX(ctx); + + kfpu_end(); } static void -fletcher_4_sse2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) +fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); - for (; ip < ipend; ip += 2) { - uint32_t scratch; + kfpu_begin(); - asm volatile("bswapl %0" : "=r"(scratch) : "0"(*ip)); - asm volatile("movd %0, %%xmm5" :: "r"(scratch)); - asm volatile("bswapl %0" : "=r"(scratch) : "0"(*(ip + 1))); - asm volatile("movd %0, %%xmm6" :: "r"(scratch)); + FLETCHER_4_SSE_RESTORE_CTX(ctx); + + for (; ip < ipend; ip += 2) { + uint32_t scratch1 = BSWAP_32(ip[0]); + uint32_t scratch2 = BSWAP_32(ip[1]); + asm volatile("movd %0, %%xmm5" :: "r"(scratch1)); + asm volatile("movd %0, %%xmm6" :: "r"(scratch2)); asm volatile("punpcklqdq %xmm6, %xmm5"); asm volatile("paddq %xmm5, %xmm0"); asm volatile("paddq %xmm0, %xmm1"); asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); } + + FLETCHER_4_SSE_SAVE_CTX(ctx); + + kfpu_end(); } static boolean_t fletcher_4_sse2_valid(void) @@ -161,15 +173,19 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = { #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) static void -fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) +fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { - static const struct zfs_fletcher_sse_array mask = { + static const zfs_fletcher_sse_t mask = { .v = { 0x0405060700010203, 0x0C0D0E0F08090A0B } }; const uint64_t *ip = buf; const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); + kfpu_begin(); + + FLETCHER_4_SSE_RESTORE_CTX(ctx); + asm volatile("movdqu %0, %%xmm7"::"m" (mask)); asm volatile("pxor %xmm4, %xmm4"); @@ -188,6 +204,10 @@ fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused) asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); } + + FLETCHER_4_SSE_SAVE_CTX(ctx); + + kfpu_end(); } static boolean_t fletcher_4_ssse3_valid(void)