AVX512: Implemented VPMADDUBSW and VPMADDWD instructions.

Added tests for intrinsics and encoding.

Differential Revision: http://reviews.llvm.org/D11351

llvm-svn: 242761
This commit is contained in:
Igor Breger 2015-07-21 07:11:28 +00:00
parent b1600f2eee
commit f7fd547e27
10 changed files with 528 additions and 1 deletions

View File

@ -4511,6 +4511,36 @@ let TargetPrefix = "x86" in {
def int_x86_avx512_mask_pavg_w_256 : GCCBuiltin<"__builtin_ia32_pavgw256_mask">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
// Masked VPMADDWD intrinsics for 128-, 256- and 512-bit vectors.
// Operands: two i16 source vectors, an i32 vector of the result type
// (pass-through for masked-off lanes), and an integer write-mask
// (i8 for the 4- and 8-lane results, i16 for the 16-lane result).
def int_x86_avx512_mask_pmaddw_d_128 :
GCCBuiltin<"__builtin_ia32_pmaddwd128_mask">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pmaddw_d_256 :
GCCBuiltin<"__builtin_ia32_pmaddwd256_mask">,
Intrinsic<[llvm_v8i32_ty],
[llvm_v16i16_ty, llvm_v16i16_ty, llvm_v8i32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pmaddw_d_512 :
GCCBuiltin<"__builtin_ia32_pmaddwd512_mask">,
Intrinsic<[llvm_v16i32_ty],
[llvm_v32i16_ty, llvm_v32i16_ty, llvm_v16i32_ty, llvm_i16_ty],
[IntrNoMem]>;
// Masked VPMADDUBSW intrinsics for 128-, 256- and 512-bit vectors.
// Operands: two i8 source vectors, an i16 vector of the result type
// (pass-through for masked-off lanes), and an integer write-mask with one
// bit per i16 result lane (i8, i16 and i32 for 8, 16 and 32 lanes).
def int_x86_avx512_mask_pmaddubs_w_128 :
GCCBuiltin<"__builtin_ia32_pmaddubsw128_mask">,
Intrinsic<[llvm_v8i16_ty],
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v8i16_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pmaddubs_w_256 :
GCCBuiltin<"__builtin_ia32_pmaddubsw256_mask">,
Intrinsic<[llvm_v16i16_ty],
[llvm_v32i8_ty, llvm_v32i8_ty, llvm_v16i16_ty, llvm_i16_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_pmaddubs_w_512 :
GCCBuiltin<"__builtin_ia32_pmaddubsw512_mask">,
Intrinsic<[llvm_v32i16_ty],
[llvm_v64i8_ty, llvm_v64i8_ty, llvm_v32i16_ty, llvm_i32_ty],
[IntrNoMem]>;
}
// Gather and Scatter ops

View File

@ -19008,6 +19008,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";

View File

@ -403,7 +403,8 @@ namespace llvm {
PMULDQ,
// Vector Multiply Packed Unsigned Integers with Round and Scale
MULHRS,
// Multiply and Add Packed Integers
VPMADDUBSW, VPMADDWD,
// FMA nodes
FMADD,
FNMADD,

View File

@ -3229,11 +3229,30 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
v16i8x_info>, EVEX_V128;
}
}
// Instantiates the 512-, 256- and 128-bit EVEX forms of a packed
// multiply-and-add instruction whose source (_Src) and destination (_Dst)
// vectors use different element widths.
//
// Predicates are assigned here, per vector length, instead of by a top-level
// `let` around the instantiating defm: such an outer `let` is applied to the
// final records and would replace the HasVLX requirement of the 128/256-bit
// forms, leaving them selectable on BWI-only targets.
multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
                         SDNode OpNode, AVX512VLVectorVTInfo _Src,
                         AVX512VLVectorVTInfo _Dst> {
  let Predicates = [HasBWI] in
  defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
                                _Dst.info512>, EVEX_V512;
  // The 128- and 256-bit encodings additionally require AVX512VL.
  let Predicates = [HasBWI, HasVLX] in {
    defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
                                     _Dst.info256>, EVEX_V256;
    defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
                                     _Dst.info128>, EVEX_V128;
  }
}

let Predicates = [HasBWI] in {
  defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD;
  defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD;
  defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W;
  defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W;
}

// Deliberately not wrapped in a top-level `let Predicates`: avx512_vpmadd
// sets the predicates for each vector length itself.
defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
                                avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
                              avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax,

View File

@ -288,6 +288,9 @@ def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
// Multiply-and-add packed integer nodes; both reuse the SDTPack type profile.
def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>;
def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;

View File

@ -596,6 +596,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0),
X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),

View File

@ -1038,3 +1038,36 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
; Masked VPMADDUBSW on 512-bit vectors. The intrinsic is called once with the
; variable mask %x3 and once with an all-ones mask (-1); the two results are
; added so that both the masked and the unmasked instruction are checked.
declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
; Masked VPMADDWD on 512-bit vectors. The intrinsic is called once with the
; variable mask %x3 and once with an all-ones mask (-1); the two results are
; added so that both the masked and the unmasked instruction are checked.
declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
%res2 = add <16 x i32> %res, %res1
ret <16 x i32> %res2
}

View File

@ -3843,3 +3843,70 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
; Masked VPMADDWD on 128-bit vectors. The intrinsic is called once with the
; variable i8 mask %x3 and once with an all-ones mask (-1); the two results
; are added so that both the masked and the unmasked instruction are checked.
declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
ret <4 x i32> %res2
}
; Masked VPMADDWD on 256-bit vectors. The intrinsic is called once with the
; variable i8 mask %x3 and once with an all-ones mask (-1); the two results
; are added so that both the masked and the unmasked instruction are checked.
declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}
; Masked VPMADDUBSW on 128-bit vectors. The intrinsic is called once with the
; variable i8 mask %x3 and once with an all-ones mask (-1); the two results
; are added so that both the masked and the unmasked instruction are checked.
declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
; Masked VPMADDUBSW on 256-bit vectors. The intrinsic is called once with the
; variable i16 mask %x3 and once with an all-ones mask (-1); the two results
; are added so that both the masked and the unmasked instruction are checked.
declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}

View File

@ -3776,3 +3776,75 @@
// CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0xaa,0xc0,0xdf,0xff,0xff]
vpmulhrsw -8256(%rdx), %zmm27, %zmm21
// vpmaddubsw, 512-bit (zmm) forms: plain, merge-masked and zero-masked
// register operands, then memory operands whose displacements fall on both
// sides of the one-byte displacement encoding limit.
// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27
// CHECK: encoding: [0x62,0x02,0x5d,0x40,0x04,0xd9]
vpmaddubsw %zmm25, %zmm20, %zmm27
// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3}
// CHECK: encoding: [0x62,0x02,0x5d,0x43,0x04,0xd9]
vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3}
// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} {z}
// CHECK: encoding: [0x62,0x02,0x5d,0xc3,0x04,0xd9]
vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} {z}
// CHECK: vpmaddubsw (%rcx), %zmm20, %zmm27
// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x19]
vpmaddubsw (%rcx), %zmm20, %zmm27
// CHECK: vpmaddubsw 291(%rax,%r14,8), %zmm20, %zmm27
// CHECK: encoding: [0x62,0x22,0x5d,0x40,0x04,0x9c,0xf0,0x23,0x01,0x00,0x00]
vpmaddubsw 291(%rax,%r14,8), %zmm20, %zmm27
// CHECK: vpmaddubsw 8128(%rdx), %zmm20, %zmm27
// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x5a,0x7f]
vpmaddubsw 8128(%rdx), %zmm20, %zmm27
// CHECK: vpmaddubsw 8192(%rdx), %zmm20, %zmm27
// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x9a,0x00,0x20,0x00,0x00]
vpmaddubsw 8192(%rdx), %zmm20, %zmm27
// CHECK: vpmaddubsw -8192(%rdx), %zmm20, %zmm27
// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x5a,0x80]
vpmaddubsw -8192(%rdx), %zmm20, %zmm27
// CHECK: vpmaddubsw -8256(%rdx), %zmm20, %zmm27
// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x9a,0xc0,0xdf,0xff,0xff]
vpmaddubsw -8256(%rdx), %zmm20, %zmm27
// vpmaddwd, 512-bit (zmm) forms: plain, merge-masked and zero-masked
// register operands, then memory operands whose displacements fall on both
// sides of the one-byte displacement encoding limit.
// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26
// CHECK: encoding: [0x62,0x01,0x4d,0x40,0xf5,0xd1]
vpmaddwd %zmm25, %zmm22, %zmm26
// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 {%k2}
// CHECK: encoding: [0x62,0x01,0x4d,0x42,0xf5,0xd1]
vpmaddwd %zmm25, %zmm22, %zmm26 {%k2}
// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} {z}
// CHECK: encoding: [0x62,0x01,0x4d,0xc2,0xf5,0xd1]
vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} {z}
// CHECK: vpmaddwd (%rcx), %zmm22, %zmm26
// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x11]
vpmaddwd (%rcx), %zmm22, %zmm26
// CHECK: vpmaddwd 291(%rax,%r14,8), %zmm22, %zmm26
// CHECK: encoding: [0x62,0x21,0x4d,0x40,0xf5,0x94,0xf0,0x23,0x01,0x00,0x00]
vpmaddwd 291(%rax,%r14,8), %zmm22, %zmm26
// CHECK: vpmaddwd 8128(%rdx), %zmm22, %zmm26
// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x52,0x7f]
vpmaddwd 8128(%rdx), %zmm22, %zmm26
// CHECK: vpmaddwd 8192(%rdx), %zmm22, %zmm26
// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x92,0x00,0x20,0x00,0x00]
vpmaddwd 8192(%rdx), %zmm22, %zmm26
// CHECK: vpmaddwd -8192(%rdx), %zmm22, %zmm26
// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x52,0x80]
vpmaddwd -8192(%rdx), %zmm22, %zmm26
// CHECK: vpmaddwd -8256(%rdx), %zmm22, %zmm26
// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x92,0xc0,0xdf,0xff,0xff]
vpmaddwd -8256(%rdx), %zmm22, %zmm26

View File

@ -6799,3 +6799,291 @@
// CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0xa2,0xe0,0xef,0xff,0xff]
vpmulhrsw -4128(%rdx), %ymm20, %ymm28
// vpmaddubsw, 128-bit (xmm) forms: plain, merge-masked and zero-masked
// register operands, then memory operands whose displacements fall on both
// sides of the one-byte displacement encoding limit.
// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28
// CHECK: encoding: [0x62,0x22,0x55,0x00,0x04,0xe4]
vpmaddubsw %xmm20, %xmm21, %xmm28
// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6}
// CHECK: encoding: [0x62,0x22,0x55,0x06,0x04,0xe4]
vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6}
// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} {z}
// CHECK: encoding: [0x62,0x22,0x55,0x86,0x04,0xe4]
vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} {z}
// CHECK: vpmaddubsw (%rcx), %xmm21, %xmm28
// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x21]
vpmaddubsw (%rcx), %xmm21, %xmm28
// CHECK: vpmaddubsw 291(%rax,%r14,8), %xmm21, %xmm28
// CHECK: encoding: [0x62,0x22,0x55,0x00,0x04,0xa4,0xf0,0x23,0x01,0x00,0x00]
vpmaddubsw 291(%rax,%r14,8), %xmm21, %xmm28
// CHECK: vpmaddubsw 2032(%rdx), %xmm21, %xmm28
// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x62,0x7f]
vpmaddubsw 2032(%rdx), %xmm21, %xmm28
// CHECK: vpmaddubsw 2048(%rdx), %xmm21, %xmm28
// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0xa2,0x00,0x08,0x00,0x00]
vpmaddubsw 2048(%rdx), %xmm21, %xmm28
// CHECK: vpmaddubsw -2048(%rdx), %xmm21, %xmm28
// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x62,0x80]
vpmaddubsw -2048(%rdx), %xmm21, %xmm28
// CHECK: vpmaddubsw -2064(%rdx), %xmm21, %xmm28
// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0xa2,0xf0,0xf7,0xff,0xff]
vpmaddubsw -2064(%rdx), %xmm21, %xmm28
// vpmaddubsw, 256-bit (ymm) forms: plain, merge-masked and zero-masked
// register operands, then memory operands whose displacements fall on both
// sides of the one-byte displacement encoding limit.
// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30
// CHECK: encoding: [0x62,0x02,0x2d,0x20,0x04,0xf2]
vpmaddubsw %ymm26, %ymm26, %ymm30
// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5}
// CHECK: encoding: [0x62,0x02,0x2d,0x25,0x04,0xf2]
vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5}
// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} {z}
// CHECK: encoding: [0x62,0x02,0x2d,0xa5,0x04,0xf2]
vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} {z}
// CHECK: vpmaddubsw (%rcx), %ymm26, %ymm30
// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x31]
vpmaddubsw (%rcx), %ymm26, %ymm30
// CHECK: vpmaddubsw 291(%rax,%r14,8), %ymm26, %ymm30
// CHECK: encoding: [0x62,0x22,0x2d,0x20,0x04,0xb4,0xf0,0x23,0x01,0x00,0x00]
vpmaddubsw 291(%rax,%r14,8), %ymm26, %ymm30
// CHECK: vpmaddubsw 4064(%rdx), %ymm26, %ymm30
// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x72,0x7f]
vpmaddubsw 4064(%rdx), %ymm26, %ymm30
// CHECK: vpmaddubsw 4096(%rdx), %ymm26, %ymm30
// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0xb2,0x00,0x10,0x00,0x00]
vpmaddubsw 4096(%rdx), %ymm26, %ymm30
// CHECK: vpmaddubsw -4096(%rdx), %ymm26, %ymm30
// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x72,0x80]
vpmaddubsw -4096(%rdx), %ymm26, %ymm30
// CHECK: vpmaddubsw -4128(%rdx), %ymm26, %ymm30
// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0xb2,0xe0,0xef,0xff,0xff]
vpmaddubsw -4128(%rdx), %ymm26, %ymm30
// vpmaddwd, 128-bit (xmm) forms: plain, merge-masked and zero-masked
// register operands, then memory operands whose displacements fall on both
// sides of the one-byte displacement encoding limit.
// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17
// CHECK: encoding: [0x62,0x81,0x3d,0x00,0xf5,0xcc]
vpmaddwd %xmm28, %xmm24, %xmm17
// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 {%k1}
// CHECK: encoding: [0x62,0x81,0x3d,0x01,0xf5,0xcc]
vpmaddwd %xmm28, %xmm24, %xmm17 {%k1}
// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} {z}
// CHECK: encoding: [0x62,0x81,0x3d,0x81,0xf5,0xcc]
vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} {z}
// CHECK: vpmaddwd (%rcx), %xmm24, %xmm17
// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x09]
vpmaddwd (%rcx), %xmm24, %xmm17
// CHECK: vpmaddwd 291(%rax,%r14,8), %xmm24, %xmm17
// CHECK: encoding: [0x62,0xa1,0x3d,0x00,0xf5,0x8c,0xf0,0x23,0x01,0x00,0x00]
vpmaddwd 291(%rax,%r14,8), %xmm24, %xmm17
// CHECK: vpmaddwd 2032(%rdx), %xmm24, %xmm17
// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x4a,0x7f]
vpmaddwd 2032(%rdx), %xmm24, %xmm17
// CHECK: vpmaddwd 2048(%rdx), %xmm24, %xmm17
// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x8a,0x00,0x08,0x00,0x00]
vpmaddwd 2048(%rdx), %xmm24, %xmm17
// CHECK: vpmaddwd -2048(%rdx), %xmm24, %xmm17
// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x4a,0x80]
vpmaddwd -2048(%rdx), %xmm24, %xmm17
// CHECK: vpmaddwd -2064(%rdx), %xmm24, %xmm17
// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x8a,0xf0,0xf7,0xff,0xff]
vpmaddwd -2064(%rdx), %xmm24, %xmm17
// vpmaddwd, 256-bit (ymm) forms: plain, merge-masked and zero-masked
// register operands, then memory operands whose displacements fall on both
// sides of the one-byte displacement encoding limit.
// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24
// CHECK: encoding: [0x62,0x21,0x45,0x20,0xf5,0xc3]
vpmaddwd %ymm19, %ymm23, %ymm24
// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 {%k4}
// CHECK: encoding: [0x62,0x21,0x45,0x24,0xf5,0xc3]
vpmaddwd %ymm19, %ymm23, %ymm24 {%k4}
// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} {z}
// CHECK: encoding: [0x62,0x21,0x45,0xa4,0xf5,0xc3]
vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} {z}
// CHECK: vpmaddwd (%rcx), %ymm23, %ymm24
// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x01]
vpmaddwd (%rcx), %ymm23, %ymm24
// CHECK: vpmaddwd 291(%rax,%r14,8), %ymm23, %ymm24
// CHECK: encoding: [0x62,0x21,0x45,0x20,0xf5,0x84,0xf0,0x23,0x01,0x00,0x00]
vpmaddwd 291(%rax,%r14,8), %ymm23, %ymm24
// CHECK: vpmaddwd 4064(%rdx), %ymm23, %ymm24
// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x42,0x7f]
vpmaddwd 4064(%rdx), %ymm23, %ymm24
// CHECK: vpmaddwd 4096(%rdx), %ymm23, %ymm24
// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x82,0x00,0x10,0x00,0x00]
vpmaddwd 4096(%rdx), %ymm23, %ymm24
// CHECK: vpmaddwd -4096(%rdx), %ymm23, %ymm24
// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x42,0x80]
vpmaddwd -4096(%rdx), %ymm23, %ymm24
// CHECK: vpmaddwd -4128(%rdx), %ymm23, %ymm24
// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x82,0xe0,0xef,0xff,0xff]
vpmaddwd -4128(%rdx), %ymm23, %ymm24
// vpmaddubsw, 128-bit (xmm) forms, second register set: plain, merge-masked
// and zero-masked register operands, then memory operands whose displacements
// fall on both sides of the one-byte displacement encoding limit.
// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19
// CHECK: encoding: [0x62,0x82,0x45,0x00,0x04,0xd9]
vpmaddubsw %xmm25, %xmm23, %xmm19
// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2}
// CHECK: encoding: [0x62,0x82,0x45,0x02,0x04,0xd9]
vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2}
// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} {z}
// CHECK: encoding: [0x62,0x82,0x45,0x82,0x04,0xd9]
vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} {z}
// CHECK: vpmaddubsw (%rcx), %xmm23, %xmm19
// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x19]
vpmaddubsw (%rcx), %xmm23, %xmm19
// CHECK: vpmaddubsw 4660(%rax,%r14,8), %xmm23, %xmm19
// CHECK: encoding: [0x62,0xa2,0x45,0x00,0x04,0x9c,0xf0,0x34,0x12,0x00,0x00]
vpmaddubsw 4660(%rax,%r14,8), %xmm23, %xmm19
// CHECK: vpmaddubsw 2032(%rdx), %xmm23, %xmm19
// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x5a,0x7f]
vpmaddubsw 2032(%rdx), %xmm23, %xmm19
// CHECK: vpmaddubsw 2048(%rdx), %xmm23, %xmm19
// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x9a,0x00,0x08,0x00,0x00]
vpmaddubsw 2048(%rdx), %xmm23, %xmm19
// CHECK: vpmaddubsw -2048(%rdx), %xmm23, %xmm19
// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x5a,0x80]
vpmaddubsw -2048(%rdx), %xmm23, %xmm19
// CHECK: vpmaddubsw -2064(%rdx), %xmm23, %xmm19
// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x9a,0xf0,0xf7,0xff,0xff]
vpmaddubsw -2064(%rdx), %xmm23, %xmm19
// vpmaddubsw, 256-bit (ymm) forms, second register set: plain, merge-masked
// and zero-masked register operands, then memory operands whose displacements
// fall on both sides of the one-byte displacement encoding limit.
// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17
// CHECK: encoding: [0x62,0xa2,0x65,0x20,0x04,0xce]
vpmaddubsw %ymm22, %ymm19, %ymm17
// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7}
// CHECK: encoding: [0x62,0xa2,0x65,0x27,0x04,0xce]
vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7}
// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} {z}
// CHECK: encoding: [0x62,0xa2,0x65,0xa7,0x04,0xce]
vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} {z}
// CHECK: vpmaddubsw (%rcx), %ymm19, %ymm17
// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x09]
vpmaddubsw (%rcx), %ymm19, %ymm17
// CHECK: vpmaddubsw 4660(%rax,%r14,8), %ymm19, %ymm17
// CHECK: encoding: [0x62,0xa2,0x65,0x20,0x04,0x8c,0xf0,0x34,0x12,0x00,0x00]
vpmaddubsw 4660(%rax,%r14,8), %ymm19, %ymm17
// CHECK: vpmaddubsw 4064(%rdx), %ymm19, %ymm17
// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x4a,0x7f]
vpmaddubsw 4064(%rdx), %ymm19, %ymm17
// CHECK: vpmaddubsw 4096(%rdx), %ymm19, %ymm17
// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x8a,0x00,0x10,0x00,0x00]
vpmaddubsw 4096(%rdx), %ymm19, %ymm17
// CHECK: vpmaddubsw -4096(%rdx), %ymm19, %ymm17
// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x4a,0x80]
vpmaddubsw -4096(%rdx), %ymm19, %ymm17
// CHECK: vpmaddubsw -4128(%rdx), %ymm19, %ymm17
// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x8a,0xe0,0xef,0xff,0xff]
vpmaddubsw -4128(%rdx), %ymm19, %ymm17
// vpmaddwd, 128-bit (xmm) forms, second register set: plain, merge-masked
// and zero-masked register operands, then memory operands whose displacements
// fall on both sides of the one-byte displacement encoding limit.
// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23
// CHECK: encoding: [0x62,0xa1,0x4d,0x00,0xf5,0xfc]
vpmaddwd %xmm20, %xmm22, %xmm23
// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 {%k3}
// CHECK: encoding: [0x62,0xa1,0x4d,0x03,0xf5,0xfc]
vpmaddwd %xmm20, %xmm22, %xmm23 {%k3}
// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} {z}
// CHECK: encoding: [0x62,0xa1,0x4d,0x83,0xf5,0xfc]
vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} {z}
// CHECK: vpmaddwd (%rcx), %xmm22, %xmm23
// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x39]
vpmaddwd (%rcx), %xmm22, %xmm23
// CHECK: vpmaddwd 4660(%rax,%r14,8), %xmm22, %xmm23
// CHECK: encoding: [0x62,0xa1,0x4d,0x00,0xf5,0xbc,0xf0,0x34,0x12,0x00,0x00]
vpmaddwd 4660(%rax,%r14,8), %xmm22, %xmm23
// CHECK: vpmaddwd 2032(%rdx), %xmm22, %xmm23
// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x7a,0x7f]
vpmaddwd 2032(%rdx), %xmm22, %xmm23
// CHECK: vpmaddwd 2048(%rdx), %xmm22, %xmm23
// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0xba,0x00,0x08,0x00,0x00]
vpmaddwd 2048(%rdx), %xmm22, %xmm23
// CHECK: vpmaddwd -2048(%rdx), %xmm22, %xmm23
// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x7a,0x80]
vpmaddwd -2048(%rdx), %xmm22, %xmm23
// CHECK: vpmaddwd -2064(%rdx), %xmm22, %xmm23
// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0xba,0xf0,0xf7,0xff,0xff]
vpmaddwd -2064(%rdx), %xmm22, %xmm23
// vpmaddwd, 256-bit (ymm) forms, second register set: plain, merge-masked
// and zero-masked register operands, then memory operands whose displacements
// fall on both sides of the one-byte displacement encoding limit.
// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19
// CHECK: encoding: [0x62,0xa1,0x5d,0x20,0xf5,0xd9]
vpmaddwd %ymm17, %ymm20, %ymm19
// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 {%k2}
// CHECK: encoding: [0x62,0xa1,0x5d,0x22,0xf5,0xd9]
vpmaddwd %ymm17, %ymm20, %ymm19 {%k2}
// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} {z}
// CHECK: encoding: [0x62,0xa1,0x5d,0xa2,0xf5,0xd9]
vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} {z}
// CHECK: vpmaddwd (%rcx), %ymm20, %ymm19
// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x19]
vpmaddwd (%rcx), %ymm20, %ymm19
// CHECK: vpmaddwd 4660(%rax,%r14,8), %ymm20, %ymm19
// CHECK: encoding: [0x62,0xa1,0x5d,0x20,0xf5,0x9c,0xf0,0x34,0x12,0x00,0x00]
vpmaddwd 4660(%rax,%r14,8), %ymm20, %ymm19
// CHECK: vpmaddwd 4064(%rdx), %ymm20, %ymm19
// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x5a,0x7f]
vpmaddwd 4064(%rdx), %ymm20, %ymm19
// CHECK: vpmaddwd 4096(%rdx), %ymm20, %ymm19
// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x9a,0x00,0x10,0x00,0x00]
vpmaddwd 4096(%rdx), %ymm20, %ymm19
// CHECK: vpmaddwd -4096(%rdx), %ymm20, %ymm19
// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x5a,0x80]
vpmaddwd -4096(%rdx), %ymm20, %ymm19
// CHECK: vpmaddwd -4128(%rdx), %ymm20, %ymm19
// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x9a,0xe0,0xef,0xff,0xff]
vpmaddwd -4128(%rdx), %ymm20, %ymm19