[AVX-512] Add support for execution domain switching masked logical ops between floating point and integer domain.
This switches PS<->D and PD<->Q. llvm-svn: 278097
This commit is contained in:
parent
edc521f192
commit
92a4ff1294
|
@ -7474,6 +7474,108 @@ static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
|
|||
{ X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
|
||||
};
|
||||
|
||||
static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
|
||||
// Two integer columns for 64-bit and 32-bit elements.
|
||||
//PackedSingle PackedDouble
|
||||
//PackedInt PackedInt
|
||||
{ X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
|
||||
X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
|
||||
{ X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
|
||||
X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
|
||||
{ X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
|
||||
X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
|
||||
{ X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
|
||||
X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
|
||||
{ X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
|
||||
X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
|
||||
{ X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
|
||||
X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
|
||||
{ X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
|
||||
X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
|
||||
{ X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
|
||||
X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
|
||||
{ X86::VORPSZ128rmk, X86::VORPDZ128rmk,
|
||||
X86::VPORQZ128rmk, X86::VPORDZ128rmk },
|
||||
{ X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
|
||||
X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
|
||||
{ X86::VORPSZ128rrk, X86::VORPDZ128rrk,
|
||||
X86::VPORQZ128rrk, X86::VPORDZ128rrk },
|
||||
{ X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
|
||||
X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
|
||||
{ X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
|
||||
X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
|
||||
{ X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
|
||||
X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
|
||||
{ X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
|
||||
X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
|
||||
{ X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
|
||||
X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
|
||||
{ X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
|
||||
X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
|
||||
{ X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
|
||||
X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
|
||||
{ X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
|
||||
X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
|
||||
{ X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
|
||||
X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
|
||||
{ X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
|
||||
X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
|
||||
{ X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
|
||||
X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
|
||||
{ X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
|
||||
X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
|
||||
{ X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
|
||||
X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
|
||||
{ X86::VORPSZ256rmk, X86::VORPDZ256rmk,
|
||||
X86::VPORQZ256rmk, X86::VPORDZ256rmk },
|
||||
{ X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
|
||||
X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
|
||||
{ X86::VORPSZ256rrk, X86::VORPDZ256rrk,
|
||||
X86::VPORQZ256rrk, X86::VPORDZ256rrk },
|
||||
{ X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
|
||||
X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
|
||||
{ X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
|
||||
X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
|
||||
{ X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
|
||||
X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
|
||||
{ X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
|
||||
X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
|
||||
{ X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
|
||||
X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
|
||||
{ X86::VANDNPSZrmk, X86::VANDNPDZrmk,
|
||||
X86::VPANDNQZrmk, X86::VPANDNDZrmk },
|
||||
{ X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
|
||||
X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
|
||||
{ X86::VANDNPSZrrk, X86::VANDNPDZrrk,
|
||||
X86::VPANDNQZrrk, X86::VPANDNDZrrk },
|
||||
{ X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
|
||||
X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
|
||||
{ X86::VANDPSZrmk, X86::VANDPDZrmk,
|
||||
X86::VPANDQZrmk, X86::VPANDDZrmk },
|
||||
{ X86::VANDPSZrmkz, X86::VANDPDZrmkz,
|
||||
X86::VPANDQZrmkz, X86::VPANDDZrmkz },
|
||||
{ X86::VANDPSZrrk, X86::VANDPDZrrk,
|
||||
X86::VPANDQZrrk, X86::VPANDDZrrk },
|
||||
{ X86::VANDPSZrrkz, X86::VANDPDZrrkz,
|
||||
X86::VPANDQZrrkz, X86::VPANDDZrrkz },
|
||||
{ X86::VORPSZrmk, X86::VORPDZrmk,
|
||||
X86::VPORQZrmk, X86::VPORDZrmk },
|
||||
{ X86::VORPSZrmkz, X86::VORPDZrmkz,
|
||||
X86::VPORQZrmkz, X86::VPORDZrmkz },
|
||||
{ X86::VORPSZrrk, X86::VORPDZrrk,
|
||||
X86::VPORQZrrk, X86::VPORDZrrk },
|
||||
{ X86::VORPSZrrkz, X86::VORPDZrrkz,
|
||||
X86::VPORQZrrkz, X86::VPORDZrrkz },
|
||||
{ X86::VXORPSZrmk, X86::VXORPDZrmk,
|
||||
X86::VPXORQZrmk, X86::VPXORDZrmk },
|
||||
{ X86::VXORPSZrmkz, X86::VXORPDZrmkz,
|
||||
X86::VPXORQZrmkz, X86::VPXORDZrmkz },
|
||||
{ X86::VXORPSZrrk, X86::VXORPDZrrk,
|
||||
X86::VPXORQZrrk, X86::VPXORDZrrk },
|
||||
{ X86::VXORPSZrrkz, X86::VXORPDZrrkz,
|
||||
X86::VPXORQZrrkz, X86::VPXORDZrrkz },
|
||||
};
|
||||
|
||||
// FIXME: Some shuffle and unpack instructions have equivalents in different
|
||||
// domains, but they require a bit more work than just switching opcodes.
|
||||
|
||||
|
@ -7507,18 +7609,35 @@ static const uint16_t *lookupAVX512DQ(unsigned opcode, unsigned domain) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
static const uint16_t *lookupAVX512DQMasked(unsigned opcode, unsigned domain) {
|
||||
// If this is the integer domain make sure to check both integer columns.
|
||||
for (const uint16_t (&Row)[4] : ReplaceableInstrsAVX512DQMasked)
|
||||
if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
|
||||
return Row;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::pair<uint16_t, uint16_t>
|
||||
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
|
||||
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
|
||||
unsigned opcode = MI.getOpcode();
|
||||
uint16_t validDomains = 0;
|
||||
if (domain && lookup(MI.getOpcode(), domain))
|
||||
validDomains = 0xe;
|
||||
else if (domain && lookupAVX2(MI.getOpcode(), domain))
|
||||
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
|
||||
else if (domain && lookupAVX512(MI.getOpcode(), domain))
|
||||
validDomains = 0xe;
|
||||
else if (domain && lookupAVX512DQ(MI.getOpcode(), domain))
|
||||
validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
|
||||
if (domain) {
|
||||
if (lookup(MI.getOpcode(), domain)) {
|
||||
validDomains = 0xe;
|
||||
} else if (lookupAVX2(opcode, domain)) {
|
||||
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
|
||||
} else if (lookupAVX512(opcode, domain)) {
|
||||
validDomains = 0xe;
|
||||
} else if (lookupAVX512DQ(opcode, domain)) {
|
||||
validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
|
||||
} else if (const uint16_t *table = lookupAVX512DQMasked(opcode, domain)) {
|
||||
if (domain == 1 || (domain == 3 && table[3] == opcode))
|
||||
validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
|
||||
else
|
||||
validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
|
||||
}
|
||||
}
|
||||
return std::make_pair(domain, validDomains);
|
||||
}
|
||||
|
||||
|
@ -7540,10 +7659,17 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
|
|||
Domain = 4;
|
||||
}
|
||||
if (!table) { // try the AVX512DQ table
|
||||
assert((Subtarget.hasDQI() || Domain >=3) && "Requires AVX-512DQ");
|
||||
assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
|
||||
table = lookupAVX512DQ(MI.getOpcode(), dom);
|
||||
// Don't change integer Q instructions to D instructions.
|
||||
if (table && Domain == 3 && table[3] == MI.getOpcode())
|
||||
// Don't change integer Q instructions to D instructions and
|
||||
// use D instructions if we started with a PS instruction.
|
||||
if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
|
||||
Domain = 4;
|
||||
}
|
||||
if (!table) { // try the AVX512DQMasked table
|
||||
assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
|
||||
table = lookupAVX512DQMasked(MI.getOpcode(), dom);
|
||||
if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
|
||||
Domain = 4;
|
||||
}
|
||||
assert(table && "Cannot change domain");
|
||||
|
|
|
@ -278,3 +278,153 @@ define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
|
|||
%res = xor <32 x i16> %a, %b
|
||||
ret <32 x i16> %res
|
||||
}
|
||||
|
||||
; Masked AND of <16 x float> done through i32 bitcasts.  With AVX512DQ (SKX)
; the masked integer VPANDD should be switched to the FP-domain VANDPS; on
; KNL (no DQ) the integer form must stay.
define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
; KNL-LABEL: masked_and_v16f32:
; KNL:       ## BB#0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT:    vaddps %zmm2, %zmm3, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: masked_and_v16f32:
; SKX:       ## BB#0:
; SKX-NEXT:    kmovw %edi, %k1
; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
; SKX-NEXT:    retq
  %a1 = bitcast <16 x float> %a to <16 x i32>
  %b1 = bitcast <16 x float> %b to <16 x i32>
  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %op = and <16 x i32> %a1, %b1
  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
  %cast = bitcast <16 x i32> %select to <16 x float>
  %add = fadd <16 x float> %c, %cast
  ret <16 x float> %add
}
|
||||
|
||||
; Masked OR of <16 x float> done through i32 bitcasts.  Fixed copy-paste bug:
; the body previously computed `and` (and the CHECK lines expected
; vpandd/vandps) even though this function tests OR.
define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
; KNL-LABEL: masked_or_v16f32:
; KNL:       ## BB#0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT:    vaddps %zmm2, %zmm3, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: masked_or_v16f32:
; SKX:       ## BB#0:
; SKX-NEXT:    kmovw %edi, %k1
; SKX-NEXT:    vorps %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
; SKX-NEXT:    retq
  %a1 = bitcast <16 x float> %a to <16 x i32>
  %b1 = bitcast <16 x float> %b to <16 x i32>
  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %op = or <16 x i32> %a1, %b1
  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
  %cast = bitcast <16 x i32> %select to <16 x float>
  %add = fadd <16 x float> %c, %cast
  ret <16 x float> %add
}
|
||||
|
||||
; Masked XOR of <16 x float> done through i32 bitcasts.  Fixed copy-paste bug:
; the body previously computed `and` (and the CHECK lines expected
; vpandd/vandps) even though this function tests XOR.
define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
; KNL-LABEL: masked_xor_v16f32:
; KNL:       ## BB#0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT:    vaddps %zmm2, %zmm3, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: masked_xor_v16f32:
; SKX:       ## BB#0:
; SKX-NEXT:    kmovw %edi, %k1
; SKX-NEXT:    vxorps %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
; SKX-NEXT:    retq
  %a1 = bitcast <16 x float> %a to <16 x i32>
  %b1 = bitcast <16 x float> %b to <16 x i32>
  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %op = xor <16 x i32> %a1, %b1
  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
  %cast = bitcast <16 x i32> %select to <16 x float>
  %add = fadd <16 x float> %c, %cast
  ret <16 x float> %add
}
|
||||
|
||||
; Masked AND of <8 x double> done through i64 bitcasts.  With AVX512DQ (SKX)
; the masked integer VPANDQ should be switched to the FP-domain VANDPD; on
; KNL (no DQ) the integer form must stay.
define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
; KNL-LABEL: masked_and_v8f64:
; KNL:       ## BB#0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: masked_and_v8f64:
; SKX:       ## BB#0:
; SKX-NEXT:    kmovb %edi, %k1
; SKX-NEXT:    vandpd %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
; SKX-NEXT:    retq
  %a1 = bitcast <8 x double> %a to <8 x i64>
  %b1 = bitcast <8 x double> %b to <8 x i64>
  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %op = and <8 x i64> %a1, %b1
  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
  %cast = bitcast <8 x i64> %select to <8 x double>
  %add = fadd <8 x double> %c, %cast
  ret <8 x double> %add
}
|
||||
|
||||
; Masked OR of <8 x double> done through i64 bitcasts.  Fixed copy-paste bug:
; the body previously computed `and` (and the CHECK lines expected
; vpandq/vandpd) even though this function tests OR.
define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
; KNL-LABEL: masked_or_v8f64:
; KNL:       ## BB#0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: masked_or_v8f64:
; SKX:       ## BB#0:
; SKX-NEXT:    kmovb %edi, %k1
; SKX-NEXT:    vorpd %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
; SKX-NEXT:    retq
  %a1 = bitcast <8 x double> %a to <8 x i64>
  %b1 = bitcast <8 x double> %b to <8 x i64>
  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %op = or <8 x i64> %a1, %b1
  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
  %cast = bitcast <8 x i64> %select to <8 x double>
  %add = fadd <8 x double> %c, %cast
  ret <8 x double> %add
}
|
||||
|
||||
; Masked XOR of <8 x double> done through i64 bitcasts.  Fixed copy-paste bug:
; the body previously computed `and` (and the CHECK lines expected
; vpandq/vandpd) even though this function tests XOR.
define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
; KNL-LABEL: masked_xor_v8f64:
; KNL:       ## BB#0:
; KNL-NEXT:    kmovw %edi, %k1
; KNL-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: masked_xor_v8f64:
; SKX:       ## BB#0:
; SKX-NEXT:    kmovb %edi, %k1
; SKX-NEXT:    vxorpd %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
; SKX-NEXT:    retq
  %a1 = bitcast <8 x double> %a to <8 x i64>
  %b1 = bitcast <8 x double> %b to <8 x i64>
  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %op = xor <8 x i64> %a1, %b1
  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
  %cast = bitcast <8 x i64> %select to <8 x double>
  %add = fadd <8 x double> %c, %cast
  ret <8 x double> %add
}
|
||||
|
|
Loading…
Reference in New Issue