diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 50c4e8e304ac..e966d32614d2 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -7645,47 +7645,43 @@ let Predicates = [HasVLX] in { } multiclass avx512_cvtps2ph { + X86MemOperand x86memop> { defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, $src1", "$src1, $src2", (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)), 0, 0>, - AVX512AIi8Base, Sched<[sched]>; + AVX512AIi8Base, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayStore = 1 in { def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - Sched<[sched.Folded, ReadAfterLd]>; + Sched<[WriteCvtF2FSt]>; def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", []>, - EVEX_K, Sched<[sched.Folded, ReadAfterLd]>; + EVEX_K, Sched<[WriteCvtF2FSt]>; } } -multiclass avx512_cvtps2ph_sae { +multiclass avx512_cvtps2ph_sae { let hasSideEffects = 0 in defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>, - EVEX_B, AVX512AIi8Base, Sched<[sched]>; + EVEX_B, AVX512AIi8Base, Sched<[WriteCvtF2F]>; } let Predicates = [HasAVX512] in { - defm VCVTPS2PHZ : avx512_cvtps2ph, - avx512_cvtps2ph_sae, EVEX, EVEX_V512, - EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ : avx512_cvtps2ph, + avx512_cvtps2ph_sae, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { - defm VCVTPS2PHZ256 : avx512_cvtps2ph, EVEX, EVEX_V256, - EVEX_CD8<32, CD8VH>; - defm VCVTPS2PHZ128 : avx512_cvtps2ph, EVEX, EVEX_V128, - EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ256 : 
avx512_cvtps2ph, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; } def : Pat<(store (f64 (extractelt diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index aff0cc942044..1ef255dec7df 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -7283,12 +7283,11 @@ multiclass f16c_ps2ph { "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, TAPD, VEX, Sched<[WriteCvtF2F]>; - let hasSideEffects = 0, mayStore = 1, - SchedRW = [WriteCvtF2FLd, WriteRMW] in + let hasSideEffects = 0, mayStore = 1 in def mr : Ii8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TAPD, VEX; + TAPD, VEX, Sched<[WriteCvtF2FSt]>; } let Predicates = [HasF16C, NoVLX] in { diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 191403bd13e9..662ba1898713 100755 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -170,6 +170,12 @@ defm : BWWriteResPair; // Floating point vecto defm : BWWriteResPair; // Floating point vector blends. defm : BWWriteResPair; // Fp vector variable blends. +def : WriteRes { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} + // FMA Scheduling helper class. 
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -806,8 +812,7 @@ def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP16m", "IST_F32m", "IST_FP16m", "IST_FP32m", - "IST_FP64m", - "VCVTPS2PH(Y?)mr")>; + "IST_FP64m")>; def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> { let Latency = 4; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index cd7de24a770a..034f1d1b24b4 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -169,6 +169,12 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; +def : WriteRes { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} + // Vector integer operations. def : WriteRes; def : WriteRes { let Latency = 5; } @@ -1823,13 +1829,6 @@ def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPD(Y?)mr", "VPMASKMOVD(Y?)mr", "VPMASKMOVQ(Y?)mr")>; -def HWWriteResGroup85 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> { - let Latency = 5; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[HWWriteResGroup85], (instregex "VCVTPS2PHmr")>; - def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> { let Latency = 10; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 03b6f87a2ea2..b59d84fc0f1f 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -155,6 +155,7 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; +def : WriteRes { let Latency = 4; } // Vector integer operations. 
def : WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 3e170d538a19..831f614461e6 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -167,6 +167,12 @@ defm : SKLWriteResPair; // Floating point vec defm : SKLWriteResPair; // Floating point vector blends. defm : SKLWriteResPair; // Fp vector variable blends. +def : WriteRes { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} + // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -1212,13 +1218,6 @@ def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156] } def: InstRW<[SKLWriteResGroup80], (instregex "SLDT(16|32|64)r")>; -def SKLWriteResGroup81 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort01]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[SKLWriteResGroup81], (instregex "VCVTPS2PHmr")>; - def SKLWriteResGroup82 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { let Latency = 6; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index e361029f51ab..820b0ca9c105 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -167,6 +167,12 @@ defm : SKXWriteResPair; // Floating point vec defm : SKXWriteResPair; // Floating point vector blends. defm : SKXWriteResPair; // Fp vector variable blends. +def : WriteRes { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} + // FMA Scheduling helper class. 
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -2340,13 +2346,6 @@ def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156] } def: InstRW<[SKXWriteResGroup84], (instregex "SLDT(16|32|64)r")>; -def SKXWriteResGroup85 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[SKXWriteResGroup85], (instregex "VCVTPS2PHmr")>; - def SKXWriteResGroup86 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> { let Latency = 6; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index 4f5c9e2b1b9c..4c869ac50a84 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -131,6 +131,7 @@ def WriteMMXMOVMSK : SchedWrite; defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer. defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float. defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion. +def WriteCvtF2FSt : SchedWrite; // Float -> Float size conversion + store. // CRC32 instruction. defm WriteCRC32 : X86SchedWritePair; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index d4e704fa2c97..baf7463cfe51 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -227,6 +227,7 @@ defm : AtomWriteResPair; // NOTE defm : AtomWriteResPair; // Float -> Integer. defm : AtomWriteResPair; // Integer -> Float. defm : AtomWriteResPair; // Float -> Float size conversion. +def : WriteRes; // NOTE: Doesn't exist on Atom. //////////////////////////////////////////////////////////////////////////////// // Vector integer operations. 
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 0052d2caa14a..8c4d6dad3b87 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -321,6 +321,7 @@ defm : JWriteResFpuPair; // NOTE: Doesn defm : JWriteResFpuPair; // Float -> Integer. defm : JWriteResFpuPair; // Integer -> Float. defm : JWriteResFpuPair; // Float -> Float size conversion. +def : WriteRes { let Latency = 4; } def JWriteCVTF2F : SchedWriteRes<[JFPU1, JSTC]> { let Latency = 7; @@ -491,11 +492,6 @@ def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>; // F16C instructions. //////////////////////////////////////////////////////////////////////////////// -def JWriteCVT3St: SchedWriteRes<[JFPU1, JSTC, JSAGU]> { - let Latency = 4; -} -def : InstRW<[JWriteCVT3St], (instrs VCVTPS2PHmr)>; - def JWriteCVTPS2PHY: SchedWriteRes<[JFPU1, JSTC, JFPX]> { let Latency = 6; let ResourceCycles = [2, 2, 2]; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 3831b5e80707..55ee84fc9f0b 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -145,6 +145,7 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; +def : WriteRes; // Vector integer operations. 
def : WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 765f53834fb7..2de60dec502d 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -211,6 +211,7 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; +def : WriteRes; // Vector integer operations which uses FPU units def : WriteRes; diff --git a/llvm/test/CodeGen/X86/f16c-schedule.ll b/llvm/test/CodeGen/X86/f16c-schedule.ll index d55547ef57fc..0c0f9d4b403b 100644 --- a/llvm/test/CodeGen/X86/f16c-schedule.ll +++ b/llvm/test/CodeGen/X86/f16c-schedule.ll @@ -125,13 +125,13 @@ define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16> ; GENERIC-LABEL: test_vcvtps2ph_128: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] -; GENERIC-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; GENERIC-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; IVY-LABEL: test_vcvtps2ph_128: ; IVY: # %bb.0: ; IVY-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] -; IVY-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; IVY-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00] ; IVY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_vcvtps2ph_128: @@ -175,14 +175,14 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16> ; GENERIC-LABEL: test_vcvtps2ph_256: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] -; GENERIC-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; GENERIC-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; IVY-LABEL: test_vcvtps2ph_256: ; IVY: # %bb.0: ; IVY-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] -; IVY-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; IVY-NEXT: vcvtps2ph $0, %ymm1, 
(%rdi) # sched: [4:1.00] ; IVY-NEXT: vzeroupper # sched: [100:0.33] ; IVY-NEXT: retq # sched: [1:1.00] ; diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-f16c.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-f16c.s index e73e2472895e..0e8a30a1489b 100644 --- a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-f16c.s +++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-f16c.s @@ -27,9 +27,9 @@ vcvtps2ph $0, %ymm0, (%rax) # CHECK-NEXT: 1 3 1.00 vcvtph2ps %xmm0, %ymm2 # CHECK-NEXT: 2 8 1.00 * vcvtph2ps (%rax), %ymm2 # CHECK-NEXT: 1 3 1.00 vcvtps2ph $0, %xmm0, %xmm2 -# CHECK-NEXT: 3 8 1.00 * vcvtps2ph $0, %xmm0, (%rax) +# CHECK-NEXT: 1 4 1.00 * vcvtps2ph $0, %xmm0, (%rax) # CHECK-NEXT: 1 3 1.00 vcvtps2ph $0, %ymm0, %xmm2 -# CHECK-NEXT: 3 8 1.00 * vcvtps2ph $0, %ymm0, (%rax) +# CHECK-NEXT: 1 4 1.00 * vcvtps2ph $0, %ymm0, (%rax) # CHECK: Resources: # CHECK-NEXT: [0] - SBDivider @@ -43,7 +43,7 @@ vcvtps2ph $0, %ymm0, (%rax) # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - - - 8.00 2.00 - 3.00 3.00 +# CHECK-NEXT: - - - 8.00 2.00 - 2.00 2.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -52,7 +52,7 @@ vcvtps2ph $0, %ymm0, (%rax) # CHECK-NEXT: - - - 1.00 - - - - vcvtph2ps %xmm0, %ymm2 # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 vcvtph2ps (%rax), %ymm2 # CHECK-NEXT: - - - 1.00 - - - - vcvtps2ph $0, %xmm0, %xmm2 -# CHECK-NEXT: - - - 1.00 1.00 - 1.00 1.00 vcvtps2ph $0, %xmm0, (%rax) +# CHECK-NEXT: - - - 1.00 1.00 - 0.50 0.50 vcvtps2ph $0, %xmm0, (%rax) # CHECK-NEXT: - - - 1.00 - - - - vcvtps2ph $0, %ymm0, %xmm2 -# CHECK-NEXT: - - - 1.00 1.00 - 1.00 1.00 vcvtps2ph $0, %ymm0, (%rax) +# CHECK-NEXT: - - - 1.00 1.00 - 0.50 0.50 vcvtps2ph $0, %ymm0, (%rax)