From 8a5267178208c4f857afe7b84f45f1d0678282c7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 27 Jul 2019 12:48:46 +0000 Subject: [PATCH] [SelectionDAG] Check for any recursion depth greater than or equal to the limit instead of just equal to the limit. If anything called the recursive isKnownNeverNaN/computeKnownBits/ComputeNumSignBits/SimplifyDemandedBits/SimplifyMultipleUseDemandedBits with an incorrect depth then we could continue to recurse if we'd already exceeded the depth limit. This replaces the limit check (Depth == 6) with a (Depth >= 6) to make sure that we don't circumvent it. This causes a couple of regressions as a mixture of calls (SimplifyMultipleUseDemandedBits + combineX86ShufflesRecursively) were calling with depths that were already over the limit. I've fixed SimplifyMultipleUseDemandedBits to not do this. combineX86ShufflesRecursively is trickier as we get a lot of regressions if we reduce its own limit from 8 to 6 (it also starts at Depth == 1 instead of Depth == 0 like the others....) - I'll see what I can do in future patches. 
llvm-svn: 367171 --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 4 +- llvm/test/CodeGen/AArch64/bitfield-insert.ll | 3 +- llvm/test/CodeGen/AMDGPU/idot8s.ll | 908 +++++++------ llvm/test/CodeGen/AMDGPU/idot8u.ll | 1201 ++++++++++------- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 8 +- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 8 +- .../CodeGen/X86/vector-trunc-packus-widen.ll | 312 +++-- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 312 +++-- 9 files changed, 1543 insertions(+), 1219 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 18955489f3d9..362bc128e5aa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2425,7 +2425,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, return Known; } - if (Depth == 6) + if (Depth >= 6) return Known; // Limit search depth. KnownBits Known2; @@ -3415,7 +3415,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, return Val.getNumSignBits(); } - if (Depth == 6) + if (Depth >= 6) return 1; // Limit search depth. if (!DemandedElts) @@ -3964,7 +3964,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs()) return true; - if (Depth == 6) + if (Depth >= 6) return false; // Limit search depth. // TODO: Handle vectors. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1173beb9ac6e..92706e1f1169 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -569,7 +569,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, SDValue TargetLowering::SimplifyMultipleUseDemandedBits( SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const { - if (Depth == 6) // Limit search depth. + if (Depth >= 6) // Limit search depth. return SDValue(); KnownBits LHSKnown, RHSKnown; @@ -707,7 +707,7 @@ bool TargetLowering::SimplifyDemandedBits( } else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) { // Not demanding any bits/elts from Op. return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); - } else if (Depth == 6) { // Limit search depth. + } else if (Depth >= 6) { // Limit search depth. return false; } diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll index ef20b9e3061d..0753113fc111 100644 --- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll +++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll @@ -265,7 +265,8 @@ define void @test_32bit_opnd1_better(i32* %existing, i32* %new) { define i32 @test_nouseful_bits(i8 %a, i32 %b) { ; CHECK-LABEL: test_nouseful_bits: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w8, lsl #8 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: bfi w8, w8, #8, #24 ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: bfi w9, w8, #8, #24 ; CHECK-NEXT: bfi w0, w9, #8, #24 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index c984b657cd7f..a60bf13fab70 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -331,38 +331,39 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004 +; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008 ; GFX8-NEXT: s_lshr_b32 s1, s2, 12 -; GFX8-NEXT: s_lshr_b32 s6, s4, 12 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s7, s4, 12 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40010 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 +; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40010 -; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s8 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40014 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v11, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v8, v2 -; 
GFX8-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v10, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v11, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -381,38 +382,39 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008 ; GFX9-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-NEXT: s_lshr_b32 s6, s4, 12 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s7, s4, 12 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40010 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 +; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40010 -; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40014 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018 ; GFX9-NEXT: 
s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s7, v4, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v10, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v11, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -431,38 +433,39 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008 ; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40010 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 +; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 ; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018 ; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v4, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v10, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v11, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -493,25 +496,26 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 ; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40010 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40014 +; 
GFX10-DL-NEXT: v_mul_i32_i24_e64 v6, s1, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s13, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40018 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s0, v3 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s1, s8, v3 -; GFX10-DL-NEXT: s_ashr_i32 s1, s4, 28 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s9, s10, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s8, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s13, s14, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -664,20 +668,21 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40000 ; GFX8-NEXT: s_lshr_b32 s5, s1, 12 ; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5 ; GFX8-NEXT: 
s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 ; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014 ; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v8, s13 @@ -689,10 +694,10 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_mov_b32_e32 v10, s17 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 @@ -717,20 +722,21 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000 ; GFX9-NEXT: s_lshr_b32 s5, s1, 12 ; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5 ; 
GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 ; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 ; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v8, s13 @@ -742,10 +748,10 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v10, s17 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 @@ -770,20 +776,21 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 ; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12 ; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 ; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; 
GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5 ; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 ; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 ; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 @@ -795,10 +802,10 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 @@ -835,25 +842,26 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 ; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40014 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s1, s9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 -; 
GFX10-DL-NEXT: s_bfe_i32 s13, s5, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s15, s5, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s6, s7, v3 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s8, s0, v3 -; GFX10-DL-NEXT: s_ashr_i32 s0, s4, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s1, s9, v3 -; GFX10-DL-NEXT: s_ashr_i32 s1, s5, 28 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s13, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s14, s15, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s9, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1601,25 +1609,25 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: s_bfe_i32 s0, s0, 0x4000c ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s0, v1 -; GFX7-NEXT: v_mul_i32_i24_e32 v8, s14, v2 +; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s12, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s2, v8 +; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, s18 ; GFX7-NEXT: v_mov_b32_e32 v6, s17 ; GFX7-NEXT: v_mov_b32_e32 v7, s16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s11, v5, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v6, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v7, v0 @@ -1642,25 +1650,26 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 ; GFX8-NEXT: s_lshr_b32 s0, s2, 4 -; GFX8-NEXT: s_lshr_b32 s1, s4, 4 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 8 +; GFX8-NEXT: s_lshr_b32 s1, s2, 8 +; GFX8-NEXT: s_lshr_b32 s5, s4, 4 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s0 +; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s6 +; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX8-NEXT: s_lshr_b32 s0, s2, 12 ; GFX8-NEXT: s_lshr_b32 s1, s4, 12 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: 
v_ashrrev_i16_e32 v7, 12, v7 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 16 ; GFX8-NEXT: s_lshr_b32 s6, s4, 16 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v7 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s5 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s6 ; GFX8-NEXT: s_lshr_b32 s0, s2, 20 @@ -1674,26 +1683,26 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s6 +; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s6 ; GFX8-NEXT: s_lshr_b32 s0, s2, 28 ; GFX8-NEXT: s_lshr_b32 s1, s4, 28 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s0 +; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s0 ; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s1 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v8, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v13, v14, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v15, v16, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v17, v18, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v15, v17, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v16, v18, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; 
@@ -1755,7 +1764,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1822,7 +1831,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1890,7 +1899,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2019,84 +2028,86 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 4 -; GFX8-NEXT: s_lshr_b32 s1, s2, 12 -; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_lshr_b32 s6, s4, 4 -; GFX8-NEXT: s_lshr_b32 s7, s4, 12 -; GFX8-NEXT: s_lshr_b32 s8, s4, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s0 +; GFX8-NEXT: s_lshr_b32 s8, s1, 4 +; GFX8-NEXT: s_lshr_b32 s9, s1, 12 +; GFX8-NEXT: s_lshr_b32 s10, s1, 8 +; GFX8-NEXT: s_lshr_b32 s15, s2, 4 +; GFX8-NEXT: s_lshr_b32 s16, s2, 12 +; GFX8-NEXT: s_lshr_b32 s17, s2, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s17 +; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX8-NEXT: s_lshr_b32 s4, s1, 20 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s6, s1, 28 +; GFX8-NEXT: s_lshr_b32 s7, s1, 24 +; GFX8-NEXT: s_lshr_b32 s11, s2, 20 +; GFX8-NEXT: s_lshr_b32 
s12, s2, 16 +; GFX8-NEXT: s_lshr_b32 s13, s2, 28 +; GFX8-NEXT: s_lshr_b32 s14, s2, 24 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: s_lshr_b32 s5, s4, 20 -; GFX8-NEXT: s_lshr_b32 s6, s4, 16 -; GFX8-NEXT: s_lshr_b32 s0, s2, 20 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: s_lshr_b32 s1, s2, 16 -; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: s_lshr_b32 s7, s2, 28 -; GFX8-NEXT: s_lshr_b32 s8, s4, 28 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s5 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 -; GFX8-NEXT: s_lshr_b32 s4, s4, 24 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s5 ; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s8 -; GFX8-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s12 +; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s11 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v10, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v5, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 +; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v8, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v4, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s0, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v6, v5, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, 
v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2104,85 +2115,87 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 4 -; GFX9-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-NEXT: s_lshr_b32 s6, s4, 4 -; GFX9-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-NEXT: s_lshr_b32 s8, s4, 8 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s5 +; GFX9-NEXT: s_lshr_b32 s8, s0, 4 +; GFX9-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s0 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX9-NEXT: s_lshr_b32 s9, s0, 12 +; GFX9-NEXT: s_lshr_b32 s10, s0, 8 +; GFX9-NEXT: s_lshr_b32 s16, s1, 12 +; GFX9-NEXT: s_lshr_b32 s17, s1, 8 +; GFX9-NEXT: 
v_lshlrev_b16_e64 v5, 12, s10 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17 +; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s4, s0, 20 +; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: s_lshr_b32 s11, s1, 20 +; GFX9-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 +; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 +; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s11 +; GFX9-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s2 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-NEXT: 
v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-NEXT: v_or_b32_e32 v5, v3, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-NEXT: s_lshr_b32 s0, s2, 20 -; GFX9-NEXT: s_lshr_b32 s5, s4, 20 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: v_mul_lo_u16_e32 v6, v6, v10 -; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 28 -; GFX9-NEXT: s_lshr_b32 s8, s4, 28 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s0 -; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s5 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s2 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s8 -; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 +; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-NEXT: v_mul_lo_u16_sdwa v11, v11, v15 
dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v7, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v12 +; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX9-NEXT: v_or_b32_e32 v5, v4, v8 +; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v5 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; 
GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v6 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2190,211 +2203,216 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 4 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 4 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-DL-NEXT: s_lshr_b32 s8, s4, 8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s5 +; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 4 +; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s0 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s10, s0, 8 +; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s17, s1, 8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; 
GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 20 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s11 +; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s2 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 
v15, 12, s14 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 20 -; GFX9-DL-NEXT: s_lshr_b32 s5, s4, 20 -; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, v6, v10 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v7 -; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 28 -; GFX9-DL-NEXT: s_lshr_b32 s8, s4, 28 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s5 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s2 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s8 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, 
v13 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v11, v11, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v14 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-DL-NEXT: v_or_b32_sdwa v7, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v12 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; GFX9-DL-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX9-DL-NEXT: v_or_b32_e32 v5, v4, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v5 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:BYTE_2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v6 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 4 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 8 -; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 4 -; GFX10-DL-NEXT: s_lshr_b32 s8, s5, 8 -; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4 +; 
GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 +; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s9 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s5 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s16 +; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 +; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s18 +; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v23, 12, s11 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s17 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v19, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v10 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 20 -; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 20 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: s_lshr_b32 s6, s5, 16 -; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 24 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s0 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 24 
-; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 28 -; GFX10-DL-NEXT: v_and_b32_e32 v27, v15, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s6 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v19, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v23, 12, s7 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v22, v7, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v22, v11 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s8 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v14, v23, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v23, v27, v10 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s9 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v16, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v17, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v27, 12, v12 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v31, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v15 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v13 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v19, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v14 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v23, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 
v23, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v19, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v15 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v27, 12, s7 +; GFX10-DL-NEXT: v_and_b32_e32 v14, v31, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v23, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 -; GFX10-DL-NEXT: v_and_b32_e32 v14, v11, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 20 +; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 20 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v27, 12, v14 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v10, v5 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v23, v8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v13, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v14, v15 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s12 +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 28 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v35, 12, s13 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v27, v2 
+; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v15 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s15 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v7, v5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s14 +; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v9, v7 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v11, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v23, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX10-DL-NEXT: v_and_b32_sdwa v11, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v18, v35, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v19, v19, v2 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v6, v13 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v16, v16, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v17, v17, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NEXT: 
v_ashrrev_i16_e64 v35, 12, v18 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v19 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v15, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v31, 12, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v16, 12, v16 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v11, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v17, 12, v17 +; GFX10-DL-NEXT: v_and_b32_e32 v10, v12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v11, v19, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v35, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v9, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v13, v16, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v9, v31, v2 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v10, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v12, v17, v2 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v7, v6 +; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v13 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v9, v12 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v10, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v9 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: 
v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v5 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index e39c3556b943..e80095b4899e 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -314,34 +314,35 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_and_b32 s1, s4, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 ; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; 
GFX8-NEXT: s_lshr_b32 s14, s4, 28 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -361,34 +362,35 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_and_b32 s1, s4, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 ; 
GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -404,26 +406,81 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: 
v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -559,34 +616,35 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_and_b32 s1, s4, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 ; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX8-NEXT: s_lshr_b32 s14, s4, 28 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -606,34 +664,35 @@ define amdgpu_kernel void 
@udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_and_b32 s1, s4, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 ; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; 
GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -649,26 +708,81 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; 
GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; 
GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -806,32 +920,35 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; 
GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -854,32 +971,35 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; 
GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -897,27 +1017,86 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; 
GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 
0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1040,32 +1219,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 
; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1088,32 +1270,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; 
GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1136,32 +1321,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1185,24 +1373,27 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s1 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s0, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 @@ -1812,22 +2003,22 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: v_mov_b32_e32 v3, s19 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mul_u32_u24_e32 v8, s0, v1 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, s0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v3, s12, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v8, v2 -; GFX7-NEXT: v_alignbit_b32 v4, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: 
v_mov_b32_e32 v7, s15 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 @@ -1851,34 +2042,35 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_and_b32 s1, s4, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 ; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX8-NEXT: s_lshr_b32 s14, s4, 28 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1929,7 +2121,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1984,7 +2176,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: 
v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2035,7 +2227,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s1, s5 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 @@ -2157,66 +2349,68 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s0, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_and_b32 s6, s4, 15 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 
v7, s5 -; GFX8-NEXT: v_mov_b32_e32 v8, s0 -; GFX8-NEXT: s_and_b32 s9, s2, 15 +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s16, s2, 15 +; GFX8-NEXT: s_bfe_u32 s17, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40014 +; GFX8-NEXT: s_lshr_b32 s6, s1, 28 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 +; GFX8-NEXT: s_lshr_b32 s13, s2, 28 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40008 +; GFX8-NEXT: s_and_b32 s9, s1, 15 +; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: v_mov_b32_e32 v5, s10 +; GFX8-NEXT: v_mov_b32_e32 v6, s16 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v3, s10, v3 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s9, v6 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s0, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s1, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40010 -; GFX8-NEXT: s_lshr_b32 s7, s2, 28 -; GFX8-NEXT: s_lshr_b32 s8, s4, 28 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: v_mov_b32_e32 v11, s6 +; GFX8-NEXT: v_mov_b32_e32 v12, s12 +; GFX8-NEXT: v_mov_b32_e32 v13, s11 +; GFX8-NEXT: v_mov_b32_e32 v14, s4 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_mul_u32_u24_e32 v7, s7, v9 +; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_u32_u24_e32 v9, s5, v12 +; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v6, s8 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v7, s6, v8 -; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, s2, v4 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v4, s0, v9 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2224,59 +2418,61 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: v_mul_lo_u16_e32 v3, 
s0, v3 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, s9, v5 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s10, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s14, s1, 28 +; GFX9-NEXT: s_and_b32 s15, s1, 15 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: s_lshr_b32 s7, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: s_and_b32 s8, s0, 15 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v10, s1 +; GFX9-NEXT: v_mul_lo_u16_e32 v3, s4, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v5, s6, v5 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v7, s8, v7 +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v7, s0 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-NEXT: s_lshr_b32 s1, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v7, s5, v7 -; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s6, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_mul_lo_u16_e32 v9, s0, v9 -; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v8 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX9-NEXT: v_mul_lo_u16_e32 v9, s10, v9 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-NEXT: v_add_u32_sdwa v2, 
v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2288,59 +2484,61 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s9, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s10, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s1, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 
s13, s1, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s6, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s8, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s0 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 
v7, s5, v7 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s6, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-DL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s0, v9 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v8, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX9-DL-NEXT: v_or_b32_e32 v4, v5, v8 +; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s10, v9 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 @@ -2353,58 +2551,60 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c -; GFX10-DL-NEXT: s_and_b32 s7, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s9, s4, 15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s5 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s7, s9 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s8, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s9 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40014 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, 
s8, s10 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40014 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 28 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 +; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 28 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s6, s8 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, s0, s1 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40014 +; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; 
GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s6, s0 +; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40018 +; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s1, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s7, s9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, s0, s4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX10-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v6 +; GFX10-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v8 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v6 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v14 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -2508,32 +2708,35 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_add_u32_e32 
v2, vcc, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2556,32 +2759,35 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: 
v_mad_u32_u24 v2, s8, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 @@ -2599,27 +2805,86 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; 
GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 
v2, s7, s9, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 2cfed78522a3..de7959d6b5ab 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -348,11 +348,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -363,11 +363,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 -; AVX512VLBW-NEXT: 
vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm4, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -435,6 +435,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero @@ -443,7 +444,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -458,6 +458,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero @@ -466,7 +467,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll 
b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 6ab28a06666d..2a25efd50ff2 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -364,11 +364,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -379,11 +379,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 -; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm4, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -455,6 +455,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero @@ -464,7 +465,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> 
%x, <64 x i8> %amt) nounwind ; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -478,6 +478,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero @@ -487,7 +488,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll index 190c02d071e2..434464b498c1 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll @@ -50,9 +50,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 @@ -60,9 +61,10 @@ define 
<4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 @@ -104,9 +106,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm4, %xmm1 @@ -114,9 +117,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: pand %xmm3, %xmm0 @@ -326,9 +330,10 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSE2-NEXT: pxor %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm2, %xmm3 @@ -336,20 +341,22 @@ define <8 x i32> 
@trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSE2-NEXT: pxor %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 @@ -357,11 +364,12 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-NEXT: retq @@ -427,9 +435,10 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: pand %xmm2, %xmm3 @@ -437,20 +446,22 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm1 ; SSSE3-NEXT: pand %xmm6, %xmm1 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm10, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm0, %xmm3 @@ -458,11 +469,12 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm10, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSSE3-NEXT: retq @@ -649,13 +661,13 @@ define <8 x 
i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm10, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm8, %xmm5 @@ -679,77 +691,81 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, 
%xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por 
%xmm6, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i16: @@ -759,13 +775,13 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pxor %xmm10, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm8, %xmm5 @@ -789,77 +805,81 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd 
{{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: 
pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 ; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i16: diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index 22b8272b8f58..d9ee28aa117f 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -50,9 +50,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 @@ -60,9 +61,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; 
SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 @@ -104,9 +106,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm4, %xmm1 @@ -114,9 +117,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: pand %xmm3, %xmm0 @@ -326,9 +330,10 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSE2-NEXT: pxor %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm2, %xmm3 @@ -336,20 +341,22 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSE2-NEXT: pxor %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: 
pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 @@ -357,11 +364,12 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-NEXT: retq @@ -427,9 +435,10 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: pand %xmm2, %xmm3 @@ -437,20 +446,22 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSSE3-NEXT: pxor 
%xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm1 ; SSSE3-NEXT: pand %xmm6, %xmm1 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm10, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm0, %xmm3 @@ -458,11 +469,12 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm10, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSSE3-NEXT: retq @@ -649,13 +661,13 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm10, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; 
SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm8, %xmm5 @@ -679,77 +691,81 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm3, 
%xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, 
%xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = 
xmm3[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i16: @@ -759,13 +775,13 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pxor %xmm10, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm8, %xmm5 @@ -789,77 +805,81 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; 
SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: 
pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 ; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} 
xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i16: