Optimized load + SIGN_EXTEND patterns in the X86 backend.

llvm-svn: 170506
This commit is contained in:
Elena Demikhovsky 2012-12-19 07:50:20 +00:00
parent 33360d8ae9
commit 14a4af0e66
6 changed files with 202 additions and 7 deletions

View File

@ -5235,6 +5235,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
LN0->getAlignment());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
AddToWorkList(ExtLoad.getNode());
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
// fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use

View File

@ -15929,10 +15929,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
// If this is a vector EXT Load then attempt to optimize it using a
// shuffle. We need SSSE3 shuffles.
// SEXT loads are suppoted starting SSE41.
// We generate X86ISD::VSEXT for them.
// TODO: It is possible to support ZExt by zeroing the undef values
// during the shuffle phase or after the shuffle.
if (RegVT.isVector() && RegVT.isInteger() &&
Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
(Ext == ISD::EXTLOAD && Subtarget->hasSSSE3() ||
Ext == ISD::SEXTLOAD && Subtarget->hasSSE41())){
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
@ -15941,6 +15944,9 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
return SDValue();
// All sizes must be a power of two.
if (!isPowerOf2_32(RegSz * MemSz * NumElems))
return SDValue();
@ -15964,16 +15970,23 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
// Calculate the number of scalar loads that we need to perform
// in order to load our vector from memory.
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
if (Ext == ISD::SEXTLOAD && NumLoads > 1)
return SDValue();
unsigned loadRegZize = RegSz;
if (Ext == ISD::SEXTLOAD && RegSz == 256)
loadRegZize /= 2;
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
RegSz/SclrLoadTy.getSizeInBits());
loadRegZize/SclrLoadTy.getSizeInBits());
// Represent the data using the same element type that is stored in
// memory. In practice, we ''widen'' MemVT.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
RegSz/MemVT.getScalarType().getSizeInBits());
EVT WideVecVT =
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
loadRegZize/MemVT.getScalarType().getSizeInBits());
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
"Invalid vector type");
@ -16014,6 +16027,10 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
unsigned SizeRatio = RegSz/MemSz;
if (Ext == ISD::SEXTLOAD) {
SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
return DCI.CombineTo(N, Sext, TF, true);
}
// Redistribute the loaded elements into the different locations.
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)

View File

@ -5842,6 +5842,31 @@ defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
(VPMOVSXWDYrm addr:$src)>;
def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
(VPMOVSXDQYrm addr:$src)>;
def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(VPMOVSXBDYrm addr:$src)>;
def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(VPMOVSXBDYrm addr:$src)>;
def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(VPMOVSXWQYrm addr:$src)>;
def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(VPMOVSXWQYrm addr:$src)>;
def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32
(scalar_to_vector (loadi32 addr:$src))))))),
(VPMOVSXBQYrm addr:$src)>;
}
let Predicates = [HasAVX] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
@ -5866,6 +5891,34 @@ let Predicates = [UseSSE41] in {
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
(PMOVZXBQrm addr:$src)>;
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(PMOVSXWDrm addr:$src)>;
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(PMOVSXWDrm addr:$src)>;
def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
(scalar_to_vector (loadi32 addr:$src))))))),
(PMOVSXBDrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
(scalar_to_vector (loadi32 addr:$src))))))),
(PMOVSXWQrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
(scalar_to_vector (extloadi32i16 addr:$src))))))),
(PMOVSXBQrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(PMOVSXDQrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(PMOVSXDQrm addr:$src)>;
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(PMOVSXBWrm addr:$src)>;
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(PMOVSXBWrm addr:$src)>;
}
let Predicates = [HasAVX2] in {
@ -5926,6 +5979,35 @@ let Predicates = [HasAVX] in {
(VPMOVZXDQrm addr:$src)>;
def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
(VPMOVZXDQrm addr:$src)>;
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(VPMOVSXWDrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(VPMOVSXDQrm addr:$src)>;
def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(VPMOVSXWDrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(VPMOVSXDQrm addr:$src)>;
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
(scalar_to_vector (loadi64 addr:$src))))))),
(VPMOVSXBWrm addr:$src)>;
def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
(scalar_to_vector (loadf64 addr:$src))))))),
(VPMOVSXBWrm addr:$src)>;
def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
(scalar_to_vector (loadi32 addr:$src))))))),
(VPMOVSXBDrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
(scalar_to_vector (loadi32 addr:$src))))))),
(VPMOVSXWQrm addr:$src)>;
def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
(scalar_to_vector (extloadi32i16 addr:$src))))))),
(VPMOVSXBQrm addr:$src)>;
}
let Predicates = [UseSSE41] in {

View File

@ -16,8 +16,8 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: main
define i32 @main() nounwind uwtable {
entry:
; CHECK: movsbq j(%rip), %
; CHECK: movsbq i(%rip), %
; CHECK: pmovsxbq j(%rip), %
; CHECK: pmovsxbq i(%rip), %
%0 = load <2 x i8>* @i, align 8
%1 = load <2 x i8>* @j, align 8
%div = sdiv <2 x i8> %1, %0

View File

@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
;CHECK: sext_8i16_to_8i32
@ -15,3 +15,57 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
%B = sext <4 x i32> %A to <4 x i64>
ret <4 x i64>%B
}
; CHECK: load_sext_test1
; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
; CHECK: ret
define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
%X = load <4 x i16>* %ptr
%Y = sext <4 x i16> %X to <4 x i32>
ret <4 x i32>%Y
}
; CHECK: load_sext_test2
; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
; CHECK: ret
define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
%X = load <4 x i8>* %ptr
%Y = sext <4 x i8> %X to <4 x i32>
ret <4 x i32>%Y
}
; CHECK: load_sext_test3
; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
; CHECK: ret
define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
%X = load <2 x i8>* %ptr
%Y = sext <2 x i8> %X to <2 x i64>
ret <2 x i64>%Y
}
; CHECK: load_sext_test4
; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
; CHECK: ret
define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
%X = load <2 x i16>* %ptr
%Y = sext <2 x i16> %X to <2 x i64>
ret <2 x i64>%Y
}
; CHECK: load_sext_test5
; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
; CHECK: ret
define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
%X = load <2 x i32>* %ptr
%Y = sext <2 x i32> %X to <2 x i64>
ret <2 x i64>%Y
}
; CHECK: load_sext_test6
; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
; CHECK: ret
define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
%X = load <8 x i8>* %ptr
%Y = sext <8 x i8> %X to <8 x i16>
ret <8 x i16>%Y
}

View File

@ -63,6 +63,47 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
ret <8 x i32>%B
}
; CHECK: load_sext_test1
; CHECK: vpmovsxdq (%r{{[^,]*}}), %ymm{{.*}}
; CHECK: ret
define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
%X = load <4 x i32>* %ptr
%Y = sext <4 x i32> %X to <4 x i64>
ret <4 x i64>%Y
}
; CHECK: load_sext_test2
; CHECK: vpmovsxbq (%r{{[^,]*}}), %ymm{{.*}}
; CHECK: ret
define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
%X = load <4 x i8>* %ptr
%Y = sext <4 x i8> %X to <4 x i64>
ret <4 x i64>%Y
}
; CHECK: load_sext_test3
; CHECK: vpmovsxwq (%r{{[^,]*}}), %ymm{{.*}}
; CHECK: ret
define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
%X = load <4 x i16>* %ptr
%Y = sext <4 x i16> %X to <4 x i64>
ret <4 x i64>%Y
}
; CHECK: load_sext_test4
; CHECK: vpmovsxwd (%r{{[^,]*}}), %ymm{{.*}}
; CHECK: ret
define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
%X = load <8 x i16>* %ptr
%Y = sext <8 x i16> %X to <8 x i32>
ret <8 x i32>%Y
}
; CHECK: load_sext_test5
; CHECK: vpmovsxbd (%r{{[^,]*}}), %ymm{{.*}}
; CHECK: ret
define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
%X = load <8 x i8>* %ptr
%Y = sext <8 x i8> %X to <8 x i32>
ret <8 x i32>%Y
}