On Sandy Bridge, loading an unaligned 256-bit value with two XMM loads (vmovups followed by vinsertf128) is faster than with a single 256-bit vmovups instruction.
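
For illustration, the two strategies look like this in C with AVX intrinsics (a sketch of the idea only, not code from this patch; the helper names are made up):

  #include <immintrin.h>

  // One 32-byte load: a single vmovups ymm. Slow on Sandy Bridge when
  // the address is not 32-byte aligned.
  static __m256 load_whole(const float *p) {
    return _mm256_loadu_ps(p);
  }

  // Two 16-byte loads rebuilt into one YMM value: vmovups xmm plus
  // vinsertf128 (the second load typically folds into the vinsertf128).
  // Faster on Sandy Bridge for under-aligned addresses.
  static __m256 load_split(const float *p) {
    __m128 lo = _mm_loadu_ps(p);      // low 128 bits
    __m128 hi = _mm_loadu_ps(p + 4);  // high 128 bits
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
  }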

llvm-svn: 172868
commit 7431211214 (parent 2affc1ea6d)
Author: Nadav Rotem
Date: 2013-01-18 23:10:30 +00:00
3 changed files with 53 additions and 2 deletions

lib/Target/X86/X86ISelLowering.cpp

@@ -16340,8 +16340,39 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   EVT MemVT = Ld->getMemoryVT();
   DebugLoc dl = Ld->getDebugLoc();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
+
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+  unsigned Alignment = Ld->getAlignment();
+
+  // On Sandybridge unaligned 256bit loads are inefficient.
+  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+      !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
+      Ext == ISD::NON_EXTLOAD) {
+    unsigned NumElems = RegVT.getVectorNumElements();
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  NumElems/2);
+    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             Load1.getValue(1),
+                             Load2.getValue(1));
+    SDValue NewVec = DAG.getUNDEF(RegVT);
+    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, NewVec, TF, true);
+  }

   // If this is a vector EXT Load then attempt to optimize it using a
   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
@@ -16356,7 +16387,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     assert(MemVT.isVector() && "Must load a vector from memory");
     unsigned NumElems = RegVT.getVectorNumElements();
-    unsigned RegSz = RegVT.getSizeInBits();
     unsigned MemSz = MemVT.getSizeInBits();
     assert(RegSz > MemSz && "Register size must be greater than the mem size");
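
Concretely, for the v8f32 loads in the new test below, the combine computes HalfVT = v4f32 (NumElems/2 = 4 elements) and Increment = 16 bytes, then stitches the two halves back together at element offsets 0 and 4. A runnable sketch of the resulting memory accesses, again with AVX intrinsics (illustrative only; the actual transform happens on the SelectionDAG, not in source):

  #include <immintrin.h>
  #include <assert.h>
  #include <string.h>

  int main(void) {
    float buf[16];
    for (int i = 0; i < 16; ++i) buf[i] = (float)i;
    const float *p = buf + 1;          // 4-byte aligned only: Alignment < 32

    __m128 lo = _mm_loadu_ps(p);       // Load1 at Ptr
    __m128 hi = _mm_loadu_ps(p + 4);   // Load2 at Ptr + Increment (16 bytes)
    __m256 v = _mm256_insertf128_ps(   // Insert128BitVector at 0 and NumElems/2
        _mm256_castps128_ps256(lo), hi, 1);

    float whole[8], split[8];
    memcpy(whole, p, sizeof whole);    // what a single 32-byte load returns
    _mm256_storeu_ps(split, v);
    assert(memcmp(whole, split, sizeof whole) == 0);  // same value, cheaper loads
    return 0;
  }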

test/CodeGen/X86/sandybridge-loads.ll (new file)

@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+;CHECK: wideloads
+;CHECK: vmovaps
+;CHECK: vinsertf128
+;CHECK: vmovups
+;CHECK-NOT: vinsertf128
+;CHECK: ret
+
+define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
+  %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
+  %m0 = fcmp olt <8 x float> %v1, %v0
+  %v2 = load <8 x float>* %c, align 16
+  %m1 = fcmp olt <8 x float> %v2, %v0
+  %mand = and <8 x i1> %m1, %m0
+  %r = zext <8 x i1> %mand to <8 x i32>
+  store <8 x i32> %r, <8 x i32>* undef, align 16
+  ret void
+}
+

test/CodeGen/X86/v8i1-masks.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
 
 ;CHECK: and_masks
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: vcmpltp
 ;CHECK: vcmpltp
 ;CHECK: vandps