From 7431211214d54d0cd8cc0d069447abd22c5da0cb Mon Sep 17 00:00:00 2001
From: Nadav Rotem
Date: Fri, 18 Jan 2013 23:10:30 +0000
Subject: [PATCH] On Sandybridge loading unaligned 256bits using two XMM loads
 (vmovups and vinsertf128) is faster than using a single vmovups instruction.

llvm-svn: 172868
---
 llvm/lib/Target/X86/X86ISelLowering.cpp    | 32 +++++++++++++++++++++-
 llvm/test/CodeGen/X86/sandybridge-loads.ll | 21 ++++++++++++++
 llvm/test/CodeGen/X86/v8i1-masks.ll        |  2 +-
 3 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/sandybridge-loads.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2b6ff3602a48..73a1d2e00754 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16340,8 +16340,39 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   EVT MemVT = Ld->getMemoryVT();
   DebugLoc dl = Ld->getDebugLoc();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
 
   ISD::LoadExtType Ext = Ld->getExtensionType();
+  unsigned Alignment = Ld->getAlignment();
+
+  // On Sandybridge unaligned 256bit loads are inefficient.
+  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+      !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
+      Ext == ISD::NON_EXTLOAD) {
+    unsigned NumElems = RegVT.getVectorNumElements();
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  NumElems/2);
+    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             Load1.getValue(1),
+                             Load2.getValue(1));
+
+    SDValue NewVec = DAG.getUNDEF(RegVT);
+    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, NewVec, TF, true);
+  }
 
   // If this is a vector EXT Load then attempt to optimize it using a
   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
@@ -16356,7 +16387,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   assert(MemVT.isVector() && "Must load a vector from memory");
 
   unsigned NumElems = RegVT.getVectorNumElements();
-  unsigned RegSz = RegVT.getSizeInBits();
   unsigned MemSz = MemVT.getSizeInBits();
   assert(RegSz > MemSz && "Register size must be greater than the mem size");
 
diff --git a/llvm/test/CodeGen/X86/sandybridge-loads.ll b/llvm/test/CodeGen/X86/sandybridge-loads.ll
new file mode 100644
index 000000000000..d85c32eaa7ec
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sandybridge-loads.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+;CHECK: wideloads
+;CHECK: vmovaps
+;CHECK: vinsertf128
+;CHECK: vmovups
+;CHECK-NOT: vinsertf128
+;CHECK: ret
+
+define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+  %v0 = load <8 x float>* %a, align 16  ; <---- unaligned!
+  %v1 = load <8 x float>* %b, align 32  ; <---- aligned!
+  %m0 = fcmp olt <8 x float> %v1, %v0
+  %v2 = load <8 x float>* %c, align 16
+  %m1 = fcmp olt <8 x float> %v2, %v0
+  %mand = and <8 x i1> %m1, %m0
+  %r = zext <8 x i1> %mand to <8 x i32>
+  store <8 x i32> %r, <8 x i32>* undef, align 16
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index abb4b39bd622..ea231aff5b66 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
 
 ;CHECK: and_masks
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: vcmpltp
 ;CHECK: vcmpltp
 ;CHECK: vandps
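
Note (illustrative, not part of the patch): the instruction-level rewrite this DAG combine aims for looks roughly like the sketch below. It is hand-written to mirror the commit message rather than copied from llc output; the pointer register (%rdi), the 16-byte offset, and the xmm/ymm register numbers are assumptions.

    # Before: a single unaligned 256-bit load (slow on Sandybridge when the
    # address is not 32-byte aligned).
    vmovups     (%rdi), %ymm0

    # After: two 128-bit loads recombined into one ymm register.
    vmovups     (%rdi), %xmm0                  # low 128 bits
    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0     # high 128 bits into the upper lane

In the combine itself this corresponds to the two half-width loads built at Ptr and Ptr+16, which are stitched back together with Insert128BitVector and handed to DCI.CombineTo.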