Revise alignment checking/calculation on 256-bit unaligned memory access

- An access is still considered aligned when the specified alignment is
  larger than the natural alignment;
- The new alignment for the high 128-bit vector should be min(16,
  alignment), since the pointer is advanced by 16, a power-of-2 offset.

llvm-svn: 177947
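
For context on the second bullet: when a pointer with power-of-two alignment A is advanced by a power-of-two offset, the strongest alignment still provable for the result is the smaller of the two. A standalone sketch of that arithmetic (not code from this patch; alignAfterOffset is a made-up name):

    #include <algorithm>
    #include <cassert>

    // Alignment provable for Ptr + Off when Ptr is A-byte aligned and
    // both A and Off are powers of two: gcd(A, Off) == min(A, Off).
    static unsigned alignAfterOffset(unsigned A, unsigned Off) {
      assert((A & (A - 1)) == 0 && (Off & (Off - 1)) == 0);
      return std::min(A, Off);
    }

    int main() {
      // High half of a 64-byte-aligned 256-bit access: an address of
      // the form 64k + 16 is only 16-byte aligned.
      assert(alignAfterOffset(64, 16) == 16);
      // A weaker base alignment survives: 4k + 16 stays 4-byte aligned.
      assert(alignAfterOffset(4, 16) == 4);
      return 0;
    }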
commit 5fbcd81793
parent f759061908
Michael Liao, 2013-03-25 23:50:10 +00:00
2 changed files with 29 additions and 8 deletions

lib/Target/X86/X86ISelLowering.cpp

@@ -16639,11 +16639,10 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   unsigned RegSz = RegVT.getSizeInBits();
 
-  // On Sandybridge unaligned 256bit loads are inefficient.
   ISD::LoadExtType Ext = Ld->getExtensionType();
   unsigned Alignment = Ld->getAlignment();
-  bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;
-
+  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
+  // On Sandybridge unaligned 256bit loads are inefficient.
   if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
     unsigned NumElems = RegVT.getVectorNumElements();
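
The functional change in this hunk is `==` becoming `>=` in the IsAligned check: an over-aligned access, say align 64 on a 32-byte load, previously failed the equality test and was split even though it is perfectly aligned. A minimal sketch of the predicate, with illustrative names rather than LLVM's:

    #include <cassert>

    static bool isAlignedOld(unsigned Align, unsigned NaturalBytes) {
      return Align == 0 || Align == NaturalBytes; // rejects over-alignment
    }
    static bool isAlignedNew(unsigned Align, unsigned NaturalBytes) {
      return Align == 0 || Align >= NaturalBytes; // accepts it
    }

    int main() {
      // "load <4 x i64>* %p, align 64" on a 32-byte vector:
      assert(!isAlignedOld(64, 32)); // old: treated as unaligned and split
      assert(isAlignedNew(64, 32));  // new: kept as one 256-bit load
      return 0;
    }

The new add4i64a64 test at the end of this commit covers exactly this case.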
@@ -16663,7 +16662,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                 Ld->getPointerInfo(), Ld->isVolatile(),
                                 Ld->isNonTemporal(), Ld->isInvariant(),
-                                std::max(Alignment/2U, 1U));
+                                std::min(16U, Alignment));
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                              Load1.getValue(1),
                              Load2.getValue(1));
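
The other change fixes the alignment claimed for the high-half load at Ptr + 16. Halving the original alignment is wrong in both directions, as this standalone arithmetic check shows (the values are assumed examples, not from the patch):

    #include <algorithm>
    #include <cassert>

    int main() {
      unsigned A = 64;                    // base alignment of the 256-bit load
      assert(std::max(A / 2U, 1U) == 32); // old: unsound, Ptr + 16 is only
                                          // 16-byte aligned
      assert(std::min(16U, A) == 16);     // new: correct bound

      A = 4;                              // under-aligned base
      assert(std::max(A / 2U, 1U) == 2);  // old: needlessly pessimistic
      assert(std::min(16U, A) == 4);      // new: Ptr + 16 keeps 4-byte alignment
      return 0;
    }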
@@ -16834,13 +16833,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   DebugLoc dl = St->getDebugLoc();
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned Alignment = St->getAlignment();
-  bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;
 
   // If we are saving a concatenation of two XMM registers, perform two stores.
   // On Sandy Bridge, 256-bit memory operations are executed by two
   // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
   // memory operation.
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
   if (VT.is256BitVector() && !Subtarget->hasInt256() &&
       StVT == VT && !IsAligned) {
     unsigned NumElems = VT.getVectorNumElements();
@@ -16860,7 +16859,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                                St->getPointerInfo(), St->isVolatile(),
                                St->isNonTemporal(),
-                               std::max(Alignment/2U, 1U));
+                               std::min(16U, Alignment));
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   }
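
Taken together, a split 256-bit access claims the original alignment for the low half and min(16, Alignment) for the high half. Assuming the store in the double_save test is 16-byte aligned, both halves are now provably 16-byte aligned, which matches the CHECK flip from vmovups to vmovaps below. A sketch with a hypothetical helper (splitStoreAligns is not LLVM's API):

    #include <algorithm>
    #include <cassert>
    #include <utility>

    // Alignments claimable by the two 16-byte halves of a split
    // 32-byte store, given the original store's alignment.
    static std::pair<unsigned, unsigned> splitStoreAligns(unsigned Align) {
      return std::make_pair(Align, std::min(16U, Align));
    }

    int main() {
      // A 16-byte-aligned 256-bit store: both halves qualify for the
      // aligned vmovaps form (the old code claimed only 8 for the high half).
      std::pair<unsigned, unsigned> Halves = splitStoreAligns(16);
      assert(Halves.first == 16 && Halves.second == 16);
      return 0;
    }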

test/CodeGen/X86/avx-load-store.ll

@@ -81,7 +81,7 @@ define void @storev32i8_01(<32 x i8> %a) nounwind {
 ; CHECK: _double_save
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovups %xmm
+; CHECK: vmovaps %xmm
 ; CHECK: vmovaps %xmm
 define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
 entry:
@@ -127,3 +127,25 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
   store <8 x i32> %x, <8 x i32>* %ret, align 1
   ret void
 }
+
+; CHECK: add4i64a64
+; CHECK: vmovaps ({{.*}}), %ymm{{.*}}
+; CHECK: vmovaps %ymm{{.*}}, ({{.*}})
+define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+  %b = load <4 x i64>* %bp, align 64
+  %x = add <4 x i64> zeroinitializer, %b
+  store <4 x i64> %x, <4 x i64>* %ret, align 64
+  ret void
+}
+
+; CHECK: add4i64a16
+; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
+; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
+; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
+; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
+define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+  %b = load <4 x i64>* %bp, align 16
+  %x = add <4 x i64> zeroinitializer, %b
+  store <4 x i64> %x, <4 x i64>* %ret, align 16
+  ret void
+}