Teach the integer-promotion rewrite strategy to be endianness aware.

Sorry for this being broken so long. =/

As part of this, switch all of the existing tests to be Little Endian,
which is the behavior I was asserting in them anyways! Add in a new
big-endian test that checks the interesting behavior there.

Another part of this is to tighten the rules abotu when we perform the
full-integer promotion. This logic now rejects cases where there fully
promoted integer is a non-multiple-of-8 bitwidth or cases where the
loads or stores touch bits which are in the allocated space of the
alloca but are not loaded or stored when accessing the integer. Sadly,
these aren't really observable today as the rest of the pass will
already ensure the invariants hold. However, the latter situation is
likely to become a potential concern in the future.

Thanks to Benjamin and Duncan for early review of this patch. I'm still
looking into whether there are further endianness issues, please let me
know if anyone sees BE failures persisting past this.

llvm-svn: 165219
This commit is contained in:
Chandler Carruth 2012-10-04 10:39:28 +00:00
parent b96a7b5aa5
commit 43c8b46deb
7 changed files with 141 additions and 13 deletions

View File

@ -1737,11 +1737,12 @@ static bool isVectorPromotionViable(const TargetData &TD,
/// that the result will be promotable, so we have an early test here.
static bool isIntegerPromotionViable(const TargetData &TD,
Type *AllocaTy,
uint64_t AllocBeginOffset,
AllocaPartitioning &P,
AllocaPartitioning::const_use_iterator I,
AllocaPartitioning::const_use_iterator E) {
IntegerType *Ty = dyn_cast<IntegerType>(AllocaTy);
if (!Ty)
if (!Ty || 8*TD.getTypeStoreSize(Ty) != Ty->getBitWidth())
return false;
// Check the uses to ensure the uses are (likely) promoteable integer uses.
@ -1752,6 +1753,12 @@ static bool isIntegerPromotionViable(const TargetData &TD,
for (; I != E; ++I) {
if (!I->U)
continue; // Skip dead use.
// We can't reasonably handle cases where the load or store extends past
// the end of the aloca's type and into its padding.
if ((I->EndOffset - AllocBeginOffset) > TD.getTypeStoreSize(Ty))
return false;
if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
if (LI->isVolatile() || !LI->getType()->isIntegerTy())
return false;
@ -2130,7 +2137,7 @@ public:
"Only multiple-of-8 sized vector elements are viable");
ElementSize = VecTy->getScalarSizeInBits() / 8;
} else if (isIntegerPromotionViable(TD, NewAI.getAllocatedType(),
P, I, E)) {
NewAllocaBeginOffset, P, I, E)) {
IntPromotionTy = cast<IntegerType>(NewAI.getAllocatedType());
}
bool CanSROA = true;
@ -2218,8 +2225,15 @@ private:
getName(".load"));
assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t RelOffset = Offset - NewAllocaBeginOffset;
if (RelOffset)
V = IRB.CreateLShr(V, RelOffset*8, getName(".shift"));
assert(TD.getTypeStoreSize(TargetTy) + RelOffset <=
TD.getTypeStoreSize(IntPromotionTy) &&
"Element load outside of alloca store");
uint64_t ShAmt = 8*RelOffset;
if (TD.isBigEndian())
ShAmt = 8*(TD.getTypeStoreSize(IntPromotionTy) -
TD.getTypeStoreSize(TargetTy) - RelOffset);
if (ShAmt)
V = IRB.CreateLShr(V, ShAmt, getName(".shift"));
if (TargetTy != IntPromotionTy) {
assert(TargetTy->getBitWidth() < IntPromotionTy->getBitWidth() &&
"Cannot extract to a larger integer!");
@ -2238,11 +2252,17 @@ private:
V = IRB.CreateZExt(V, IntPromotionTy, getName(".ext"));
assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t RelOffset = Offset - NewAllocaBeginOffset;
if (RelOffset)
V = IRB.CreateShl(V, RelOffset*8, getName(".shift"));
assert(TD.getTypeStoreSize(Ty) + RelOffset <=
TD.getTypeStoreSize(IntPromotionTy) &&
"Element store outside of alloca store");
uint64_t ShAmt = 8*RelOffset;
if (TD.isBigEndian())
ShAmt = 8*(TD.getTypeStoreSize(IntPromotionTy) - TD.getTypeStoreSize(Ty)
- RelOffset);
if (ShAmt)
V = IRB.CreateShl(V, ShAmt, getName(".shift"));
APInt Mask = ~Ty->getMask().zext(IntPromotionTy->getBitWidth())
.shl(RelOffset*8);
APInt Mask = ~Ty->getMask().zext(IntPromotionTy->getBitWidth()).shl(ShAmt);
Value *Old = IRB.CreateAnd(IRB.CreateAlignedLoad(&NewAI,
NewAI.getAlignment(),
getName(".oldload")),

View File

@ -1,5 +1,5 @@
; RUN: opt < %s -sroa -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)

View File

@ -1,7 +1,7 @@
; RUN: opt < %s -sroa -S | FileCheck %s
; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
declare void @llvm.lifetime.start(i64, i8* nocapture)
declare void @llvm.lifetime.end(i64, i8* nocapture)

View File

@ -0,0 +1,108 @@
; RUN: opt < %s -sroa -S | FileCheck %s
; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
define i8 @test1() {
; We fully promote these to the i24 load or store size, resulting in just masks
; and other operations that instcombine will fold, but no alloca. Note this is
; the same as test12 in basictest.ll, but here we assert big-endian byte
; ordering.
;
; CHECK: @test1
entry:
%a = alloca [3 x i8]
%b = alloca [3 x i8]
; CHECK-NOT: alloca
%a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
store i8 0, i8* %a0ptr
%a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
store i8 0, i8* %a1ptr
%a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
store i8 0, i8* %a2ptr
%aiptr = bitcast [3 x i8]* %a to i24*
%ai = load i24* %aiptr
; CHCEK-NOT: store
; CHCEK-NOT: load
; CHECK: %[[mask0:.*]] = and i24 undef, 65535
; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[mask0]], -65281
; CHECK-NEXT: %[[mask2:.*]] = and i24 %[[mask1]], -256
%biptr = bitcast [3 x i8]* %b to i24*
store i24 %ai, i24* %biptr
%b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
%b0 = load i8* %b0ptr
%b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
%b1 = load i8* %b1ptr
%b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
%b2 = load i8* %b2ptr
; CHCEK-NOT: store
; CHCEK-NOT: load
; CHECK: %[[shift0:.*]] = lshr i24 %[[mask2]], 16
; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[mask2]], 8
; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[mask2]] to i8
%bsum0 = add i8 %b0, %b1
%bsum1 = add i8 %bsum0, %b2
ret i8 %bsum1
; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
; CHECK-NEXT: ret i8 %[[sum1]]
}
define i64 @test2() {
; Test for various mixed sizes of integer loads and stores all getting
; promoted.
;
; CHECK: @test2
entry:
%a = alloca [7 x i8]
; CHECK-NOT: alloca
%a0ptr = getelementptr [7 x i8]* %a, i64 0, i32 0
%a1ptr = getelementptr [7 x i8]* %a, i64 0, i32 1
%a2ptr = getelementptr [7 x i8]* %a, i64 0, i32 2
%a3ptr = getelementptr [7 x i8]* %a, i64 0, i32 3
; CHCEK-NOT: store
; CHCEK-NOT: load
%a0i16ptr = bitcast i8* %a0ptr to i16*
store i16 1, i16* %a0i16ptr
; CHECK: %[[mask:.*]] = and i56 undef, 1099511627775
; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1099511627776
%a1i4ptr = bitcast i8* %a1ptr to i4*
store i4 1, i4* %a1i4ptr
; CHECK: %[[mask:.*]] = and i56 %[[or]], -16492674416641
; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1099511627776
store i8 1, i8* %a2ptr
; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -1095216660481
; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 4294967296
%a3i24ptr = bitcast i8* %a3ptr to i24*
store i24 1, i24* %a3i24ptr
; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -4294967041
; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 256
%a2i40ptr = bitcast i8* %a2ptr to i40*
store i40 1, i40* %a2i40ptr
; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -1099511627776
; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1
; CHCEK-NOT: store
; CHCEK-NOT: load
%aiptr = bitcast [7 x i8]* %a to i56*
%ai = load i56* %aiptr
%ret = zext i56 %ai to i64
ret i64 %ret
; CHECK: %[[ret:.*]] = zext i56 %[[or]] to i64
; CHECK-NEXT: ret i64 %[[ret]]
}

View File

@ -1,6 +1,6 @@
; RUN: opt < %s -sroa -S | FileCheck %s
; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
define { i32, i32 } @test0(i32 %x, i32 %y) {
; CHECK: @test0

View File

@ -1,5 +1,5 @@
; RUN: opt < %s -sroa -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
define i32 @test1() {
; CHECK: @test1

View File

@ -1,5 +1,5 @@
; RUN: opt < %s -sroa -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
%S1 = type { i64, [42 x float] }