diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 201d0f07c217..1d56a6e8aaa0 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -278,9 +278,10 @@ static bool isShortenableAtTheEnd(Instruction *I) {
       default: return false;
       case Intrinsic::memset:
       case Intrinsic::memcpy:
+      case Intrinsic::memcpy_element_unordered_atomic:
+      case Intrinsic::memset_element_unordered_atomic:
         // Do shorten memory intrinsics.
         // FIXME: Add memmove if it's also safe to transform.
-        // TODO: Add atomic memcpy/memset
         return true;
     }
   }
@@ -295,9 +296,7 @@ static bool isShortenableAtTheEnd(Instruction *I) {
 static bool isShortenableAtTheBeginning(Instruction *I) {
   // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
   // easily done by offsetting the source address.
-  // TODO: Handle atomic memory intrinsics
-  IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
-  return II && II->getIntrinsicID() == Intrinsic::memset;
+  return isa<AnyMemSetInst>(I);
 }
 
 /// Return the pointer that is being written to.
@@ -897,7 +896,7 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
   // Power of 2 vector writes are probably always a bad idea to optimize
   // as any store/memset/memcpy is likely using vector instructions so
   // shortening it to not vector size is likely to be slower
-  MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
+  auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
   unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
   if (!IsOverwriteEnd)
     LaterOffset = int64_t(LaterOffset + LaterSize);
@@ -906,15 +905,23 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
       !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
     return false;
 
+  int64_t NewLength = IsOverwriteEnd
+                          ? LaterOffset - EarlierOffset
+                          : EarlierSize - (LaterOffset - EarlierOffset);
+
+  if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+    // When shortening an atomic memory intrinsic, the newly shortened
+    // length must remain an integer multiple of the element size.
+    const uint32_t ElementSize = AMI->getElementSizeInBytes();
+    if (0 != NewLength % ElementSize)
+      return false;
+  }
+
   DEBUG(dbgs() << "DSE: Remove Dead Store:\n  OW "
                << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
                << "\n  KILLER (offset " << LaterOffset << ", " << EarlierSize
                << ")\n");
 
-  int64_t NewLength = IsOverwriteEnd
-                          ? LaterOffset - EarlierOffset
-                          : EarlierSize - (LaterOffset - EarlierOffset);
-
   Value *EarlierWriteLength = EarlierIntrinsic->getLength();
   Value *TrimmedLength =
       ConstantInt::get(EarlierWriteLength->getType(), NewLength);
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
index 38e0cef1716c..ba0d46ad062b 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
@@ -26,7 +26,8 @@ define void @write4to7_atomic(i32* nocapture %p) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
 ; CHECK-NEXT:    store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
 ; CHECK-NEXT:    ret void
@@ -60,7 +61,8 @@ define void @write0to3_atomic(i32* nocapture %p) {
 ; CHECK-LABEL: @write0to3_atomic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
 ; CHECK-NEXT:    store atomic i32 1, i32* [[P]] unordered, align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -76,7 +78,8 @@ define void @write0to3_atomic_weaker(i32* nocapture %p) {
 ; CHECK-LABEL: @write0to3_atomic_weaker(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
 ; CHECK-NEXT:    store i32 1, i32* [[P]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -111,7 +114,8 @@ define void @write0to7_atomic(i32* nocapture %p) {
 ; CHECK-LABEL: @write0to7_atomic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
 ; CHECK-NEXT:    [[P4:%.*]] = bitcast i32* [[P]] to i64*
 ; CHECK-NEXT:    store atomic i64 1, i64* [[P4]] unordered, align 8
 ; CHECK-NEXT:    ret void
@@ -149,7 +153,8 @@ define void @write0to7_2_atomic(i32* nocapture %p) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
 ; CHECK-NEXT:    [[P4:%.*]] = bitcast i32* [[P]] to i64*
 ; CHECK-NEXT:    store atomic i64 1, i64* [[P4]] unordered, align 8
 ; CHECK-NEXT:    ret void
@@ -307,7 +312,8 @@ define void @write8To15AndThen0To7_atomic(i64* nocapture %P) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
+; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
 ; CHECK-NEXT:    [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
 ; CHECK-NEXT:    [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
 ; CHECK-NEXT:    store atomic i64 1, i64* [[BASE64_1]] unordered, align 8
@@ -333,7 +339,8 @@ define void @write8To15AndThen0To7_atomic_weaker(i64* nocapture %P) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
+; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
 ; CHECK-NEXT:    [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
 ; CHECK-NEXT:    [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
 ; CHECK-NEXT:    store atomic i64 1, i64* [[BASE64_1]] unordered, align 8
@@ -359,7 +366,8 @@ define void @write8To15AndThen0To7_atomic_weaker_2(i64* nocapture %P) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
+; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
 ; CHECK-NEXT:    [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
 ; CHECK-NEXT:    [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
 ; CHECK-NEXT:    store i64 1, i64* [[BASE64_1]], align 8
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
index ace06b467588..f3934263e343 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
@@ -32,7 +32,7 @@ define void @write24to28_atomic(i32* nocapture %p) nounwind uwtable ssp {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4)
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
 ; CHECK-NEXT:    ret void
@@ -52,7 +52,7 @@ define void @write24to28_atomic_weaker(i32* nocapture %p) nounwind uwtable ssp {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4)
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    ret void
@@ -87,7 +87,7 @@ define void @write28to32_atomic(i32* nocapture %p) nounwind uwtable ssp {
 ; CHECK-LABEL: @write28to32_atomic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
 ; CHECK-NEXT:    ret void
@@ -155,7 +155,7 @@ define void @write32to36_atomic(%struct.vec2plusi* nocapture %p) nounwind uwtabl
 ; CHECK-LABEL: @write32to36_atomic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8*
-; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4)
+; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4)
 ; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2
 ; CHECK-NEXT:    store atomic i32 1, i32* [[C]] unordered, align 4
 ; CHECK-NEXT:    ret void
@@ -173,7 +173,7 @@ define void @write32to36_atomic_weaker(%struct.vec2plusi* nocapture %p) nounwind
 ; CHECK-LABEL: @write32to36_atomic_weaker(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8*
-; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4)
+; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4)
 ; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2
 ; CHECK-NEXT:    store i32 1, i32* [[C]], align 4
 ; CHECK-NEXT:    ret void
@@ -207,7 +207,7 @@ define void @write16to32_atomic(%struct.vec2* nocapture %p) nounwind uwtable ssp
 ; CHECK-LABEL: @write16to32_atomic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8*
-; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i32 4)
+; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 16, i32 4)
 ; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 1
 ; CHECK-NEXT:    store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* [[C]], align 4
 ; CHECK-NEXT:    ret void
@@ -316,7 +316,7 @@ define void @write16To23AndThen24To31_atomic(i64* nocapture %P, i64 %n64, i32 %n
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
 ; CHECK-NEXT:    [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
 ; CHECK-NEXT:    [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
 ; CHECK-NEXT:    store atomic i64 3, i64* [[BASE64_2]] unordered, align 8
@@ -342,7 +342,7 @@ define void @write16To23AndThen24To31_atomic_weaker1(i64* nocapture %P, i64 %n64
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
 ; CHECK-NEXT:    [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
 ; CHECK-NEXT:    [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
 ; CHECK-NEXT:    store i64 3, i64* [[BASE64_2]], align 8
@@ -368,7 +368,7 @@ define void @write16To23AndThen24To31_atomic_weaker2(i64* nocapture %P, i64 %n64
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
 ; CHECK-NEXT:    [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT:    tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
 ; CHECK-NEXT:    [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
 ; CHECK-NEXT:    [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
 ; CHECK-NEXT:    store atomic i64 3, i64* [[BASE64_2]] unordered, align 8
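
The updated tests above all exercise successful shortening; none shows the case the new NewLength % ElementSize guard in tryToShorten rejects. The following is a minimal IR sketch of that case, assuming the offsets DSE computes relative to %p; it is not taken from the patch's test files, and the function name is made up. The atomic memset writes bytes [4, 36) in 8-byte elements, and the later store kills the final four bytes. Trimming the end would pass the existing alignment check (the kill point, offset 32, is a multiple of the destination alignment 8), but the trimmed length of 28 bytes would no longer be a multiple of the element size, so the intrinsic must be left untouched:

; Hypothetical, for illustration only -- not part of the patch's tests.
define void @no_shorten_atomic_tail(i8* %p) {
entry:
  ; Atomic memset of bytes [4, 36) of %p in 8-byte elements.
  %dst = getelementptr inbounds i8, i8* %p, i64 4
  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %dst, i8 0, i64 32, i32 8)
  ; Kills bytes [32, 36), the tail of the memset. NewLength would be
  ; 32 - 4 = 28, and 28 % 8 != 0, so tryToShorten now returns false
  ; rather than emit an atomic memset whose length is not a multiple
  ; of its element size.
  %tail = getelementptr inbounds i8, i8* %p, i64 32
  %tail32 = bitcast i8* %tail to i32*
  store atomic i32 1, i32* %tail32 unordered, align 4
  ret void
}

declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind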