Revert "[memcpyopt] Teach memcpyopt to optimize across basic blocks"

This reverts r321138. It seems there are still underlying issues with
memdep. PR35519 seems to still be present if debug info is enabled. We
end up losing a memcpy. Somehow during store to memset merging, we
insert the memset after the memcpy or fail to update the memdep analysis
to account for the newly inserted memset of a pair.

Reduced test case:

  #include <assert.h>
  #include <stdio.h>
  #include <string>
  #include <utility>
  #include <vector>

  void do_push_back(
      std::vector<std::pair<std::string, std::vector<std::string>>>* crls) {
    crls->push_back(std::make_pair(std::string(), std::vector<std::string>()));
  }

  int __attribute__((optnone)) main() {
    // Put some data in the vector and then remove it so we take the push_back
    // fast path.
    std::vector<std::pair<std::string, std::vector<std::string>>> crl_set;
    crl_set.push_back({"asdf", {}});
    crl_set.pop_back();
    printf("first word in vector storage: %p\n", *(void**)crl_set.data());

    // Do the push_back which may fail to initialize the data.
    do_push_back(&crl_set);
    auto* first = &crl_set.back().first;
    printf("first word in vector storage (should be zero): %p\n",
           *(void**)crl_set.data());
    assert(first->empty());
    puts("ok");
  }

Compile with libc++, enable optimizations, and enable debug info:
$ clang++ -stdlib=libc++ -g -O2 t.cpp -o t.exe -Wl,-rpath=llvm/build/lib

This program will assert with this change.

llvm-svn: 321510
This commit is contained in:
Reid Kleckner 2017-12-28 05:10:33 +00:00
parent 4371e049d4
commit 6d31001cd6
7 changed files with 19 additions and 306 deletions

View File

@ -407,12 +407,6 @@ public:
void getNonLocalPointerDependency(Instruction *QueryInst,
SmallVectorImpl<NonLocalDepResult> &Result);
/// Perform a dependency query specifically for QueryInst's access to Loc.
/// The other comments for getNonLocalPointerDependency apply here as well.
void getNonLocalPointerDependencyFrom(Instruction *QueryInst,
const MemoryLocation &Loc, bool isLoad,
SmallVectorImpl<NonLocalDepResult> &Result);
/// Removes an instruction from the dependence analysis, updating the
/// dependence of instructions that previously depended on it.
void removeInstruction(Instruction *InstToRemove);

View File

@ -920,14 +920,6 @@ void MemoryDependenceResults::getNonLocalPointerDependency(
Instruction *QueryInst, SmallVectorImpl<NonLocalDepResult> &Result) {
const MemoryLocation Loc = MemoryLocation::get(QueryInst);
bool isLoad = isa<LoadInst>(QueryInst);
return getNonLocalPointerDependencyFrom(QueryInst, Loc, isLoad, Result);
}
void MemoryDependenceResults::getNonLocalPointerDependencyFrom(
Instruction *QueryInst,
const MemoryLocation &Loc,
bool isLoad,
SmallVectorImpl<NonLocalDepResult> &Result) {
BasicBlock *FromBB = QueryInst->getParent();
assert(FromBB);
@ -1127,15 +1119,21 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// If we already have a cache entry for this CacheKey, we may need to do some
// work to reconcile the cache entry and the current query.
if (!Pair.second) {
if (CacheInfo->Size != Loc.Size) {
// The query's Size differs from the cached one. Throw out the
// cached data and proceed with the query at the new size.
if (CacheInfo->Size < Loc.Size) {
// The query's Size is greater than the cached one. Throw out the
// cached data and proceed with the query at the greater size.
CacheInfo->Pair = BBSkipFirstBlockPair();
CacheInfo->Size = Loc.Size;
for (auto &Entry : CacheInfo->NonLocalDeps)
if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
CacheInfo->NonLocalDeps.clear();
} else if (CacheInfo->Size > Loc.Size) {
// This query's Size is less than the cached one. Conservatively restart
// the query using the greater size.
return getNonLocalPointerDepFromBB(
QueryInst, Pointer, Loc.getWithNewSize(CacheInfo->Size), isLoad,
StartBB, Result, Visited, SkipFirstBlock);
}
// If the query's AATags are inconsistent with the cached one,

View File

@ -476,33 +476,22 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
Alignment = DL.getABITypeAlignment(EltType);
}
// Remember the debug location.
DebugLoc Loc;
if (!Range.TheStores.empty())
Loc = Range.TheStores[0]->getDebugLoc();
AMemSet =
Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
DEBUG(dbgs() << "Replace stores:\n";
for (Instruction *SI : Range.TheStores)
dbgs() << *SI << '\n');
dbgs() << *SI << '\n';
dbgs() << "With: " << *AMemSet << '\n');
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
// Zap all the stores.
for (Instruction *SI : Range.TheStores) {
MD->removeInstruction(SI);
SI->eraseFromParent();
}
// Create the memset after removing the stores, so that if there any cached
// non-local dependencies on the removed instructions in
// MemoryDependenceAnalysis, the cache entries are updated to "dirty"
// entries pointing below the memset, so subsequent queries include the
// memset.
AMemSet =
Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Loc);
DEBUG(dbgs() << "With: " << *AMemSet << '\n');
++NumMemSetInfer;
}
@ -1042,22 +1031,9 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
//
// NOTE: This is conservative, it will stop on any read from the source loc,
// not just the defining memcpy.
MemoryLocation SourceLoc = MemoryLocation::getForSource(MDep);
MemDepResult SourceDep = MD->getPointerDependencyFrom(SourceLoc, false,
M->getIterator(), M->getParent());
if (SourceDep.isNonLocal()) {
SmallVector<NonLocalDepResult, 2> NonLocalDepResults;
MD->getNonLocalPointerDependencyFrom(M, SourceLoc, /*isLoad=*/false,
NonLocalDepResults);
if (NonLocalDepResults.size() == 1) {
SourceDep = NonLocalDepResults[0].getResult();
assert((!SourceDep.getInst() ||
LookupDomTree().dominates(SourceDep.getInst(), M)) &&
"when memdep returns exactly one result, it should dominate");
}
}
MemDepResult SourceDep =
MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
M->getIterator(), M->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
@ -1259,18 +1235,6 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());
if (SrcDepInfo.isNonLocal()) {
SmallVector<NonLocalDepResult, 2> NonLocalDepResults;
MD->getNonLocalPointerDependencyFrom(M, SrcLoc, /*isLoad=*/true,
NonLocalDepResults);
if (NonLocalDepResults.size() == 1) {
SrcDepInfo = NonLocalDepResults[0].getResult();
assert((!SrcDepInfo.getInst() ||
LookupDomTree().dominates(SrcDepInfo.getInst(), M)) &&
"when memdep returns exactly one result, it should dominate");
}
}
if (SrcDepInfo.isClobber()) {
if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
return processMemCpyMemCpyDependence(M, MDep);

View File

@ -1,48 +0,0 @@
; RUN: opt < %s -memcpyopt -S | FileCheck %s
; Test memcpy-memcpy dependencies across invoke edges.
; Test that memcpyopt works across the non-unwind edge of an invoke.
define hidden void @test_normal(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
entry:
%temp = alloca i8, i32 64
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
invoke void @invoke_me()
to label %try.cont unwind label %lpad
lpad:
landingpad { i8*, i32 }
catch i8* null
ret void
try.cont:
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
ret void
}
; Test that memcpyopt works across the unwind edge of an invoke.
define hidden void @test_unwind(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
entry:
%temp = alloca i8, i32 64
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
invoke void @invoke_me()
to label %try.cont unwind label %lpad
lpad:
landingpad { i8*, i32 }
catch i8* null
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
ret void
try.cont:
ret void
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
declare i32 @__gxx_personality_v0(...)
declare void @invoke_me() readnone

View File

@ -1,45 +0,0 @@
; RUN: opt < %s -memcpyopt -S | FileCheck %s
; Update cached non-local dependence information when merging stores into memset.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Don't delete the memcpy in %if.then, even though it depends on an instruction
; which will be deleted.
; CHECK-LABEL: @foo
define void @foo(i1 %c, i8* %d, i8* %e, i8* %f) {
entry:
%tmp = alloca [50 x i8], align 8
%tmp4 = bitcast [50 x i8]* %tmp to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp4, i64 1
call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
store i8 0, i8* %tmp4, align 8, !dbg !5
; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %tmp1, i8* nonnull %d, i64 10, i32 1, i1 false)
br i1 %c, label %if.then, label %exit
if.then:
; CHECK: if.then:
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
br label %exit
exit:
ret void
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "t.rs", directory: "/tmp")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !DILocation(line: 8, column: 5, scope: !6)
!6 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !7, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
!7 = !DISubroutineType(types: !8)
!8 = !{null}

View File

@ -1,36 +0,0 @@
; RUN: opt < %s -memcpyopt -S | FileCheck %s
; Handle memcpy-memcpy dependencies of differing sizes correctly.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Don't delete the second memcpy, even though there's an earlier
; memcpy with a larger size from the same address.
; CHECK-LABEL: @foo
define i32 @foo(i1 %z) {
entry:
%a = alloca [10 x i32]
%s = alloca [10 x i32]
%0 = bitcast [10 x i32]* %a to i8*
%1 = bitcast [10 x i32]* %s to i8*
call void @llvm.memset.p0i8.i64(i8* nonnull %1, i8 0, i64 40, i32 16, i1 false)
%arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 0
store i32 1, i32* %arrayidx
%scevgep = getelementptr [10 x i32], [10 x i32]* %s, i64 0, i64 1
%scevgep7 = bitcast i32* %scevgep to i8*
br i1 %z, label %for.body3.lr.ph, label %for.inc7.1
for.body3.lr.ph: ; preds = %entry
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 17179869180, i32 4, i1 false)
br label %for.inc7.1
for.inc7.1:
; CHECK: for.inc7.1:
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
%2 = load i32, i32* %arrayidx
ret i32 %2
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)

View File

@ -1,114 +0,0 @@
; RUN: opt < %s -memcpyopt -S | FileCheck %s
; Make sure memcpy-memcpy dependence is optimized across
; basic blocks (conditional branches and invokes).
%struct.s = type { i32, i32 }
@s_foo = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
@s_baz = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
@i = external constant i8*
declare void @qux()
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
declare void @__cxa_throw(i8*, i8*, i8*)
declare i32 @__gxx_personality_v0(...)
declare i8* @__cxa_begin_catch(i8*)
; A simple partial redundancy. Test that the second memcpy is optimized
; to copy directly from the original source rather than from the temporary.
; CHECK-LABEL: @wobble
define void @wobble(i8* noalias %dst, i8* %src, i1 %some_condition) {
bb:
%temp = alloca i8, i32 64
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
br i1 %some_condition, label %more, label %out
out:
call void @qux()
unreachable
more:
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
ret void
}
; A CFG triangle with a partial redundancy targeting an alloca. Test that the
; memcpy inside the triangle is optimized to copy directly from the original
; source rather than from the temporary.
; CHECK-LABEL: @foo
define i32 @foo(i1 %t3) {
bb:
%s = alloca %struct.s, align 4
%t = alloca %struct.s, align 4
%s1 = bitcast %struct.s* %s to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
br i1 %t3, label %bb4, label %bb7
bb4: ; preds = %bb
%t5 = bitcast %struct.s* %t to i8*
%s6 = bitcast %struct.s* %s to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* %s6, i64 8, i32 4, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
br label %bb7
bb7: ; preds = %bb4, %bb
%t8 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 0
%t9 = load i32, i32* %t8, align 4
%t10 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 1
%t11 = load i32, i32* %t10, align 4
%t12 = add i32 %t9, %t11
ret i32 %t12
}
; A CFG diamond with an invoke on one side, and a partially redundant memcpy
; into an alloca on the other. Test that the memcpy inside the diamond is
; optimized to copy ; directly from the original source rather than from the
; temporary. This more complex test represents a relatively common usage
; pattern.
; CHECK-LABEL: @baz
define i32 @baz(i1 %t5) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
bb:
%s = alloca %struct.s, align 4
%t = alloca %struct.s, align 4
%s3 = bitcast %struct.s* %s to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
br i1 %t5, label %bb6, label %bb22
bb6: ; preds = %bb
invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
to label %bb25 unwind label %bb9
bb9: ; preds = %bb6
%t10 = landingpad { i8*, i32 }
catch i8* null
br label %bb13
bb13: ; preds = %bb9
%t15 = call i8* @__cxa_begin_catch(i8* null)
br label %bb23
bb22: ; preds = %bb
%t23 = bitcast %struct.s* %t to i8*
%s24 = bitcast %struct.s* %s to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* %s24, i64 8, i32 4, i1 false)
; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
br label %bb23
bb23: ; preds = %bb22, %bb13
%t17 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 0
%t18 = load i32, i32* %t17, align 4
%t19 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 1
%t20 = load i32, i32* %t19, align 4
%t21 = add nsw i32 %t18, %t20
ret i32 %t21
bb25: ; preds = %bb6
unreachable
}