[InstCombineCalls] Unfold element atomic memcpy instruction

Differential Revision: https://reviews.llvm.org/D28909 llvm-svn: 294453
2017-02-08 14:32:04 +00:00 · 2017-02-08 14:32:04 +00:00 · 900ffa34c8
parent 4b317fa24e
commit 900ffa34c8
3 changed files with 175 additions and 0 deletions
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@ -60,6 +60,12 @@ using namespace PatternMatch;

 STATISTIC(NumSimplified, "Number of library calls simplified");

+static cl::opt<uint64_t> UnfoldElementAtomicMemcpyMaxElements(
+    "unfold-element-atomic-memcpy-max-elements",
+    cl::init(16),
+    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
+             "allowed to unfold"));
+
 /// Return the specified type promoted as it would be to pass though a va_arg
 /// area.
 static Type *getPromotedType(Type *Ty) {
@ -108,6 +114,78 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  return ConstantVector::get(BoolVec);
 }

+Instruction *
+InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into sequence of explicit atomic loads and
+  // stores.
+  // First check that number of elements is compile time constant.
+  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
+  if (!NumElementsCI)
+    return nullptr;
+
+  // Check that there are not too many elements.
+  uint64_t NumElements = NumElementsCI->getZExtValue();
+  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
+    return nullptr;
+
+  // Don't unfold into illegal integers
+  uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes() * 8;
+  if (!getDataLayout().isLegalInteger(ElementSizeInBytes))
+    return nullptr;
+
+  // Cast source and destination to the correct type. Intrinsic input arguments
+  // are usually represented as i8*.
+  // Often operands will be explicitly casted to i8* and we can just strip
+  // those casts instead of inserting new ones. However it's easier to rely on
+  // other InstCombine rules which will cover trivial cases anyway.
+  Value *Src = AMI->getRawSource();
+  Value *Dst = AMI->getRawDest();
+  Type *ElementPointerType = Type::getIntNPtrTy(
+      AMI->getContext(), ElementSizeInBytes, Src->getType()->getPointerAddressSpace());
+
+  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                "memcpy_unfold.src_casted");
+  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                "memcpy_unfold.dst_casted");
+
+  for (uint64_t i = 0; i < NumElements; ++i) {
+    // Get current element addresses
+    ConstantInt *ElementIdxCI =
+        ConstantInt::get(AMI->getContext(), APInt(64, i));
+    Value *SrcElementAddr =
+        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+    Value *DstElementAddr =
+        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+    // Load from the source. Transfer alignment information and mark load as
+    // unordered atomic.
+    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+    Load->setOrdering(AtomicOrdering::Unordered);
+    // We know alignment of the first element. It is also guaranteed by the
+    // verifier that element size is less or equal than first element alignment
+    // and both of this values are powers of two.
+    // This means that all subsequent accesses are at least element size
+    // aligned.
+    // TODO: We can infer better alignment but there is no evidence that this
+    // will matter.
+    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
+                              : AMI->getElementSizeInBytes());
+    Load->setDebugLoc(AMI->getDebugLoc());
+
+    // Store loaded value via unordered atomic store.
+    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+    Store->setOrdering(AtomicOrdering::Unordered);
+    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
+                               : AMI->getElementSizeInBytes());
+    Store->setDebugLoc(AMI->getDebugLoc());
+  }
+
+  // Set the number of elements of the copy to 0, it will be deleted on the
+  // next iteration.
+  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+  return AMI;
+}
+
 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@ -1839,6 +1917,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
      if (C->isNullValue())
        return eraseInstFromFunction(*AMI);
+
+    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+      return I;
  }

  if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@ -650,6 +650,8 @@ private:
  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
  Instruction *MatchBSwap(BinaryOperator &I);
  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+  Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
  Instruction *SimplifyMemSet(MemSetInst *MI);

--- a/llvm/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
+++ b/llvm/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
@ -0,0 +1,92 @@
+; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test basic unfolding
+define void @test1(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test1
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
+; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] =  load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8
+
+; CHECK-DAG: [[VAL2:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL3:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL4:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 8 %Src, i64 4, i32 4)
+  ret void
+}
+
+; Test that we don't unfold too much
+define void @test2(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test2
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4)
+  ret void
+}
+
+; Test that we will not unfold into non native integers
+define void @test3(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test3
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64)
+  ret void
+}
+
+; Test that we will eliminate redundant bitcasts
+define void @test4(i64* %Src, i64* %Dst) {
+; CHECK-LABEL: test4
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-NOT: bitcast
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] =  load atomic i64, i64* %Src unordered, align 16
+; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16
+
+; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
+; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
+; CHECK-DAG: [[VAL2:%[^\s]+]] =  load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
+; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
+; CHECK-DAG: [[VAL3:%[^ ]+]] =  load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
+; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
+; CHECK-DAG: [[VAL4:%[^ ]+]] =  load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
+entry:
+  %Src.casted = bitcast i64* %Src to i8*
+  %Dst.casted = bitcast i64* %Dst to i8*
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8)
+  ret void
+}
+
+define void @test5(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test5
+
+; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+  ret void
+}
+
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32)