Change CGCall to handle the "coerce" case where the coerce-to type
is an FCA (first-class aggregate) by passing each of the elements as
individual scalars. This produces code that fast isel is less likely
to reject and that is easier on the optimizers.

For example, before we would compile:
struct DeclGroup { long NumDecls; char * Y; };
char * foo(DeclGroup D) {
  return D.NumDecls + D.Y;
}

to:
%struct.DeclGroup = type { i64, i64 }

define i64 @_Z3foo9DeclGroup(%struct.DeclGroup) nounwind {
entry:
  %D = alloca %struct.DeclGroup, align 8          ; <%struct.DeclGroup*> [#uses=3]
  store %struct.DeclGroup %0, %struct.DeclGroup* %D, align 1
  %tmp = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 0 ; <i64*> [#uses=1]
  %tmp1 = load i64* %tmp                          ; <i64> [#uses=1]
  %tmp2 = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 1 ; <i64*> [#uses=1]
  %tmp3 = load i64* %tmp2                         ; <i64> [#uses=1]
  %add = add nsw i64 %tmp1, %tmp3                 ; <i64> [#uses=1]
  ret i64 %add
}

Now we get:

%0 = type { i64, i64 }
%struct.DeclGroup = type { i64, i8* }

define i8* @_Z3foo9DeclGroup(i64, i64) nounwind {
entry:
  %D = alloca %struct.DeclGroup, align 8          ; <%struct.DeclGroup*> [#uses=3]
  %2 = insertvalue %0 undef, i64 %0, 0            ; <%0> [#uses=1]
  %3 = insertvalue %0 %2, i64 %1, 1               ; <%0> [#uses=1]
  %4 = bitcast %struct.DeclGroup* %D to %0*       ; <%0*> [#uses=1]
  store %0 %3, %0* %4, align 1
  %tmp = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 0 ; <i64*> [#uses=1]
  %tmp1 = load i64* %tmp                          ; <i64> [#uses=1]
  %tmp2 = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 1 ; <i8**> [#uses=1]
  %tmp3 = load i8** %tmp2                         ; <i8*> [#uses=1]
  %add.ptr = getelementptr inbounds i8* %tmp3, i64 %tmp1 ; <i8*> [#uses=1]
  ret i8* %add.ptr
}
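
The caller side is symmetric: the FCA produced by CreateCoercedLoad is
split apart with extractvalue and the scalars are passed directly. A
hypothetical caller, with the function name @test and the value names
invented for illustration (a sketch, not output from this patch):

%0 = type { i64, i64 }
%struct.DeclGroup = type { i64, i8* }

define i8* @test(%struct.DeclGroup* %d) nounwind {
entry:
  ; CreateCoercedLoad: view the argument memory as the coerce-to type
  ; and load it as a single FCA value.
  %coerce = bitcast %struct.DeclGroup* %d to %0*
  %fca = load %0* %coerce, align 1
  ; Flatten the FCA into scalars for the call.
  %elt0 = extractvalue %0 %fca, 0
  %elt1 = extractvalue %0 %fca, 1
  %call = call i8* @_Z3foo9DeclGroup(i64 %elt0, i64 %elt1)
  ret i8* %call
}

declare i8* @_Z3foo9DeclGroup(i64, i64) nounwind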

Eliminating the FCA inside the function is still to come.
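
When that happens, the prolog should be able to store the flattened
scalars straight into the alloca with a GEP/store series (the FIXME in
EmitFunctionProlog points the same way), with no insertvalue chain and
no FCA store. A hypothetical sketch of that result, not what this patch
emits:

%0 = type { i64, i64 }
%struct.DeclGroup = type { i64, i8* }

define i8* @_Z3foo9DeclGroup(i64, i64) nounwind {
entry:
  %D = alloca %struct.DeclGroup, align 8
  ; View the alloca as the coerce-to type and store each scalar directly.
  %2 = bitcast %struct.DeclGroup* %D to %0*
  %3 = getelementptr inbounds %0* %2, i32 0, i32 0
  store i64 %0, i64* %3
  %4 = getelementptr inbounds %0* %2, i32 0, i32 1
  store i64 %1, i64* %4
  ; The body then reads the fields as before.
  %tmp = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 0
  %tmp1 = load i64* %tmp
  %tmp2 = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 1
  %tmp3 = load i8** %tmp2
  %add.ptr = getelementptr inbounds i8* %tmp3, i64 %tmp1
  ret i8* %add.ptr
}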

llvm-svn: 107099
Chris Lattner 2010-06-28 23:44:11 +00:00
parent a37aa88e62
commit 3dd716c3c3
3 changed files with 65 additions and 16 deletions

clang/lib/CodeGen/CGCall.cpp

@@ -240,7 +240,8 @@ const CGFunctionInfo &CodeGenTypes::getFunctionInfo(CanQualType ResTy,
     return *FI;
 
   // Construct the function info.
-  FI = new CGFunctionInfo(CC, Info.getNoReturn(), Info.getRegParm(), ResTy, ArgTys);
+  FI = new CGFunctionInfo(CC, Info.getNoReturn(), Info.getRegParm(), ResTy,
+                          ArgTys);
   FunctionInfos.InsertNode(FI, InsertPos);
 
   // Compute ABI information.
@@ -259,6 +260,8 @@ CGFunctionInfo::CGFunctionInfo(unsigned _CallingConvention,
     NoReturn(_NoReturn), RegParm(_RegParm)
 {
   NumArgs = ArgTys.size();
+
+  // FIXME: Coallocate with the CGFunctionInfo object.
   Args = new ArgInfo[1 + NumArgs];
   Args[0].type = ResTy;
   for (unsigned i = 0; i < NumArgs; ++i)
@@ -593,9 +596,19 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI, bool IsVariadic) {
   case ABIArgInfo::Ignore:
     break;
 
-  case ABIArgInfo::Coerce:
-    ArgTys.push_back(AI.getCoerceToType());
+  case ABIArgInfo::Coerce: {
+    // If the coerce-to type is a first class aggregate, flatten it.  Either
+    // way is semantically identical, but fast-isel and the optimizer
+    // generally likes scalar values better than FCAs.
+    const llvm::Type *ArgTy = AI.getCoerceToType();
+    if (const llvm::StructType *STy = dyn_cast<llvm::StructType>(ArgTy)) {
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        ArgTys.push_back(STy->getElementType(i));
+    } else {
+      ArgTys.push_back(ArgTy);
+    }
     break;
+  }
 
   case ABIArgInfo::Indirect: {
     // indirect arguments are always on the stack, which is addr space #0.
@@ -713,7 +726,12 @@ void CodeGenModule::ConstructAttributeList(const CGFunctionInfo &FI,
     switch (AI.getKind()) {
     case ABIArgInfo::Coerce:
-      break;
+      if (const llvm::StructType *STy =
+              dyn_cast<llvm::StructType>(AI.getCoerceToType()))
+        Index += STy->getNumElements();
+      else
+        ++Index;
+      continue;  // Skip index increment.
 
     case ABIArgInfo::Indirect:
       if (AI.getIndirectByVal())
@@ -806,7 +824,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
     switch (ArgI.getKind()) {
     case ABIArgInfo::Indirect: {
-      llvm::Value* V = AI;
+      llvm::Value *V = AI;
       if (hasAggregateLLVMType(Ty)) {
         // Do nothing, aggregates and complex variables are accessed by
         // reference.
@@ -826,7 +844,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
     case ABIArgInfo::Extend:
     case ABIArgInfo::Direct: {
       assert(AI != Fn->arg_end() && "Argument mismatch!");
-      llvm::Value* V = AI;
+      llvm::Value *V = AI;
       if (hasAggregateLLVMType(Ty)) {
         // Create a temporary alloca to hold the argument; the rest of
         // codegen expects to access aggregates & complex values by
@@ -876,12 +894,29 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
       continue;
 
     case ABIArgInfo::Coerce: {
-      assert(AI != Fn->arg_end() && "Argument mismatch!");
+      // If the coerce-to type is a first class aggregate, we flatten it and
+      // pass the elements. Either way is semantically identical, but fast-isel
+      // and the optimizer generally likes scalar values better than FCAs.
+      llvm::Value *FormalArg;
+      if (const llvm::StructType *STy =
+              dyn_cast<llvm::StructType>(ArgI.getCoerceToType())) {
+        // Reconstruct the FCA here.
+        // FIXME: If we have a direct match, do nice gep/store series.
+        FormalArg = llvm::UndefValue::get(STy);
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          assert(AI != Fn->arg_end() && "Argument mismatch!");
+          FormalArg = Builder.CreateInsertValue(FormalArg, AI++, i);
+        }
+      } else {
+        assert(AI != Fn->arg_end() && "Argument mismatch!");
+        FormalArg = AI++;
+      }
+
       // FIXME: This is very wasteful; EmitParmDecl is just going to drop the
       // result in a new alloca anyway, so we could just store into that
       // directly if we broke the abstraction down more.
       llvm::Value *V = CreateMemTemp(Ty, "coerce");
-      CreateCoercedStore(AI, V, /*DestIsVolatile=*/false, *this);
+      CreateCoercedStore(FormalArg, V, /*DestIsVolatile=*/false, *this);
+
       // Match to what EmitParmDecl is expecting for this type.
       if (!CodeGenFunction::hasAggregateLLVMType(Ty)) {
         V = EmitLoadOfScalar(V, false, Ty);
@@ -892,7 +927,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
         }
       }
 
       EmitParmDecl(*Arg, V);
-      break;
+      continue;  // Skip ++AI increment, already done.
     }
     }
@@ -1080,8 +1115,22 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         StoreComplexToAddr(RV.getComplexVal(), SrcPtr, false);
       } else
         SrcPtr = RV.getAggregateAddr();
-      Args.push_back(CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(),
-                                       *this));
+
+      llvm::Value *SrcVal =
+        CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(), *this);
+
+      // If the coerce-to type is a first class aggregate, we flatten it and
+      // pass the elements. Either way is semantically identical, but fast-isel
+      // and the optimizer generally likes scalar values better than FCAs.
+      if (const llvm::StructType *STy =
+              dyn_cast<llvm::StructType>(SrcVal->getType())) {
+        // Extract the elements of the value to pass in.
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+          Args.push_back(Builder.CreateExtractValue(SrcVal, i));
+      } else {
+        Args.push_back(SrcVal);
+      }
       break;
     }

clang/test/CodeGen/x86_64-arguments.c

@@ -45,7 +45,7 @@ void f7(e7 a0) {
 // Test merging/passing of upper eightbyte with X87 class.
 //
 // CHECK: define %0 @f8_1()
-// CHECK: define void @f8_2(%0)
+// CHECK: define void @f8_2(i64, double)
 union u8 {
   long double a;
   int b;

clang/test/CodeGenCXX/x86_64-arguments.cpp

@@ -6,19 +6,19 @@
 // Basic base class test.
 struct f0_s0 { unsigned a; };
 struct f0_s1 : public f0_s0 { void *b; };
-// CHECK: define void @_Z2f05f0_s1([[i64_i64_ty]])
+// CHECK: define void @_Z2f05f0_s1(i64, i64)
 void f0(f0_s1 a0) { }
 
 // Check with two eight-bytes in base class.
 struct f1_s0 { unsigned a; unsigned b; float c; };
 struct f1_s1 : public f1_s0 { float d;};
-// CHECK: define void @_Z2f15f1_s1([[i64_double_ty]])
+// CHECK: define void @_Z2f15f1_s1(i64, double)
 void f1(f1_s1 a0) { }
 
 // Check with two eight-bytes in base class and merge.
 struct f2_s0 { unsigned a; unsigned b; float c; };
 struct f2_s1 : public f2_s0 { char d;};
-// CHECK: define void @_Z2f25f2_s1([[i64_i64_ty]])
+// CHECK: define void @_Z2f25f2_s1(i64, i64)
 void f2(f2_s1 a0) { }
 
 // PR5831
@@ -27,7 +27,7 @@ struct s3_1 { struct s3_0 a; long b; };
 void f3(struct s3_1 x) {}
 
 // CHECK: define i64 @_Z4f4_0M2s4i(i64)
-// CHECK: define [[i64_i64_ty]] @_Z4f4_1M2s4FivE([[i64_i64_ty]])
+// CHECK: define [[i64_i64_ty]] @_Z4f4_1M2s4FivE(i64, i64)
 struct s4 {};
 typedef int s4::* s4_mdp;
 typedef int (s4::*s4_mfp)();