diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 9719dfa43289..eb517edd816f 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -240,7 +240,8 @@ const CGFunctionInfo &CodeGenTypes::getFunctionInfo(CanQualType ResTy,
     return *FI;
 
   // Construct the function info.
-  FI = new CGFunctionInfo(CC, Info.getNoReturn(), Info.getRegParm(), ResTy, ArgTys);
+  FI = new CGFunctionInfo(CC, Info.getNoReturn(), Info.getRegParm(), ResTy,
+                          ArgTys);
   FunctionInfos.InsertNode(FI, InsertPos);
 
   // Compute ABI information.
@@ -259,6 +260,8 @@ CGFunctionInfo::CGFunctionInfo(unsigned _CallingConvention,
     NoReturn(_NoReturn), RegParm(_RegParm)
 {
   NumArgs = ArgTys.size();
+  
+  // FIXME: Coallocate with the CGFunctionInfo object.
   Args = new ArgInfo[1 + NumArgs];
   Args[0].type = ResTy;
   for (unsigned i = 0; i < NumArgs; ++i)
@@ -593,9 +596,19 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI, bool IsVariadic) {
     case ABIArgInfo::Ignore:
       break;
 
-    case ABIArgInfo::Coerce:
-      ArgTys.push_back(AI.getCoerceToType());
+    case ABIArgInfo::Coerce: {
+      // If the coerce-to type is a first class aggregate, flatten it.  Either
+      // way is semantically identical, but fast-isel and the optimizer
+      // generally likes scalar values better than FCAs.
+      const llvm::Type *ArgTy = AI.getCoerceToType();
+      if (const llvm::StructType *STy = dyn_cast<llvm::StructType>(ArgTy)) {
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+          ArgTys.push_back(STy->getElementType(i));
+      } else {
+        ArgTys.push_back(ArgTy);
+      }
       break;
+    }
 
     case ABIArgInfo::Indirect: {
       // indirect arguments are always on the stack, which is addr space #0.
@@ -713,7 +726,12 @@ void CodeGenModule::ConstructAttributeList(const CGFunctionInfo &FI,
 
     switch (AI.getKind()) {
     case ABIArgInfo::Coerce:
-      break;
+      if (const llvm::StructType *STy =
+          dyn_cast<llvm::StructType>(AI.getCoerceToType()))
+        Index += STy->getNumElements();
+      else
+        ++Index;
+      continue;  // Skip index increment.
 
     case ABIArgInfo::Indirect:
       if (AI.getIndirectByVal())
@@ -806,7 +824,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
 
     switch (ArgI.getKind()) {
     case ABIArgInfo::Indirect: {
-      llvm::Value* V = AI;
+      llvm::Value *V = AI;
       if (hasAggregateLLVMType(Ty)) {
         // Do nothing, aggregates and complex variables are accessed by
         // reference.
@@ -826,7 +844,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
     case ABIArgInfo::Extend:
     case ABIArgInfo::Direct: {
       assert(AI != Fn->arg_end() && "Argument mismatch!");
-      llvm::Value* V = AI;
+      llvm::Value *V = AI;
       if (hasAggregateLLVMType(Ty)) {
         // Create a temporary alloca to hold the argument; the rest of
         // codegen expects to access aggregates & complex values by
@@ -876,12 +894,29 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
       continue;
 
     case ABIArgInfo::Coerce: {
-      assert(AI != Fn->arg_end() && "Argument mismatch!");
+      // If the coerce-to type is a first class aggregate, we flatten it and
+      // pass the elements. Either way is semantically identical, but fast-isel
+      // and the optimizer generally likes scalar values better than FCAs.
+      llvm::Value *FormalArg;
+      if (const llvm::StructType *STy =
+            dyn_cast<llvm::StructType>(ArgI.getCoerceToType())) {
+        // Reconstruct the FCA here.
+        // FIXME: If we have a direct match, do nice gep/store series.
+        FormalArg = llvm::UndefValue::get(STy);
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          assert(AI != Fn->arg_end() && "Argument mismatch!");
+          FormalArg = Builder.CreateInsertValue(FormalArg, AI++, i);
+        }
+      } else {
+        assert(AI != Fn->arg_end() && "Argument mismatch!");
+        FormalArg = AI++;
+      }
+      
       // FIXME: This is very wasteful; EmitParmDecl is just going to drop the
       // result in a new alloca anyway, so we could just store into that
       // directly if we broke the abstraction down more.
       llvm::Value *V = CreateMemTemp(Ty, "coerce");
-      CreateCoercedStore(AI, V, /*DestIsVolatile=*/false, *this);
+      CreateCoercedStore(FormalArg, V, /*DestIsVolatile=*/false, *this);
       // Match to what EmitParmDecl is expecting for this type.
       if (!CodeGenFunction::hasAggregateLLVMType(Ty)) {
         V = EmitLoadOfScalar(V, false, Ty);
@@ -892,7 +927,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
         }
       }
       EmitParmDecl(*Arg, V);
-      break;
+      continue;  // Skip ++AI increment, already done.
     }
     }
 
@@ -1080,8 +1115,22 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         StoreComplexToAddr(RV.getComplexVal(), SrcPtr, false);
       } else
         SrcPtr = RV.getAggregateAddr();
-      Args.push_back(CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(),
-                                       *this));
+      
+      llvm::Value *SrcVal = 
+        CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(), *this);
+      
+      // If the coerce-to type is a first class aggregate, we flatten it and
+      // pass the elements. Either way is semantically identical, but fast-isel
+      // and the optimizer generally likes scalar values better than FCAs.
+      if (const llvm::StructType *STy =
+            dyn_cast<llvm::StructType>(SrcVal->getType())) {
+        // Extract the elements of the value to pass in.
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+          Args.push_back(Builder.CreateExtractValue(SrcVal, i));
+      } else {
+        Args.push_back(SrcVal);
+      }
+      
       break;
     }
 
diff --git a/clang/test/CodeGen/x86_64-arguments.c b/clang/test/CodeGen/x86_64-arguments.c
index 242239010881..2a96f22e1dc9 100644
--- a/clang/test/CodeGen/x86_64-arguments.c
+++ b/clang/test/CodeGen/x86_64-arguments.c
@@ -45,7 +45,7 @@ void f7(e7 a0) {
 // Test merging/passing of upper eightbyte with X87 class.
 //
 // CHECK: define %0 @f8_1()
-// CHECK: define void @f8_2(%0)
+// CHECK: define void @f8_2(i64, double)
 union u8 {
   long double a;
   int b;
diff --git a/clang/test/CodeGenCXX/x86_64-arguments.cpp b/clang/test/CodeGenCXX/x86_64-arguments.cpp
index 4bc83b85134e..bc2f312aea14 100644
--- a/clang/test/CodeGenCXX/x86_64-arguments.cpp
+++ b/clang/test/CodeGenCXX/x86_64-arguments.cpp
@@ -6,19 +6,19 @@
 // Basic base class test.
 struct f0_s0 { unsigned a; };
 struct f0_s1 : public f0_s0 { void *b; };
-// CHECK: define void @_Z2f05f0_s1([[i64_i64_ty]])
+// CHECK: define void @_Z2f05f0_s1(i64, i64)
 void f0(f0_s1 a0) { }
 
 // Check with two eight-bytes in base class.
 struct f1_s0 { unsigned a; unsigned b; float c; };
 struct f1_s1 : public f1_s0 { float d;};
-// CHECK: define void @_Z2f15f1_s1([[i64_double_ty]])
+// CHECK: define void @_Z2f15f1_s1(i64, double)
 void f1(f1_s1 a0) { }
 
 // Check with two eight-bytes in base class and merge.
 struct f2_s0 { unsigned a; unsigned b; float c; };
 struct f2_s1 : public f2_s0 { char d;};
-// CHECK: define void @_Z2f25f2_s1([[i64_i64_ty]])
+// CHECK: define void @_Z2f25f2_s1(i64, i64)
 void f2(f2_s1 a0) { }
 
 // PR5831
@@ -27,7 +27,7 @@ struct s3_1 { struct s3_0 a; long b; };
 void f3(struct s3_1 x) {}
 
 // CHECK: define i64 @_Z4f4_0M2s4i(i64)
-// CHECK: define [[i64_i64_ty]] @_Z4f4_1M2s4FivE([[i64_i64_ty]])
+// CHECK: define [[i64_i64_ty]] @_Z4f4_1M2s4FivE(i64, i64)
 struct s4 {};
 typedef int s4::* s4_mdp;
 typedef int (s4::*s4_mfp)();