diff --git a/clang/include/clang/AST/X86Builtins.def b/clang/include/clang/AST/X86Builtins.def index 27ebbbc13afe..95d000346378 100644 --- a/clang/include/clang/AST/X86Builtins.def +++ b/clang/include/clang/AST/X86Builtins.def @@ -175,8 +175,6 @@ BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "") BUILTIN(__builtin_ia32_maskmovq, "vV8cV8cc*", "") BUILTIN(__builtin_ia32_loadups, "V4ffC*", "") BUILTIN(__builtin_ia32_storeups, "vf*V4f", "") -BUILTIN(__builtin_ia32_loadhps, "V4fV4fV2i*", "") -BUILTIN(__builtin_ia32_loadlps, "V4fV4fV2i*", "") BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "") BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "") BUILTIN(__builtin_ia32_movmskps, "iV4f", "") @@ -194,8 +192,6 @@ BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "") BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "") BUILTIN(__builtin_ia32_loadupd, "V2ddC*", "") BUILTIN(__builtin_ia32_storeupd, "vd*V2d", "") -BUILTIN(__builtin_ia32_loadhpd, "V2dV2ddC*", "") -BUILTIN(__builtin_ia32_loadlpd, "V2dV2ddC*", "") BUILTIN(__builtin_ia32_movmskpd, "iV2d", "") BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "") BUILTIN(__builtin_ia32_movnti, "vi*i", "") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 8a565985f53f..3c7c5e539879 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -764,28 +764,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_sse2_cmp_sd); return Builder.CreateCall(F, &Ops[0], &Ops[0] + Ops.size(), "cmpsd"); } - case X86::BI__builtin_ia32_loadlps: - case X86::BI__builtin_ia32_loadhps: { - // FIXME: This should probably be represented as - // shuffle (dst, (v4f32 (insert undef, (load i64), 0)), shuf mask hi/lo) - const llvm::Type *EltTy = llvm::Type::DoubleTy; - const llvm::Type *VecTy = llvm::VectorType::get(EltTy, 2); - const llvm::Type *OrigTy = Ops[0]->getType(); - unsigned Index = BuiltinID == X86::BI__builtin_ia32_loadlps ? 
0 : 1; - llvm::Value *Idx = llvm::ConstantInt::get(llvm::Type::Int32Ty, Index); - Ops[1] = Builder.CreateBitCast(Ops[1], llvm::PointerType::getUnqual(EltTy)); - Ops[1] = Builder.CreateLoad(Ops[1], "tmp"); - Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Ops[0] = Builder.CreateInsertElement(Ops[0], Ops[1], Idx, "loadps"); - return Builder.CreateBitCast(Ops[0], OrigTy, "loadps"); - } - case X86::BI__builtin_ia32_loadlpd: - case X86::BI__builtin_ia32_loadhpd: { - Ops[1] = Builder.CreateLoad(Ops[1], "tmp"); - unsigned Index = BuiltinID == X86::BI__builtin_ia32_loadlpd ? 0 : 1; - llvm::Value *Idx = llvm::ConstantInt::get(llvm::Type::Int32Ty, Index); - return Builder.CreateInsertElement(Ops[0], Ops[1], Idx, "loadpd"); - } case X86::BI__builtin_ia32_storehps: case X86::BI__builtin_ia32_storelps: { const llvm::Type *EltTy = llvm::Type::Int64Ty; diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 7291f88979e4..2903049170c2 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -464,20 +464,19 @@ _mm_cvtss_f32(__m128 a) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_loadh_pi(__m128 a, __m64 const *p) { - return __builtin_ia32_loadhps(a, (__v2si *)p); + __m128 b; + b[0] = *(float*)p; + b[1] = *((float*)p+1); + return __builtin_shufflevector(a, b, 0, 1, 4, 5); } static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_loadl_pi(__m128 a, __m64 const *p) { -#if 0 - // FIXME: This should work, but gives really crappy code at the moment __m128 b; b[0] = *(float*)p; b[1] = *((float*)p+1); - return __builtin_shufflevector(a, b, 0, 1, 4, 5); -#endif - return __builtin_ia32_loadlps(a, (__v2si *)p); + return __builtin_shufflevector(a, b, 4, 5, 2, 3); } static inline __m128 __attribute__((__always_inline__, __nodebug__)) @@ -899,8 +898,6 @@ do { \ (row3) = _mm_movelh_ps(tmp3, tmp1); \ } while (0) -#include <emmintrin.h> - #endif /* __SSE__ */ #endif /* __XMMINTRIN_H */ diff --git 
a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c index c889a2feab8e..8d4bcbfab17b 100644 --- a/clang/test/CodeGen/builtins-x86.c +++ b/clang/test/CodeGen/builtins-x86.c @@ -250,8 +250,6 @@ void f0() { (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); tmp_V4f = __builtin_ia32_loadups(tmp_fCp); (void) __builtin_ia32_storeups(tmp_fp, tmp_V4f); - tmp_V4f = __builtin_ia32_loadhps(tmp_V4f, tmp_V2ip); - tmp_V4f = __builtin_ia32_loadlps(tmp_V4f, tmp_V2ip); (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f); (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f); tmp_i = __builtin_ia32_movmskps(tmp_V4f); @@ -270,8 +268,6 @@ void f0() { (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp); tmp_V2d = __builtin_ia32_loadupd(tmp_dCp); (void) __builtin_ia32_storeupd(tmp_dp, tmp_V2d); - tmp_V2d = __builtin_ia32_loadhpd(tmp_V2d, tmp_dCp); - tmp_V2d = __builtin_ia32_loadlpd(tmp_V2d, tmp_dCp); tmp_i = __builtin_ia32_movmskpd(tmp_V2d); tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c); (void) __builtin_ia32_movnti(tmp_ip, tmp_i);