diff --git a/llvm/test/CodeGen/X86/vec_splat-2.ll b/llvm/test/CodeGen/X86/vec_splat-2.ll index f105de4d977d..5c668b7e5a5b 100644 --- a/llvm/test/CodeGen/X86/vec_splat-2.ll +++ b/llvm/test/CodeGen/X86/vec_splat-2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd | count 1 +; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s define void @test(<2 x i64>* %P, i8 %x) nounwind { %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1] @@ -23,4 +23,11 @@ define void @test(<2 x i64>* %P, i8 %x) nounwind { %tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64> ; <<2 x i64>> [#uses=1] store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P ret void + +; CHECK: test: +; CHECK-NOT: pshufd +; CHECK: punpcklbw +; CHECK: punpcklbw +; CHECK: pshufd $0 +; CHECK-NOT: pshufd } diff --git a/llvm/test/CodeGen/X86/vec_splat-3.ll b/llvm/test/CodeGen/X86/vec_splat-3.ll index feacc42406df..293ed488668b 100644 --- a/llvm/test/CodeGen/X86/vec_splat-3.ll +++ b/llvm/test/CodeGen/X86/vec_splat-3.ll @@ -1,55 +1,240 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t -; RUN: grep punpcklwd %t | count 4 -; RUN: grep punpckhwd %t | count 4 -; RUN: grep "pshufd" %t | count 8 +; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s ; Splat test for v8i16 ; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used twice) define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_0: +; CHECK: punpcklwd +; CHECK-NEXT: pshufd $0 } define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_1: +; CHECK: punpcklwd +; CHECK-NEXT: pshufd $85 } define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_2: +; CHECK: punpcklwd +; CHECK-NEXT: pshufd $-86 } define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_3: +; CHECK: punpcklwd +; CHECK-NEXT: pshufd $-1 } define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_4: +; CHECK: punpckhwd +; CHECK-NEXT: pshufd $0 } define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_5: +; CHECK: punpckhwd +; CHECK-NEXT: pshufd $85 } define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_6: +; CHECK: punpckhwd +; CHECK-NEXT: pshufd $-86 } define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_7: +; CHECK: punpckhwd +; CHECK-NEXT: pshufd $-1 +} + +; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used 4 times) + +; Splat test for v16i8 +define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_8: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $0 +} + +define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_9: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_10: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_11: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-1 +} + + +define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_12: +; CHECK: punpcklbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $0 +} + +define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_13: +; CHECK: punpcklbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_14: +; CHECK: punpcklbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_15: +; CHECK: punpcklbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-1 +} + +define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_16: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $0 +} + +define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_17: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_18: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_19: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-1 +} + +define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_20: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $0 +} + +define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_21: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_22: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_23: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-1 } diff --git a/llvm/test/CodeGen/X86/vec_splat-4.ll b/llvm/test/CodeGen/X86/vec_splat-4.ll deleted file mode 100644 index 374acfa4e094..000000000000 --- a/llvm/test/CodeGen/X86/vec_splat-4.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t -; RUN: grep punpcklbw %t | count 16 -; RUN: grep punpckhbw %t | count 16 -; RUN: grep "pshufd" %t | count 16 - -; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used 4 times) - -; Splat test for v16i8 -define <16 x i8 > @shuf_16i8_0(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0 , i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_1(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_2(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2 , i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_3(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3 , i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3 > - ret <16 x i8 > %tmp6 -} - - -define <16 x i8 > @shuf_16i8_4(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_5(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5 , i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_6(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6 , i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_7(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef , i32 undef , i32 undef, i32 undef, i32 undef , i32 undef > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_8(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8 , i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_9(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9 , i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_10(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10 , i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_11(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11 , i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_12(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12 , i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_13(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13 , i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_14(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14 , i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_15(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15 , i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15 > - ret <16 x i8 > %tmp6 -} diff --git a/llvm/test/CodeGen/X86/vec_splat.ll b/llvm/test/CodeGen/X86/vec_splat.ll index 24d8487f17bd..deedee801967 100644 --- a/llvm/test/CodeGen/X86/vec_splat.ll +++ b/llvm/test/CodeGen/X86/vec_splat.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse3 | grep movddup +; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2 +; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3 define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] @@ -10,6 +10,12 @@ define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { %tmp10 = fmul <4 x float> %tmp8, %tmp6 ; <<4 x float>> [#uses=1] store <4 x float> %tmp10, <4 x float>* %P ret void + +; SSE2: test_v4sf: +; SSE2: pshufd $0 + +; SSE3: test_v4sf: +; SSE3: pshufd $0 } define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { @@ -19,4 +25,10 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { %tmp6 = fmul <2 x double> %tmp4, %tmp2 ; <<2 x double>> [#uses=1] store <2 x double> %tmp6, <2 x double>* %P ret void + +; SSE2: test_v2sd: +; SSE2: shufpd $0 + +; SSE3: test_v2sd: +; SSE3: movddup }