AVX-512: added a calling convention for i1 vectors in 32-bit mode.

Fixed some bugs in extend/truncate lowering for the AVX-512 target.
Removed the VBROADCASTM (masked broadcast) node, since it is no longer used.

llvm-svn: 236420
commit 60eb9db7bb
parent fa4de125c9
Author: Elena Demikhovsky
Date:   2015-05-04 12:40:50 +00:00

6 changed files with 103 additions and 60 deletions
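For illustration, with the new CC_X86_32_Common entries below, an i1-vector call such as the following (hypothetical caller and callee, not taken from this commit's tests) now lowers when compiled with llc -mtriple=i686-apple-darwin -mcpu=knl; the <8 x i1> values travel promoted to v8i16 per the added rules:

; hypothetical functions for illustration; <8 x i1> is promoted to v8i16
declare <8 x i1> @callee(<8 x i1>)

define <8 x i1> @caller(<8 x i1> %mask) {
  %r = call <8 x i1> @callee(<8 x i1> %mask)
  ret <8 x i1> %r
}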


@@ -268,7 +268,7 @@ def CC_X86_64_C : CallingConv<[
       CCIfSubtarget<"hasSSE2()",
         CCPromoteToType<v2i64>>>>,
 
-  // Boolean vectors of AVX-512 are returned in SIMD registers.
+  // Boolean vectors of AVX-512 are passed in SIMD registers.
   // The call from AVX to AVX-512 function should work,
   // since the boolean types in AVX/AVX2 are promoted by default.
   CCIfType<[v2i1],  CCPromoteToType<v2i64>>,
@@ -472,6 +472,16 @@ def CC_X86_32_Common : CallingConv<[
   // Long doubles get slots whose size depends on the subtarget.
   CCIfType<[f80], CCAssignToStack<0, 4>>,
 
+  // Boolean vectors of AVX-512 are passed in SIMD registers.
+  // The call from AVX to AVX-512 function should work,
+  // since the boolean types in AVX/AVX2 are promoted by default.
+  CCIfType<[v2i1],  CCPromoteToType<v2i64>>,
+  CCIfType<[v4i1],  CCPromoteToType<v4i32>>,
+  CCIfType<[v8i1],  CCPromoteToType<v8i16>>,
+  CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+  CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+  CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
   // The first 4 SSE vector arguments are passed in XMM registers.
   CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
                 CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,


@@ -1312,6 +1312,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
+    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8,  Custom);
@@ -2078,7 +2080,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
     CCValAssign &VA = RVLocs[i];
-    EVT CopyVT = VA.getValVT();
+    EVT CopyVT = VA.getLocVT();
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
@@ -2088,15 +2090,18 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
     // If we prefer to use the value in xmm registers, copy it out as f80 and
     // use a truncate to move it from fp stack reg to xmm reg.
+    bool RoundAfterCopy = false;
     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
-        isScalarFPTypeInSSEReg(VA.getValVT()))
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
       CopyVT = MVT::f80;
+      RoundAfterCopy = (CopyVT != VA.getLocVT());
+    }
 
     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                CopyVT, InFlag).getValue(1);
     SDValue Val = Chain.getValue(0);
 
-    if (CopyVT != VA.getValVT())
+    if (RoundAfterCopy)
       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                         // This truncation won't change the value.
                         DAG.getIntPtrConstant(1, dl));
@@ -2825,7 +2830,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
       break;
     case CCValAssign::AExt:
-      if (RegVT.is128BitVector()) {
+      if (Arg.getValueType().getScalarType() == MVT::i1)
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      else if (RegVT.is128BitVector()) {
         // Special case: passing MMX values in XMM registers.
         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
@@ -11969,6 +11976,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   MVT InVT = In.getSimpleValueType();
   SDLoc dl(Op);
 
+  if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1)
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
+
   // Optimize vectors in AVX mode:
   //
   //   v8i16 -> v8i32
@@ -12018,22 +12028,17 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
 
-  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
-  // Now we have only mask extension
   assert(InVT.getVectorElementType() == MVT::i1);
-  SDValue Cst = DAG.getTargetConstant(1, DL, ExtVT.getScalarType());
-  const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
-  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
-                           MachinePointerInfo::getConstantPool(),
-                           false, false, false, Alignment);
+  MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+  SDValue One =
+      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
+  SDValue Zero =
+      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
 
-  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+  SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
   if (VT.is512BitVector())
-    return Brcst;
-  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+    return V;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
 }
 
 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
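As a sketch of what the rewritten LowerZERO_EXTEND_AVX512 now handles, a mask zero-extension like the following (function name invented for the example) is lowered as a VSELECT between splats of 1 and 0 instead of a constant-pool load feeding VBROADCASTM:

; illustrative only: %m is a v16i1 mask, zero-extended to 0/1 lanes
define <16 x i32> @zext_mask(<16 x i32> %a, <16 x i32> %b) {
  %m = icmp sgt <16 x i32> %a, %b
  %z = zext <16 x i1> %m to <16 x i32>
  ret <16 x i32> %z
}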
@@ -12115,14 +12120,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
       InVT = ExtVT;
     }
 
-    SDValue Cst = DAG.getTargetConstant(1, DL, InVT.getVectorElementType());
-    const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
-    SDValue CP = DAG.getConstantPool(C, getPointerTy());
-    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
-                             MachinePointerInfo::getConstantPool(),
-                             false, false, false, Alignment);
-    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+    SDValue OneV =
+        DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
   }
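For reference, a truncate to an i1 vector such as the one below (function name invented) now ANDs the input with a splat constant materialized directly by DAG.getConstant and feeds the result to TESTM, rather than loading the constant from the constant pool and broadcasting it:

; illustrative only: each i32 lane is reduced to one mask bit
define <16 x i1> @trunc_to_mask(<16 x i32> %x) {
  %t = trunc <16 x i32> %x to <16 x i1>
  ret <16 x i1> %t
}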
@@ -13802,22 +13801,18 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   }
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
-
-  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
-  Constant *C = ConstantInt::get(*DAG.getContext(),
-      APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
-  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
-                           MachinePointerInfo::getConstantPool(),
-                           false, false, false, Alignment);
-  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
+  MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+  SDValue NegOne =
+      DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
+                      ExtVT);
+  SDValue Zero =
+      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+
+  SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
   if (VT.is512BitVector())
-    return Brcst;
-  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
+    return V;
+  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
 }
 
 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
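Likewise for sign extension: a plain mask sext like the following (invented name; the added tests further down exercise the same path through calls) now selects between all-ones and zero splats via VSELECT:

; illustrative only: the v8i1 mask becomes 0/-1 in each i64 lane
define <8 x i64> @sext_mask(<8 x i64> %a, <8 x i64> %b) {
  %m = icmp sgt <8 x i64> %a, %b
  %s = sext <8 x i1> %m to <8 x i64>
  ret <8 x i64> %s
}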
@@ -17860,7 +17855,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UNPCKL:      return "X86ISD::UNPCKL";
   case X86ISD::UNPCKH:      return "X86ISD::UNPCKH";
   case X86ISD::VBROADCAST:  return "X86ISD::VBROADCAST";
-  case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
   case X86ISD::VEXTRACT:    return "X86ISD::VEXTRACT";
   case X86ISD::VPERMILPI:   return "X86ISD::VPERMILPI";
   case X86ISD::VPERM2X128:  return "X86ISD::VPERM2X128";


@@ -371,8 +371,6 @@ namespace llvm {
       VPERMI,
       VPERM2X128,
       VBROADCAST,
-      // masked broadcast
-      VBROADCASTM,
       // Insert/Extract vector element
       VINSERT,
       VEXTRACT,


@@ -802,12 +802,8 @@ def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
 
 def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
           (VPBROADCASTDrZr GR32:$src)>;
-def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))),
-          (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>;
 
 def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
           (VPBROADCASTQrZr GR64:$src)>;
-def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
-          (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>;
 
 def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
           (VPBROADCASTDrZr GR32:$src)>;
@@ -829,24 +825,33 @@ multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set DstRC:$dst,
                     (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
-  def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
+                                                          VR128X:$src),
+                   !strconcat(OpcodeStr,
+                   "\t{$src, ${dst} {${mask}} |${dst} {${mask}}, $src}"),
+                   []>, EVEX, EVEX_K;
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
                                                           VR128X:$src),
                    !strconcat(OpcodeStr,
                    "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
-                   [(set DstRC:$dst,
-                     (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
-                   EVEX, EVEX_KZ;
+                   []>, EVEX, EVEX_KZ;
   let mayLoad = 1 in {
   def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set DstRC:$dst,
                     (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
-  def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
+                                                          x86memop:$src),
+                   !strconcat(OpcodeStr,
+                   "\t{$src, ${dst} {${mask}}|${dst} {${mask}} , $src}"),
+                   []>, EVEX, EVEX_K;
+  def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
                                                           x86memop:$src),
                    !strconcat(OpcodeStr,
                    "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
-                   [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask,
-                                            (ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
+                   [(set DstRC:$dst, (OpVT (vselect KRC:$mask,
+                                            (X86VBroadcast (ld_frag addr:$src)),
+                                            (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ;
   }
 }
@@ -907,12 +912,6 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
           (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
 
-let Predicates = [HasAVX512] in {
-def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
-          (EXTRACT_SUBREG
-            (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
-                      addr:$src)), sub_ymm)>;
-}
 
 //===----------------------------------------------------------------------===//
 // AVX-512 BROADCAST MASK TO VECTOR REGISTER
 //---


@@ -273,7 +273,6 @@ def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>;
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
-def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
 def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
                         [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
 def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,


@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
 
 ; KNL-LABEL: test1
 ; KNL: vxorps
@@ -42,3 +43,45 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
   ret <4 x i1> %c
 }
 
+; SKX-LABEL: test5
+; SKX: vpcmpgtd
+; SKX: vpmovm2w
+; SKX: call
+; SKX: vpmovzxwd
+declare <8 x i1> @func8xi1(<8 x i1> %a)
+
+define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
+  %cmpRes = icmp sgt <8 x i32>%a, %b
+  %resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes)
+  %res = sext <8 x i1>%resi to <8 x i32>
+  ret <8 x i32> %res
+}
+
+declare <16 x i1> @func16xi1(<16 x i1> %a)
+
+; KNL-LABEL: test6
+; KNL: vpbroadcastd
+; KNL: vpmovdb
+; KNL: call
+; KNL: vpmovzxbd
+; KNL: vpslld $31, %zmm
+; KNL: vpsrad $31, %zmm
+define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
+  %cmpRes = icmp sgt <16 x i32>%a, %b
+  %resi = call <16 x i1> @func16xi1(<16 x i1> %cmpRes)
+  %res = sext <16 x i1>%resi to <16 x i32>
+  ret <16 x i32> %res
+}
+
+declare <4 x i1> @func4xi1(<4 x i1> %a)
+
+; SKX-LABEL: test7
+; SKX: vpmovm2d
+; SKX: call
+; SKX: vpslld $31, %xmm
+; SKX: vpsrad $31, %xmm
+define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
+  %cmpRes = icmp sgt <4 x i32>%a, %b
+  %resi = call <4 x i1> @func4xi1(<4 x i1> %cmpRes)
+  %res = sext <4 x i1>%resi to <4 x i32>
+  ret <4 x i32> %res
+}