Match VTRN, VZIP, and VUZP shuffles. Restore the tests for these operations,

now using shuffles instead of intrinsics.

llvm-svn: 79673
This commit is contained in:
Bob Wilson 2009-08-21 20:54:19 +00:00
parent f31a44ec01
commit a70623102e
5 changed files with 331 additions and 9 deletions

View File

@ -2403,6 +2403,53 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
return true;
}
static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((unsigned) M[i] != i + WhichResult ||
(unsigned) M[i+1] != i + NumElts + WhichResult)
return false;
}
return true;
}
static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i != NumElts; ++i) {
if ((unsigned) M[i] != 2 * i + WhichResult)
return false;
}
// VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
return false;
return true;
}
static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
if ((unsigned) M[i] != Idx ||
(unsigned) M[i+1] != Idx + NumElts)
return false;
Idx += 1;
}
// VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
return false;
return true;
}
static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) {
// Canonicalize all-zeros and all-ones vectors.
ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode());
@ -2504,13 +2551,16 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
}
bool ReverseVEXT;
unsigned Imm;
unsigned Imm, WhichResult;
return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isVREVMask(M, VT, 64) ||
isVREVMask(M, VT, 32) ||
isVREVMask(M, VT, 16) ||
isVEXTMask(M, VT, ReverseVEXT, Imm));
isVEXTMask(M, VT, ReverseVEXT, Imm) ||
isVTRNMask(M, VT, WhichResult) ||
isVUZPMask(M, VT, WhichResult) ||
isVZIPMask(M, VT, WhichResult));
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
@ -2610,11 +2660,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
bool ReverseVEXT;
unsigned Imm;
if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
SDValue Op0 = SVN->getOperand(0);
SDValue Op1 = SVN->getOperand(1);
if (ReverseVEXT)
std::swap(Op0, Op1);
return DAG.getNode(ARMISD::VEXT, dl, VT, Op0, Op1,
std::swap(V1, V2);
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
DAG.getConstant(Imm, MVT::i32));
}
@ -2625,6 +2673,24 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
// Check for Neon shuffles that modify both input vectors in place.
// If both results are used, i.e., if there are two shuffles with the same
// source operands and with masks corresponding to both results of one of
// these operations, DAG memoization will ensure that a single node is
// used for both shuffles.
unsigned WhichResult;
if (isVTRNMask(ShuffleMask, VT, WhichResult))
return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
V1, V2).getValue(WhichResult);
if (isVUZPMask(ShuffleMask, VT, WhichResult))
return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
V1, V2).getValue(WhichResult);
if (isVZIPMask(ShuffleMask, VT, WhichResult))
return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
V1, V2).getValue(WhichResult);
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];

View File

@ -132,9 +132,8 @@ namespace llvm {
VREV64, // reverse elements within 64-bit doublewords
VREV32, // reverse elements within 32-bit words
VREV16, // reverse elements within 16-bit halfwords
VZIP, // zip
VUZP, // unzip
VZIP, // zip (interleave)
VUZP, // unzip (deinterleave)
VTRN // transpose
};
}

View File

@ -0,0 +1,99 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
%struct.__builtin_neon_v8qi2 = type { <8 x i8>, <8 x i8> }
%struct.__builtin_neon_v4hi2 = type { <4 x i16>, <4 x i16> }
%struct.__builtin_neon_v2si2 = type { <2 x i32>, <2 x i32> }
%struct.__builtin_neon_v2sf2 = type { <2 x float>, <2 x float> }
%struct.__builtin_neon_v16qi2 = type { <16 x i8>, <16 x i8> }
%struct.__builtin_neon_v8hi2 = type { <8 x i16>, <8 x i16> }
%struct.__builtin_neon_v4si2 = type { <4 x i32>, <4 x i32> }
%struct.__builtin_neon_v4sf2 = type { <4 x float>, <4 x float> }
define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vtrni8:
;CHECK: vtrn.8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vtrni16:
;CHECK: vtrn.16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
}
define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vtrni32:
;CHECK: vtrn.32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
%tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
%tmp5 = add <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
}
define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vtrnf:
;CHECK: vtrn.32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
%tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
%tmp5 = add <2 x float> %tmp3, %tmp4
ret <2 x float> %tmp5
}
define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vtrnQi8:
;CHECK: vtrn.8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
%tmp5 = add <16 x i8> %tmp3, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vtrnQi16:
;CHECK: vtrn.16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vtrnQi32:
;CHECK: vtrn.32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vtrnQf:
;CHECK: vtrn.32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%tmp5 = add <4 x float> %tmp3, %tmp4
ret <4 x float> %tmp5
}

View File

@ -0,0 +1,79 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
%struct.__builtin_neon_v8qi2 = type { <8 x i8>, <8 x i8> }
%struct.__builtin_neon_v4hi2 = type { <4 x i16>, <4 x i16> }
%struct.__builtin_neon_v2si2 = type { <2 x i32>, <2 x i32> }
%struct.__builtin_neon_v2sf2 = type { <2 x float>, <2 x float> }
%struct.__builtin_neon_v16qi2 = type { <16 x i8>, <16 x i8> }
%struct.__builtin_neon_v8hi2 = type { <8 x i16>, <8 x i16> }
%struct.__builtin_neon_v4si2 = type { <4 x i32>, <4 x i32> }
%struct.__builtin_neon_v4sf2 = type { <4 x float>, <4 x float> }
define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vuzpi8:
;CHECK: vuzp.8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vuzpi16:
;CHECK: vuzp.16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
}
; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vuzpQi8:
;CHECK: vuzp.8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%tmp5 = add <16 x i8> %tmp3, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vuzpQi16:
;CHECK: vuzp.16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vuzpQi32:
;CHECK: vuzp.32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vuzpQf:
;CHECK: vuzp.32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%tmp5 = add <4 x float> %tmp3, %tmp4
ret <4 x float> %tmp5
}

View File

@ -0,0 +1,79 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
%struct.__builtin_neon_v8qi2 = type { <8 x i8>, <8 x i8> }
%struct.__builtin_neon_v4hi2 = type { <4 x i16>, <4 x i16> }
%struct.__builtin_neon_v2si2 = type { <2 x i32>, <2 x i32> }
%struct.__builtin_neon_v2sf2 = type { <2 x float>, <2 x float> }
%struct.__builtin_neon_v16qi2 = type { <16 x i8>, <16 x i8> }
%struct.__builtin_neon_v8hi2 = type { <8 x i16>, <8 x i16> }
%struct.__builtin_neon_v4si2 = type { <4 x i32>, <4 x i32> }
%struct.__builtin_neon_v4sf2 = type { <4 x float>, <4 x float> }
define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vzipi8:
;CHECK: vzip.8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vzipi16:
;CHECK: vzip.16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
}
; VZIP.32 is equivalent to VTRN.32 for 64-bit vectors.
define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vzipQi8:
;CHECK: vzip.8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
%tmp5 = add <16 x i8> %tmp3, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vzipQi16:
;CHECK: vzip.16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vzipQi32:
;CHECK: vzip.32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vzipQf:
;CHECK: vzip.32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%tmp5 = add <4 x float> %tmp3, %tmp4
ret <4 x float> %tmp5
}