Fix PR 17368: disable vector mul distribution for square of add/sub for ARM

Generally, it is desirable to distribute (a + b) * c to a*c + b*c for
ARM with VMLx forwarding, where a, b and c are vectors.
However, for (a + b)*(a + b), distribution will result in one extra
instruction.
With distribution:
  x = a + b (add)
  y = a * x (mul)
  z = y + b * x (mla)

Without distribution:
  x = a + b (add)
  z = x * x (mul)

This patch checks if a mul is a square of add/sub. If yes, skip
distribution.

llvm-svn: 191410
This commit is contained in:
Weiming Zhao 2013-09-25 23:12:06 +00:00
parent a9e303e746
commit 2052f4843b
2 changed files with 21 additions and 0 deletions

View File

@ -8342,6 +8342,13 @@ static SDValue PerformSUBCombine(SDNode *N,
/// is faster than
/// vadd d3, d0, d1
/// vmul d3, d3, d2
/// However, for (A + B) * (A + B),
/// vadd d2, d0, d1
/// vmul d3, d0, d2
/// vmla d3, d1, d2
/// is slower than
/// vadd d2, d0, d1
/// vmul d3, d2, d2
static SDValue PerformVMULCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@ -8361,6 +8368,9 @@ static SDValue PerformVMULCombine(SDNode *N,
std::swap(N0, N1);
}
if (N0 == N1)
return SDValue();
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue N00 = N0->getOperand(0);

View File

@ -515,6 +515,17 @@ entry:
ret void
}
; Squares the vector sum %a + %b: both operands of the mul are the same add
; result. The directives below expect a vadd.i8 followed by a vmul.i8, and no
; vmla.i8 after that, i.e. the multiply is not distributed over the add.
define <8 x i8> @no_distribute(<8 x i8> %a, <8 x i8> %b) nounwind {
entry:
; CHECK: no_distribute
; CHECK: vadd.i8
; CHECK: vmul.i8
; CHECK-NOT: vmla.i8
%0 = add <8 x i8> %a, %b
%1 = mul <8x i8> %0, %0
ret <8 x i8> %1
}
; If one operand has a zero-extend and the other a sign-extend, vmull
; cannot be used.
define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) {