From 7d86e47d04a76362c2621f6aa2f82b83eb068347 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <rsandifo@linux.vnet.ibm.com>
Date: Wed, 21 Aug 2013 09:34:56 +0000
Subject: [PATCH] [SystemZ] Define remaining *MUL_LOHI patterns

The initial port used MLG(R) for i64 UMUL_LOHI but left the other three
combinations as not-legal-or-custom.  Although 32x32->{32,32}
multiplications exist, they're not as quick as doing a normal 64-bit
multiplication, so it didn't seem like i32 SMUL_LOHI and UMUL_LOHI
would be useful.  There's also no direct instruction for i64 SMUL_LOHI,
so it needs to be implemented in terms of UMUL_LOHI.

However, not defining these patterns means that we don't convert
division by a constant into multiplication, so this patch fills
in the other cases.  The new i64 SMUL_LOHI sequence is simpler
than the one that we used previously for 64x64->128 multiplication,
so int-mul-08.ll now tests the full sequence.

llvm-svn: 188898
---
 .../Target/SystemZ/SystemZISelLowering.cpp    | 88 +++++++++++++++----
 llvm/lib/Target/SystemZ/SystemZISelLowering.h |  1 +
 llvm/test/CodeGen/SystemZ/int-div-06.ll       | 56 ++++++++++++
 llvm/test/CodeGen/SystemZ/int-mul-08.ll       | 10 ++-
 4 files changed, 136 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/CodeGen/SystemZ/int-div-06.ll

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 7772b9ed443e..a1eecd736ce9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -128,9 +128,11 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::ROTR,            VT, Expand);
 
-      // Use *MUL_LOHI where possible and a wider multiplication otherwise.
+      // Use *MUL_LOHI where possible instead of MULH*.
       setOperationAction(ISD::MULHS, VT, Expand);
       setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Custom);
+      setOperationAction(ISD::UMUL_LOHI, VT, Custom);
 
       // We have instructions for signed but not unsigned FP conversion.
       setOperationAction(ISD::FP_TO_UINT, VT, Expand);
@@ -165,14 +167,6 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   // Give LowerOperation the chance to replace 64-bit ORs with subregs.
   setOperationAction(ISD::OR, MVT::i64, Custom);
 
-  // The architecture has 32-bit SMUL_LOHI and UMUL_LOHI (MR and MLR),
-  // but they aren't really worth using.  There is no 64-bit SMUL_LOHI,
-  // but there is a 64-bit UMUL_LOHI: MLGR.
-  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
-
   // FIXME: Can we support these natively?
   setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
@@ -1142,6 +1136,20 @@ static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
                      DL, MVT::Glue, CmpOp0, CmpOp1);
 }
 
+// Implement a 32-bit *MUL_LOHI operation by extending both operands to
+// 64 bits.  Extend is the extension type to use.  Store the high part
+// in Hi and the low part in Lo.
+static void lowerMUL_LOHI32(SelectionDAG &DAG, SDLoc DL,
+                            unsigned Extend, SDValue Op0, SDValue Op1,
+                            SDValue &Hi, SDValue &Lo) {
+  Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
+  Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
+  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
+  Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, MVT::i64));
+  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
+  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
+}
+
 // Lower a binary operation that produces two VT results, one in each
 // half of a GR128 pair.  Op0 and Op1 are the VT operands to the operation,
 // Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
@@ -1427,18 +1435,64 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
+SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else {
+    // Do a full 128-bit multiplication based on UMUL_LOHI64:
+    //
+    //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
+    //
+    // but using the fact that the upper halves are either all zeros
+    // or all ones:
+    //
+    //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
+    //
+    // and grouping the right terms together since they are quicker than the
+    // multiplication:
+    //
+    //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
+    SDValue C63 = DAG.getConstant(63, MVT::i64);
+    SDValue LL = Op.getOperand(0);
+    SDValue RL = Op.getOperand(1);
+    SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
+    SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  SMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     LL, RL, Ops[1], Ops[0]);
+    SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
+    SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
+    SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
+    Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
+  }
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
 SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
                                               SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
-  assert(!is32Bit(VT) && "Only support 64-bit UMUL_LOHI");
-
-  // UMUL_LOHI64 returns the low result in the odd register and the high
-  // result in the even register.  UMUL_LOHI is defined to return the
-  // low half first, so the results are in reverse order.
   SDValue Ops[2];
-  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
-                   Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  UMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
@@ -1706,6 +1760,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerVACOPY(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return lowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::SMUL_LOHI:
+    return lowerSMUL_LOHI(Op, DAG);
   case ISD::UMUL_LOHI:
     return lowerUMUL_LOHI(Op, DAG);
   case ISD::SDIVREM:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 3692e1e053b9..604453d2ddea 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -214,6 +214,7 @@ private:
   SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/SystemZ/int-div-06.ll b/llvm/test/CodeGen/SystemZ/int-div-06.ll
new file mode 100644
index 000000000000..8576b1b6270a
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/int-div-06.ll
@@ -0,0 +1,56 @@
+; Test that divisions by constants are implemented as multiplications.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Check signed 32-bit division.
+define i32 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: lgfr [[REG:%r[0-5]]], %r2
+; CHECK: msgfi [[REG]], 502748801
+; CHECK-DAG: srlg [[RES1:%r[0-5]]], [[REG]], 63
+; CHECK-DAG: srag %r2, [[REG]], 46
+; CHECK: ar %r2, [[RES1]]
+; CHECK: br %r14
+  %b = sdiv i32 %a, 139968
+  ret i32 %b
+}
+
+; Check unsigned 32-bit division.
+define i32 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: llgfr [[REG:%r[0-5]]], %r2
+; CHECK: msgfi [[REG]], 502748801
+; CHECK: srlg %r2, [[REG]], 46
+; CHECK: br %r14
+  %b = udiv i32 %a, 139968
+  ret i32 %b
+}
+
+; Check signed 64-bit division.
+define i64 @f3(i64 %dummy, i64 %a) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: llihf [[CONST:%r[0-5]]], 1005497601
+; CHECK-DAG: oilf [[CONST]], 4251762321
+; CHECK-DAG: srag [[REG:%r[0-5]]], %r3, 63
+; CHECK-DAG: ngr [[REG]], [[CONST]]
+; CHECK-DAG: mlgr %r2, [[CONST]]
+; CHECK: sgr %r2, [[REG]]
+; CHECK: srlg [[RES1:%r[0-5]]], %r2, 63
+; CHECK: srag %r2, %r2, 15
+; CHECK: agr %r2, [[RES1]]
+; CHECK: br %r14
+  %b = sdiv i64 %a, 139968
+  ret i64 %b
+}
+
+; Check unsigned 64-bit division.
+define i64 @f4(i64 %dummy, i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK: llihf [[CONST:%r[0-5]]], 1005497601
+; CHECK: oilf [[CONST]], 4251762321
+; CHECK: mlgr %r2, [[CONST]]
+; CHECK: srlg %r2, %r2, 15
+; CHECK: br %r14
+  %b = udiv i64 %a, 139968
+  ret i64 %b
+}
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-08.ll b/llvm/test/CodeGen/SystemZ/int-mul-08.ll
index a245760e1809..90b26a4f3dde 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-08.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-08.ll
@@ -22,9 +22,13 @@ define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
 ; This needs a rather convoluted sequence.
 define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
 ; CHECK-LABEL: f2:
-; CHECK: mlgr
-; CHECK: agr
-; CHECK: agr
+; CHECK-DAG: srag [[RES1:%r[0-5]]], %r3, 63
+; CHECK-DAG: srag [[RES2:%r[0-5]]], %r4, 63
+; CHECK-DAG: ngr [[RES1]], %r4
+; CHECK-DAG: ngr [[RES2]], %r3
+; CHECK-DAG: agr [[RES2]], [[RES1]]
+; CHECK-DAG: mlgr %r2, %r4
+; CHECK: sgr %r2, [[RES2]]
 ; CHECK: br %r14
   %ax = sext i64 %a to i128
   %bx = sext i64 %b to i128