[NVPTX] Fix sign/zero-extending ldg/ldu instruction selection

Summary:
We don't have sign-/zero-extending ldg/ldu instructions defined,
so we need to emulate them with explicit CVTs. We were originally
handling the i8 case, but not any other cases.

Fixes PR26185

Reviewers: jingyue, jlebar

Subscribers: jholewinski

Differential Revision: http://reviews.llvm.org/D19615

llvm-svn: 268272
This commit is contained in:
Justin Holewinski 2016-05-02 18:12:02 +00:00
parent b2bd28128d
commit 9a6ea2c256
4 changed files with 109 additions and 46 deletions

View File

@ -2062,61 +2062,33 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
//
// i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
//
// Since we load an i8 value, the matching logic above will have selected an
// LDG instruction that reads i8 and stores it in an i16 register (NVPTX does
// not expose 8-bit registers):
//
// i16,ch = INT_PTX_LDG_GLOBAL_i8areg64 t7, t0
//
// To get the correct type in this case, truncate back to i8 and then extend
// to the original load type.
// In this case, the matching logic above will select a load for the original
// memory type (in this case, i8) and our types will not match (the node needs
// to return an i32 in this case). Our LDG/LDU nodes do not support the
// concept of sign-/zero-extension, so emulate it here by adding an explicit
// CVT instruction. Ptxas should clean up any redundancies here.
EVT OrigType = N->getValueType(0);
LoadSDNode *LDSD = dyn_cast<LoadSDNode>(N);
if (LDSD && EltVT == MVT::i8 && OrigType.getScalarSizeInBits() >= 32) {
unsigned CvtOpc = 0;
LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
switch (LDSD->getExtensionType()) {
default:
llvm_unreachable("An extension is required for i8 loads");
break;
case ISD::SEXTLOAD:
switch (OrigType.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unhandled integer load type");
break;
case MVT::i32:
CvtOpc = NVPTX::CVT_s32_s8;
break;
case MVT::i64:
CvtOpc = NVPTX::CVT_s64_s8;
break;
}
break;
case ISD::EXTLOAD:
case ISD::ZEXTLOAD:
switch (OrigType.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unhandled integer load type");
break;
case MVT::i32:
CvtOpc = NVPTX::CVT_u32_u8;
break;
case MVT::i64:
CvtOpc = NVPTX::CVT_u64_u8;
break;
}
break;
}
if (OrigType != EltVT && LdNode) {
// We have an extending-load. The instruction we selected operates on the
// smaller type, but the SDNode we are replacing has the larger type. We
// need to emit a CVT to make the types match.
bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
EltVT.getSimpleVT(), IsSigned);
// For each output value, truncate to i8 (since the upper 8 bits are
// undefined) and then extend to the desired type.
// For each output value, apply the manual sign/zero-extension and make sure
// all users of the load go through that CVT.
for (unsigned i = 0; i != NumElts; ++i) {
SDValue Res(LD, i);
SDValue OrigVal(N, i);
SDNode *CvtNode =
CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
DL, MVT::i32));
ReplaceUses(OrigVal, SDValue(CvtNode, 0));
}
}
@ -5199,3 +5171,57 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
}
return true;
}
/// GetConvertOpcode - Look up the CVT_ machine opcode that converts an integer
/// value of type \p SrcTy into type \p DestTy. When \p IsSigned is true the
/// signed (CVT_s*_s*) opcode is returned, otherwise the unsigned (CVT_u*_u*)
/// one. Only i8, i16, i32 and i64 are handled, and \p SrcTy must differ from
/// \p DestTy; any other combination reaches llvm_unreachable.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             bool IsSigned) {
  const auto Dest = DestTy.SimpleTy;

  // Chooses between the signed and the unsigned flavour of one conversion.
  auto Pick = [IsSigned](unsigned SignedOpc, unsigned UnsignedOpc) {
    return IsSigned ? SignedOpc : UnsignedOpc;
  };

  switch (SrcTy.SimpleTy) {
  case MVT::i8:
    if (Dest == MVT::i16)
      return Pick(NVPTX::CVT_s16_s8, NVPTX::CVT_u16_u8);
    if (Dest == MVT::i32)
      return Pick(NVPTX::CVT_s32_s8, NVPTX::CVT_u32_u8);
    if (Dest == MVT::i64)
      return Pick(NVPTX::CVT_s64_s8, NVPTX::CVT_u64_u8);
    llvm_unreachable("Unhandled dest type");

  case MVT::i16:
    if (Dest == MVT::i8)
      return Pick(NVPTX::CVT_s8_s16, NVPTX::CVT_u8_u16);
    if (Dest == MVT::i32)
      return Pick(NVPTX::CVT_s32_s16, NVPTX::CVT_u32_u16);
    if (Dest == MVT::i64)
      return Pick(NVPTX::CVT_s64_s16, NVPTX::CVT_u64_u16);
    llvm_unreachable("Unhandled dest type");

  case MVT::i32:
    if (Dest == MVT::i8)
      return Pick(NVPTX::CVT_s8_s32, NVPTX::CVT_u8_u32);
    if (Dest == MVT::i16)
      return Pick(NVPTX::CVT_s16_s32, NVPTX::CVT_u16_u32);
    if (Dest == MVT::i64)
      return Pick(NVPTX::CVT_s64_s32, NVPTX::CVT_u64_u32);
    llvm_unreachable("Unhandled dest type");

  case MVT::i64:
    if (Dest == MVT::i8)
      return Pick(NVPTX::CVT_s8_s64, NVPTX::CVT_u8_u64);
    if (Dest == MVT::i16)
      return Pick(NVPTX::CVT_s16_s64, NVPTX::CVT_u16_u64);
    if (Dest == MVT::i32)
      return Pick(NVPTX::CVT_s32_s64, NVPTX::CVT_u32_u64);
    llvm_unreachable("Unhandled dest type");

  default:
    llvm_unreachable("Unhandled source type");
  }
}

View File

@ -93,6 +93,7 @@ private:
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
// Returns the CVT_ instruction opcode that converts a value of type SrcTy to
// type DestTy; IsSigned selects the signed variant instead of the unsigned one.
static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
};
} // end namespace llvm

View File

@ -377,6 +377,8 @@ let hasSideEffects = 0 in {
}
// Generate cvts from all types to all types.
defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;

View File

@ -0,0 +1,34 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
; Verify that we correctly emit code for extending ldg/ldu. We do not expose
; extending variants in the backend, but the ldg/ldu selection code may pick
; extending loads as candidates. We do want to support this, so make sure we
; emit the necessary cvt.* instructions to implement the extension and let ptxas
; emit the real extending loads.
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; CHECK-LABEL: spam
; Kernel under test: loads an i16 from %arg (viewed as an i16 array, indexed by
; %arg3 + 8), sign-extends it to i64, squares it, and adds the result into the
; i64 stored at %arg1.
; NOTE(review): presumably the noalias/readonly attributes on %arg are what make
; the i16 load a candidate for ld.global.nc -- confirm against the selection code.
define ptx_kernel void @spam(i8 addrspace(1)* noalias nocapture readonly %arg, i8 addrspace(1)* noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
bb:
; Reinterpret the two byte pointers as an i16 source and an i64 accumulator.
%tmp = bitcast i8 addrspace(1)* %arg to i16 addrspace(1)*
%tmp4 = bitcast i8 addrspace(1)* %arg1 to i64 addrspace(1)*
; Address of the i16 element at index %arg3 + 8.
%tmp5 = add nsw i64 %arg3, 8
%tmp6 = getelementptr i16, i16 addrspace(1)* %tmp, i64 %tmp5
; CHECK: ld.global.nc.u16
%tmp7 = load i16, i16 addrspace(1)* %tmp6, align 2
; The sign-extension of the loaded i16 must show up as an explicit cvt.
; CHECK: cvt.s32.s16
%tmp8 = sext i16 %tmp7 to i64
; Square the extended value and accumulate it into the i64 at %tmp4.
%tmp9 = mul nsw i64 %tmp8, %tmp8
%tmp10 = load i64, i64 addrspace(1)* %tmp4, align 8
%tmp11 = add nsw i64 %tmp9, %tmp10
store i64 %tmp11, i64 addrspace(1)* %tmp4, align 8
ret void
}
attributes #0 = { norecurse nounwind "polly.skip.fn" }
!nvvm.annotations = !{!0}
!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i64, i64)* @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}