diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 2566194ca9c6..b931f606ee55 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -400,6 +400,76 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
                        MachinePointerInfo::getConstantPool(MF), Align);
   }
 
+  // A special case is a situation where the vector is built entirely from
+  // elements extracted from another vector. This could be done via a shuffle
+  // more efficiently, but typically, the size of the source vector will not
+  // match the size of the vector being built (which precludes the use of a
+  // shuffle directly).
+  // This only handles a single source vector, and the vector being built
+  // should be of a sub-vector type of the source vector type.
+  auto IsBuildFromExtracts = [this,&Values] (SDValue &SrcVec,
+                                             SmallVectorImpl<int> &SrcIdx) {
+    SDValue Vec;
+    for (SDValue V : Values) {
+      if (isUndef(V)) {
+        SrcIdx.push_back(-1);
+        continue;
+      }
+      if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+        return false;
+      // All extracts should come from the same vector.
+      SDValue T = V.getOperand(0);
+      if (Vec.getNode() != nullptr && T.getNode() != Vec.getNode())
+        return false;
+      Vec = T;
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+      if (C == nullptr)
+        return false;
+      int I = C->getSExtValue();
+      assert(I >= 0 && "Negative element index");
+      SrcIdx.push_back(I);
+    }
+    SrcVec = Vec;
+    return true;
+  };
+
+  SmallVector<int,128> ExtIdx;
+  SDValue ExtVec;
+  if (IsBuildFromExtracts(ExtVec, ExtIdx)) {
+    MVT ExtTy = ty(ExtVec);
+    unsigned ExtLen = ExtTy.getVectorNumElements();
+    if (ExtLen == VecLen || ExtLen == 2*VecLen) {
+      // Construct a new shuffle mask that will produce a vector with the same
+      // number of elements as the input vector, and such that the vector we
+      // want will be the initial subvector of it.
+      SmallVector<int,128> Mask;
+      BitVector Used(ExtLen);
+
+      for (int M : ExtIdx) {
+        Mask.push_back(M);
+        if (M >= 0)
+          Used.set(M);
+      }
+      // Fill the rest of the mask with the unused elements of ExtVec in hopes
+      // that it will result in a permutation of ExtVec's elements. It's still
+      // fine if it doesn't (e.g. if undefs are present, or elements are
+      // repeated), but permutations can always be done efficiently via vdelta
+      // and vrdelta.
+      for (unsigned I = 0; I != ExtLen; ++I) {
+        if (Mask.size() == ExtLen)
+          break;
+        if (!Used.test(I))
+          Mask.push_back(I);
+      }
+
+      SDValue S = DAG.getVectorShuffle(ExtTy, dl, ExtVec,
+                                       DAG.getUNDEF(ExtTy), Mask);
+      if (ExtLen == VecLen)
+        return S;
+      return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, S);
+    }
+  }
+
   // Construct two halves in parallel, then or them together.
   assert(4*Words.size() == Subtarget.getVectorLength());
   SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-shuffle-gather.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-shuffle-gather.ll
new file mode 100644
index 000000000000..80b1e7b36cbb
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-shuffle-gather.ll
@@ -0,0 +1,121 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Several shufflevector instructions have masks that are shorter than the
+; source vectors. They "gather" a subset of the input elements into a single
+; vector. Make sure that they are not expanded into a sequence of extract/
+; insert operations.
+;
+; The C source:
+;
+; void fred(int *a, int *b, int n) {
+;   for (int i = 0; i != n; i += 2) {
+;     a[i] += b[i+1];
+;     a[i+1] += b[i];
+;   }
+; }
+;
+; Command line:
+; clang -target hexagon -mcpu=hexagonv60 -fvectorize -fno-unroll-loops -O2 \
+;   -mhvx -mhvx-length=128b -S inp.c
+;
+; CHECK-NOT: vinsert
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: norecurse nounwind
+define void @f0(i32* nocapture %a0, i32* nocapture readonly %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp eq i32 %a2, 0
+  br i1 %v0, label %b7, label %b1
+
+b1:                                               ; preds = %b0
+  %v1 = add i32 %a2, -2
+  %v2 = lshr i32 %v1, 1
+  %v3 = add nuw i32 %v2, 1
+  %v4 = icmp ult i32 %v3, 32
+  br i1 %v4, label %b2, label %b3
+
+b2:                                               ; preds = %b6, %b3, %b1
+  %v5 = phi i32 [ 0, %b3 ], [ 0, %b1 ], [ %v13, %b6 ]
+  br label %b8
+
+b3:                                               ; preds = %b1
+  %v6 = and i32 %a2, -2
+  %v7 = getelementptr i32, i32* %a0, i32 %v6
+  %v8 = getelementptr i32, i32* %a1, i32 %v6
+  %v9 = icmp ugt i32* %v8, %a0
+  %v10 = icmp ugt i32* %v7, %a1
+  %v11 = and i1 %v9, %v10
+  br i1 %v11, label %b2, label %b4
+
+b4:                                               ; preds = %b3
+  %v12 = and i32 %v3, -32
+  %v13 = shl i32 %v12, 1
+  br label %b5
+
+b5:                                               ; preds = %b5, %b4
+  %v14 = phi i32 [ 0, %b4 ], [ %v34, %b5 ]
+  %v15 = shl i32 %v14, 1
+  %v16 = or i32 %v15, 1
+  %v17 = getelementptr inbounds i32, i32* %a1, i32 -1
+  %v18 = getelementptr inbounds i32, i32* %v17, i32 %v16
+  %v19 = bitcast i32* %v18 to <64 x i32>*
+  %v20 = load <64 x i32>, <64 x i32>* %v19, align 4, !tbaa !1
+  %v21 = shufflevector <64 x i32> %v20, <64 x i32> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+  %v22 = shufflevector <64 x i32> %v20, <64 x i32> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+  %v23 = getelementptr inbounds i32, i32* %a0, i32 %v15
+  %v24 = bitcast i32* %v23 to <64 x i32>*
+  %v25 = load <64 x i32>, <64 x i32>* %v24, align 4, !tbaa !1
+  %v26 = shufflevector <64 x i32> %v25, <64 x i32> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+  %v27 = shufflevector <64 x i32> %v25, <64 x i32> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+  %v28 = add nsw <32 x i32> %v26, %v22
+  %v29 = getelementptr inbounds i32, i32* %a0, i32 -1
+  %v30 = add nsw <32 x i32> %v27, %v21
+  %v31 = getelementptr inbounds i32, i32* %v29, i32 %v16
+  %v32 = bitcast i32* %v31 to <64 x i32>*
+  %v33 = shufflevector <32 x i32> %v28, <32 x i32> %v30, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  store <64 x i32> %v33, <64 x i32>* %v32, align 4, !tbaa !1
+  %v34 = add i32 %v14, 32
+  %v35 = icmp eq i32 %v34, %v12
+  br i1 %v35, label %b6, label %b5, !llvm.loop !5
+
+b6:                                               ; preds = %b5
+  %v36 = icmp eq i32 %v3, %v12
+  br i1 %v36, label %b7, label %b2
+
+b7:                                               ; preds = %b8, %b6, %b0
+  ret void
+
+b8:                                               ; preds = %b8, %b2
+  %v37 = phi i32 [ %v49, %b8 ], [ %v5, %b2 ]
+  %v38 = or i32 %v37, 1
+  %v39 = getelementptr inbounds i32, i32* %a1, i32 %v38
+  %v40 = load i32, i32* %v39, align 4, !tbaa !1
+  %v41 = getelementptr inbounds i32, i32* %a0, i32 %v37
+  %v42 = load i32, i32* %v41, align 4, !tbaa !1
+  %v43 = add nsw i32 %v42, %v40
+  store i32 %v43, i32* %v41, align 4, !tbaa !1
+  %v44 = getelementptr inbounds i32, i32* %a1, i32 %v37
+  %v45 = load i32, i32* %v44, align 4, !tbaa !1
+  %v46 = getelementptr inbounds i32, i32* %a0, i32 %v38
+  %v47 = load i32, i32* %v46, align 4, !tbaa !1
+  %v48 = add nsw i32 %v47, %v45
+  store i32 %v48, i32* %v46, align 4, !tbaa !1
+  %v49 = add nuw nsw i32 %v37, 2
+  %v50 = icmp eq i32 %v49, %a2
+  br i1 %v50, label %b7, label %b8, !llvm.loop !7
+}
+
+attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length128b,+hvxv60" }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.isvectorized", i32 1}
+!7 = distinct !{!7, !6}