diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 65c5552de2af..ebe1a8262665 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -73,6 +73,8 @@ def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
                                       [Feature64Bit]>;
 def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                         "Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+                                       "SHLD instruction is slow">;
 def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                         "IsUAMemFast", "true",
                                         "Fast unaligned memory access">;
@@ -268,46 +270,53 @@ def : ProcessorModel<"knl", HaswellModel,
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
 def : Proc<"k6-3",            [Feature3DNow]>;
-def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-4",        [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-xp",       [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-mp",       [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-4",        [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-xp",       [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-mp",       [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
 def : Proc<"k8",              [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"opteron",         [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon64",        [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon-fx",       [FeatureSSE2, Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"k8-sse3",         [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"opteron-sse3",    [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon64-sse3",   [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"amdfam10",        [FeatureSSE4A, Feature3DNowA, FeatureCMPXCHG16B,
                                FeatureLZCNT,
-                               FeaturePOPCNT, FeatureSlowBTMem]>;
+                               FeaturePOPCNT, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
 // Bobcat
 def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
-                               FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
+                               FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
+                               FeatureSlowSHLD]>;
 // Jaguar
 def : Proc<"btver2",          [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
                                FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                                FeatureBMI, FeatureF16C, FeatureMOVBE,
-                               FeatureLZCNT, FeaturePOPCNT]>;
+                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
-                               FeatureLZCNT, FeaturePOPCNT]>;
+                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
 // Piledriver
 def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
                                FeatureF16C, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureBMI, FeatureTBM,
-                               FeatureFMA]>;
+                               FeaturePOPCNT, FeatureBMI, FeatureTBM,
+                               FeatureFMA, FeatureSlowSHLD]>;
 // Steamroller
 def : Proc<"bdver3",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ec5ae33ef584..23910a350e86 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17892,6 +17892,18 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  // SHLD/SHRD instructions have lower register pressure, but on some
+  // platforms they have higher latency than the equivalent
+  // series of shifts/ors that would otherwise be generated.
+  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+  // have higher latencies and we are not optimizing for size.
+  if (!OptForSize && Subtarget->isSHLDSlow())
+    return SDValue();
+
   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
     std::swap(N0, N1);
   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 05db662b2c4a..fa04c38a852b 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -263,6 +263,15 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
       ToggleFeature(X86::FeatureSlowBTMem);
     }
 
+    // Determine if SHLD/SHRD instructions have higher latency than the
+    // equivalent series of shifts/or instructions.
+    // FIXME: Add Intel's processors that have SHLD instructions with very
+    // poor latency.
+    if (IsAMD) {
+      IsSHLDSlow = true;
+      ToggleFeature(X86::FeatureSlowSHLD);
+    }
+
     // If it's an Intel chip since Nehalem and not an Atom chip, unaligned
     // memory access is fast. We hard code model numbers here because they
     // aren't strictly increasing for Intel chips it seems.
@@ -514,6 +523,7 @@ void X86Subtarget::initializeEnvironment() {
   HasPRFCHW = false;
   HasRDSEED = false;
   IsBTMemSlow = false;
+  IsSHLDSlow = false;
   IsUAMemFast = false;
   HasVectorUAMem = false;
   HasCmpxchg16b = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index dd8c0811ce51..cddd9d898ebb 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -142,6 +142,9 @@ protected:
   /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
 
+  /// IsSHLDSlow - True if SHLD instructions are slow.
+  bool IsSHLDSlow;
+
   /// IsUAMemFast - True if unaligned memory access is fast.
   bool IsUAMemFast;
 
@@ -292,6 +295,7 @@ public:
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
diff --git a/llvm/test/CodeGen/X86/x86-64-double-precision-shift-left.ll b/llvm/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
new file mode 100644
index 000000000000..f2380f23b8ee
--- /dev/null
+++ b/llvm/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for the architectures that are known to have poor latency
+; double precision shift instructions we generate an alternative sequence
+; of instructions with lower latencies instead of the shld instruction.
+
+;uint64_t lshift1(uint64_t a, uint64_t b)
+;{
+;    return (a << 1) | (b >> 63);
+;}
+
+; CHECK: lshift1:
+; CHECK: addq {{.*}},{{.*}}
+; CHECK-NEXT: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 1
+  %shr = lshr i64 %b, 63
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t lshift2(uint64_t a, uint64_t b)
+;{
+;    return (a << 2) | (b >> 62);
+;}
+
+; CHECK: lshift2:
+; CHECK: shlq $2, {{.*}}
+; CHECK-NEXT: shrq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 2
+  %shr = lshr i64 %b, 62
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t lshift7(uint64_t a, uint64_t b)
+;{
+;    return (a << 7) | (b >> 57);
+;}
+
+; CHECK: lshift7:
+; CHECK: shlq $7, {{.*}}
+; CHECK-NEXT: shrq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 7
+  %shr = lshr i64 %b, 57
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t lshift63(uint64_t a, uint64_t b)
+;{
+;    return (a << 63) | (b >> 1);
+;}
+
+; CHECK: lshift63:
+; CHECK: shlq $63, {{.*}}
+; CHECK-NEXT: shrq {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 63
+  %shr = lshr i64 %b, 1
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
diff --git a/llvm/test/CodeGen/X86/x86-64-double-precision-shift-right.ll b/llvm/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
new file mode 100644
index 000000000000..5edaad89df4c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for the architectures that are known to have poor latency
+; double precision shift instructions we generate an alternative sequence
+; of instructions with lower latencies instead of the shrd instruction.
+
+;uint64_t rshift1(uint64_t a, uint64_t b)
+;{
+;    return (a >> 1) | (b << 63);
+;}
+
+; CHECK: rshift1:
+; CHECK: shrq {{.*}}
+; CHECK-NEXT: shlq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 1
+  %2 = shl i64 %b, 63
+  %3 = or i64 %2, %1
+  ret i64 %3
+}
+
+;uint64_t rshift2(uint64_t a, uint64_t b)
+;{
+;    return (a >> 2) | (b << 62);
+;}
+
+; CHECK: rshift2:
+; CHECK: shrq $2, {{.*}}
+; CHECK-NEXT: shlq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 2
+  %2 = shl i64 %b, 62
+  %3 = or i64 %2, %1
+  ret i64 %3
+}
+
+;uint64_t rshift7(uint64_t a, uint64_t b)
+;{
+;    return (a >> 7) | (b << 57);
+;}
+
+; CHECK: rshift7:
+; CHECK: shrq $7, {{.*}}
+; CHECK-NEXT: shlq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 7
+  %2 = shl i64 %b, 57
+  %3 = or i64 %2, %1
+  ret i64 %3
+}
+
+;uint64_t rshift63(uint64_t a, uint64_t b)
+;{
+;    return (a >> 63) | (b << 1);
+;}
+
+; CHECK: rshift63:
+; CHECK: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+; CHECK-NEXT: orq {{.*}}, {{.*}}
+
+define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 63
+  %2 = shl i64 %b, 1
+  %3 = or i64 %2, %1
+  ret i64 %3
+}
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
new file mode 100644
index 000000000000..5d7a10b5901e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+
+; clang -Oz -c test1.cpp -emit-llvm -S -o
+; Verify that we generate a shld instruction when we are optimizing for size,
+; even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift10(uint64_t a, uint64_t b)
+; {
+;     return (a << 10) | (b >> 54);
+; }
+
+; Function Attrs: minsize nounwind optsize readnone uwtable
+define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+entry:
+; CHECK: shldq $10
+  %shl = shl i64 %a, 10
+  %shr = lshr i64 %b, 54
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #0 = { minsize nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+; clang -Os -c test2.cpp -emit-llvm -S
+; Verify that we generate a shld instruction when we are optimizing for size,
+; even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift11(uint64_t a, uint64_t b)
+; {
+;     return (a << 11) | (b >> 53);
+; }
+
+; Function Attrs: nounwind optsize readnone uwtable
+define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+entry:
+; CHECK: shldq $11
+  %shl = shl i64 %a, 11
+  %shr = lshr i64 %b, 53
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; clang -O2 -c test2.cpp -emit-llvm -S
+; Verify that we do not generate a shld instruction when we are not optimizing
+; for size for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift12(uint64_t a, uint64_t b)
+; {
+;     return (a << 12) | (b >> 52);
+; }
+
+; Function Attrs: nounwind readnone uwtable
+define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+entry:
+; CHECK: shlq $12
+; CHECK-NEXT: shrq $52
+  %shl = shl i64 %a, 12
+  %shr = lshr i64 %b, 52
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll
new file mode 100644
index 000000000000..5bab434ae6a1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=x86-64 -mcpu=athlon | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-tbird | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-xp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-mp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-fx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=amdfam10 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
+
+; Verify that for the X86_64 processors that are known to have poor latency
+; double precision shift instructions we do not generate 'shld' or 'shrd'
+; instructions.
+
+;uint64_t lshift(uint64_t a, uint64_t b, int c)
+;{
+;    return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shld
+  %sh_prom = zext i32 %c to i64
+  %shl = shl i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %shr = lshr i64 %b, %sh_prom1
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t rshift(uint64_t a, uint64_t b, int c)
+;{
+;    return (a >> c) | (b << (64-c));
+;}
+
+define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shrd
+  %sh_prom = zext i32 %c to i64
+  %shr = lshr i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %shl = shl i64 %b, %sh_prom1
+  %or = or i64 %shl, %shr
+  ret i64 %or
+}
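
For context only, and not part of the patch above: the pattern the new tests exercise can be reproduced from C. Below is a minimal sketch under a few assumptions — the file name double_shift.c is hypothetical, the shift count is assumed to satisfy 0 < c < 64 so the C shifts are well defined, and the expected codegen is described in hedged terms based on the combine added to PerformOrCombine.

/* double_shift.c -- illustrative only; mirrors the lshift/rshift cases in
   the tests above.  Assumes 0 < c < 64. */
#include <stdint.h>

/* Double-precision left shift.  On a slow-shld target such as bdver1, the
   guard added around the (or (shl x, c), (srl y, 64-c)) fold should keep
   this as separate shift and or/lea instructions at -O2, and only fold it
   to shldq when the function is marked optsize/minsize. */
uint64_t lshift(uint64_t a, uint64_t b, int c) {
  return (a << c) | (b >> (64 - c));
}

/* Double-precision right shift; the shrd case is analogous. */
uint64_t rshift(uint64_t a, uint64_t b, int c) {
  return (a >> c) | (b << (64 - c));
}

Compiling this with clang -O2 -S -emit-llvm double_shift.c and feeding the IR to llc -march=x86-64 -mcpu=bdver1, as in the RUN lines above, should show the expanded shift/or sequence; passing -mattr=+slow-shld (the feature string registered by FeatureSlowSHLD) should force the same behavior on any CPU, while optsize/minsize functions should still get shldq/shrdq, matching the Oz/Os test.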