Switch lowering: extract jump tables and bit tests before building binary tree (PR22262)

This is a major rewrite of the SelectionDAG switch lowering. The previous code
would lower switches as a binary tree, discovering clusters of cases
suitable for lowering by jump tables or bit tests as it went along. To increase
the likelihood of finding jump tables, the binary tree pivot was selected to
maximize case density on both sides of the pivot.

Because the pivot was not always selected in the middle, the binary trees
could become unbalanced, leading to performance problems in the generated code.

This patch rewrites the lowering to first search for clusters of cases
suitable for jump tables or bit tests, and then build the binary tree around
those clusters. This way, the binary tree will always be balanced.
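
To make the new shape of the lowering concrete, here is a small, self-contained
C++ sketch (toy code under simplified assumptions, not the patch's actual
SelectionDAGBuilder implementation): once jump table and bit test clusters have
been carved out, the remaining clusters are split recursively around the middle
element, so both subtrees always receive (nearly) the same number of clusters.

  #include <cstdio>
  #include <vector>

  // Stand-in for a case cluster; the real patch uses CaseCluster with
  // CC_Range / CC_JumpTable / CC_BitTests kinds (see the header diff below).
  struct Cluster { long Low, High; };

  // Emit a comparison tree over Clusters[First..Last], always pivoting on the
  // middle cluster so the tree stays balanced regardless of case density.
  static void emitTree(const std::vector<Cluster> &Clusters,
                       int First, int Last, int Depth) {
    if (First > Last)
      return;
    int Mid = First + (Last - First) / 2;
    std::printf("%*scompare against [%ld..%ld]\n", Depth * 2, "",
                Clusters[Mid].Low, Clusters[Mid].High);
    emitTree(Clusters, First, Mid - 1, Depth + 1); // left subtree
    emitTree(Clusters, Mid + 1, Last, Depth + 1);  // right subtree
  }

  int main() {
    // Four clusters, e.g. after ranges/jump tables/bit tests have been formed.
    std::vector<Cluster> Clusters = {{0, 3}, {100, 103}, {200, 200}, {300, 301}};
    emitTree(Clusters, 0, (int)Clusters.size() - 1, 0);
  }

Running this prints a balanced comparison tree for the four clusters; with the
old density-driven pivot selection, a skewed distribution of case values could
instead have produced a lopsided tree.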

This has the added benefit of decoupling the different aspects of the lowering:
tree building and the discovery of jump tables or bit tests are now easier to
tweak separately.

For example, this will enable us to balance the tree based on profile info
in the future.

The algorithm for finding jump tables is O(n^2), whereas the previous algorithm
was O(n log n) for common cases, and quadratic only in the worst case. This
doesn't seem to be a major problem in practice: e.g. compiling a file consisting
of a 10k-case switch was only 30% slower, and such large switches should be rare
in practice. Compiling e.g. gcc.c showed no compile-time difference. If this
does turn out to be a problem, we could limit the search space of the algorithm.
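
As a point of reference, a minimal sketch of the density check behind the jump
table search, assuming the 40% MinJumpTableDensity threshold that appears in
the header changes below (the real isDense declaration also takes a precomputed
TotalCases prefix-sum array so the case count does not have to be re-summed for
every candidate range):

  #include <cstdint>
  #include <vector>

  // Toy cluster: an inclusive range of case values with a common destination.
  struct Cluster { int64_t Low, High; };

  // Clusters[First..Last] is dense enough for a jump table when the number of
  // covered case values is at least 40% of the total span of the range.
  static bool isDenseEnough(const std::vector<Cluster> &Clusters,
                            unsigned First, unsigned Last) {
    uint64_t NumCases = 0;
    for (unsigned I = First; I <= Last; ++I)
      NumCases += uint64_t(Clusters[I].High - Clusters[I].Low + 1);
    uint64_t Range = uint64_t(Clusters[Last].High - Clusters[First].Low) + 1;
    return NumCases * 100 >= Range * 40; // MinJumpTableDensity = 40
  }

The O(n^2) behaviour mentioned above comes from evaluating this kind of
predicate over the candidate subranges of the sorted cluster list.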

This commit also disables all optimizations during switch lowering in -O0.

Differential Revision: http://reviews.llvm.org/D8649

llvm-svn: 235101
Hans Wennborg 2015-04-16 14:49:23 +00:00
parent 8997d8d115
commit d403664ed8
12 changed files with 1266 additions and 907 deletions

(File diff suppressed because it is too large.)


@@ -134,26 +134,65 @@ private:
/// SDNodes we create.
unsigned SDNodeOrder;
/// Case - A struct to record the Value for a switch case, and the
/// case's target basic block.
struct Case {
const ConstantInt *Low;
const ConstantInt *High;
MachineBasicBlock* BB;
uint32_t ExtraWeight;
enum CaseClusterKind {
/// A cluster of adjacent case labels with the same destination, or just one
/// case.
CC_Range,
/// A cluster of cases suitable for jump table lowering.
CC_JumpTable,
/// A cluster of cases suitable for bit test lowering.
CC_BitTests
};
Case() : Low(nullptr), High(nullptr), BB(nullptr), ExtraWeight(0) { }
Case(const ConstantInt *low, const ConstantInt *high, MachineBasicBlock *bb,
uint32_t extraweight) : Low(low), High(high), BB(bb),
ExtraWeight(extraweight) { }
/// A cluster of case labels.
struct CaseCluster {
CaseClusterKind Kind;
const ConstantInt *Low, *High;
union {
MachineBasicBlock *MBB;
unsigned JTCasesIndex;
unsigned BTCasesIndex;
};
uint64_t Weight;
APInt size() const {
const APInt &rHigh = High->getValue();
const APInt &rLow = Low->getValue();
return (rHigh - rLow + 1ULL);
static CaseCluster range(const ConstantInt *Low, const ConstantInt *High,
MachineBasicBlock *MBB, uint32_t Weight) {
CaseCluster C;
C.Kind = CC_Range;
C.Low = Low;
C.High = High;
C.MBB = MBB;
C.Weight = Weight;
return C;
}
static CaseCluster jumpTable(const ConstantInt *Low,
const ConstantInt *High, unsigned JTCasesIndex,
uint32_t Weight) {
CaseCluster C;
C.Kind = CC_JumpTable;
C.Low = Low;
C.High = High;
C.JTCasesIndex = JTCasesIndex;
C.Weight = Weight;
return C;
}
static CaseCluster bitTests(const ConstantInt *Low, const ConstantInt *High,
unsigned BTCasesIndex, uint32_t Weight) {
CaseCluster C;
C.Kind = CC_BitTests;
C.Low = Low;
C.High = High;
C.BTCasesIndex = BTCasesIndex;
C.Weight = Weight;
return C;
}
};
typedef std::vector<CaseCluster> CaseClusterVector;
typedef CaseClusterVector::iterator CaseClusterIt;
struct CaseBits {
uint64_t Mask;
MachineBasicBlock* BB;
@@ -163,42 +202,14 @@ private:
CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits,
uint32_t Weight):
Mask(mask), BB(bb), Bits(bits), ExtraWeight(Weight) { }
CaseBits() : Mask(0), BB(nullptr), Bits(0), ExtraWeight(0) {}
};
typedef std::vector<Case> CaseVector;
typedef std::vector<CaseBits> CaseBitsVector;
typedef CaseVector::iterator CaseItr;
typedef std::pair<CaseItr, CaseItr> CaseRange;
typedef std::vector<CaseBits> CaseBitsVector;
/// CaseRec - A struct with ctor used in lowering switches to a binary tree
/// of conditional branches.
struct CaseRec {
CaseRec(MachineBasicBlock *bb, const ConstantInt *lt, const ConstantInt *ge,
CaseRange r) :
CaseBB(bb), LT(lt), GE(ge), Range(r) {}
/// CaseBB - The MBB in which to emit the compare and branch
MachineBasicBlock *CaseBB;
/// LT, GE - If nonzero, we know the current case value must be less-than or
/// greater-than-or-equal-to these Constants.
const ConstantInt *LT;
const ConstantInt *GE;
/// Range - A pair of iterators representing the range of case values to be
/// processed at this point in the binary search tree.
CaseRange Range;
};
typedef std::vector<CaseRec> CaseRecVector;
struct CaseBitsCmp {
bool operator()(const CaseBits &C1, const CaseBits &C2) {
return C1.Bits > C2.Bits;
}
};
/// Populate Cases with the cases in SI, clustering adjacent cases with the
/// same destination together.
void Clusterify(CaseVector &Cases, const SwitchInst *SI);
/// Sort Clusters and merge adjacent cases.
void sortAndRangeify(CaseClusterVector &Clusters);
/// CaseBlock - This structure is used to communicate between
/// SelectionDAGBuilder and SDISel for the code generation of additional basic
@@ -288,6 +299,58 @@ private:
BitTestInfo Cases;
};
/// Minimum jump table density, in percent.
enum { MinJumpTableDensity = 40 };
/// Check whether a range of clusters is dense enough for a jump table.
bool isDense(const CaseClusterVector &Clusters, unsigned *TotalCases,
unsigned First, unsigned Last);
/// Build a jump table cluster from Clusters[First..Last]. Returns false if it
/// decides it's not a good idea.
bool buildJumpTable(CaseClusterVector &Clusters, unsigned First,
unsigned Last, const SwitchInst *SI,
MachineBasicBlock *DefaultMBB, CaseCluster &JTCluster);
/// Find clusters of cases suitable for jump table lowering.
void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI,
MachineBasicBlock *DefaultMBB);
/// Check whether the range [Low,High] fits in a machine word.
bool rangeFitsInWord(const APInt &Low, const APInt &High);
/// Check whether these clusters are suitable for lowering with bit tests based
/// on the number of destinations, comparison metric, and range.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
const APInt &Low, const APInt &High);
/// Build a bit test cluster from Clusters[First..Last]. Returns false if it
/// decides it's not a good idea.
bool buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last,
const SwitchInst *SI, CaseCluster &BTCluster);
/// Find clusters of cases suitable for bit test lowering.
void findBitTestClusters(CaseClusterVector &Clusters, const SwitchInst *SI);
struct SwitchWorkListItem {
MachineBasicBlock *MBB;
CaseClusterIt FirstCluster;
CaseClusterIt LastCluster;
const ConstantInt *GE;
const ConstantInt *LT;
};
typedef SmallVector<SwitchWorkListItem, 4> SwitchWorkList;
/// Emit comparison and split W into two subtrees.
void splitWorkItem(SwitchWorkList &WorkList, const SwitchWorkListItem &W,
Value *Cond, MachineBasicBlock *SwitchMBB);
/// Lower W.
void lowerWorkItem(SwitchWorkListItem W, Value *Cond,
MachineBasicBlock *SwitchMBB,
MachineBasicBlock *DefaultMBB);
/// A class which encapsulates all of the information needed to generate a
/// stack protector check and signals to isel via its state being initialized
/// that a stack protector needs to be generated.
@@ -670,29 +733,6 @@ private:
void visitIndirectBr(const IndirectBrInst &I);
void visitUnreachable(const UnreachableInst &I);
// Helpers for visitSwitch
bool handleSmallSwitchRange(CaseRec& CR,
CaseRecVector& WorkList,
const Value* SV,
MachineBasicBlock* Default,
MachineBasicBlock *SwitchBB);
bool handleJTSwitchCase(CaseRec& CR,
CaseRecVector& WorkList,
const Value* SV,
MachineBasicBlock* Default,
MachineBasicBlock *SwitchBB);
bool handleBTSplitSwitchCase(CaseRec& CR,
CaseRecVector& WorkList,
const Value* SV,
MachineBasicBlock *SwitchBB);
void splitSwitchCase(CaseRec &CR, CaseItr Pivot, CaseRecVector &WorkList,
const Value *SV, MachineBasicBlock *SwitchBB);
bool handleBitTestsSwitchCase(CaseRec& CR,
CaseRecVector& WorkList,
const Value* SV,
MachineBasicBlock* Default,
MachineBasicBlock *SwitchBB);
uint32_t getEdgeWeight(const MachineBasicBlock *Src,
const MachineBasicBlock *Dst) const;
void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst,


@@ -1459,21 +1459,15 @@ SelectionDAGISel::FinishBasicBlock() {
<< FuncInfo->PHINodesToUpdate[i].first
<< ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
const bool MustUpdatePHINodes = SDB->SwitchCases.empty() &&
SDB->JTCases.empty() &&
SDB->BitTestCases.empty();
// Next, now that we know what the last MBB the LLVM BB expanded is, update
// PHI nodes in successors.
if (MustUpdatePHINodes) {
for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
assert(PHI->isPHI() &&
"This is not a machine PHI node that we are updating!");
if (!FuncInfo->MBB->isSuccessor(PHI->getParent()))
continue;
PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
}
for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
assert(PHI->isPHI() &&
"This is not a machine PHI node that we are updating!");
if (!FuncInfo->MBB->isSuccessor(PHI->getParent()))
continue;
PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
}
// Handle stack protector.
@@ -1518,10 +1512,6 @@ SelectionDAGISel::FinishBasicBlock() {
SDB->SPDescriptor.resetPerBBState();
}
// If we updated PHI Nodes, return early.
if (MustUpdatePHINodes)
return;
for (unsigned i = 0, e = SDB->BitTestCases.size(); i != e; ++i) {
// Lower header first, if it wasn't already lowered
if (!SDB->BitTestCases[i].Emitted) {
@@ -1635,16 +1625,6 @@ SelectionDAGISel::FinishBasicBlock() {
}
SDB->JTCases.clear();
// If the switch block involved a branch to one of the actual successors, we
// need to update PHI nodes in that block.
for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
assert(PHI->isPHI() &&
"This is not a machine PHI node that we are updating!");
if (FuncInfo->MBB->isSuccessor(PHI->getParent()))
PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
}
// If we generated any switch lowering information, build and codegen any
// additional DAGs necessary.
for (unsigned i = 0, e = SDB->SwitchCases.size(); i != e; ++i) {


@@ -4,8 +4,8 @@
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: t1:
; CHECK: cmp r2, #1
; CHECK: cmpne r2, #7
; CHECK: cmp r2, #7
; CHECK: cmpne r2, #1
switch i32 %c, label %cond_next [
i32 1, label %cond_true
i32 7, label %cond_true


@@ -194,7 +194,7 @@ lor.lhs.false459: ; preds = %if.end454
%18 = load i32, i32* %mb_type, align 4
switch i32 %18, label %for.inc503 [
i32 9, label %if.then475
i32 10, label %if.then475
i32 11, label %if.then475
i32 13, label %if.then475
i32 14, label %if.then475
]


@@ -17,9 +17,9 @@ entry:
; CHECK: BB#0: derived from LLVM BB %entry
; CHECK: Successors according to CFG: BB#2(64) BB#4(14)
; CHECK: BB#4: derived from LLVM BB %entry
; CHECK: Successors according to CFG: BB#1(10) BB#5(4)
; CHECK: Successors according to CFG: BB#1(4) BB#5(10)
; CHECK: BB#5: derived from LLVM BB %entry
; CHECK: Successors according to CFG: BB#1(4) BB#3(7)
; CHECK: Successors according to CFG: BB#1(10) BB#3(7)
sw.bb:
br label %return


@@ -1,5 +1,5 @@
; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
; RUN: llc -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck %s
; RUN: llc -mcpu=pwr7 -code-model=medium <%s | FileCheck %s
; RUN: llc -mcpu=pwr7 -code-model=large <%s | FileCheck %s
; Test correct code generation for medium and large code model
; for loading the address of a jump table from the TOC.


@@ -3,6 +3,12 @@
; RUN: llc -O0 -mcpu=pwr7 -code-model=large -filetype=obj -fast-isel=false %s -o - | \
; RUN: llvm-readobj -r | FileCheck -check-prefix=LARGE %s
; Run jump table test separately since jump tables aren't generated at -O0.
; RUN: llc -mcpu=pwr7 -code-model=medium -filetype=obj -fast-isel=false %s -o - | \
; RUN: llvm-readobj -r | FileCheck -check-prefix=MEDIUM-JT %s
; RUN: llc -mcpu=pwr7 -code-model=large -filetype=obj -fast-isel=false %s -o - | \
; RUN: llvm-readobj -r | FileCheck -check-prefix=LARGE-JT %s
; FIXME: When asm-parse is available, could make this an assembly test.
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
@@ -92,6 +98,46 @@ entry:
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM4:[^ ]+]]
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM4]]
@ti = common global i32 0, align 4
define signext i32 @test_tentative() nounwind {
entry:
%0 = load i32, i32* @ti, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* @ti, align 4
ret i32 %0
}
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for
; accessing tentatively declared variable ti.
;
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM6:[^ ]+]]
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM6]]
;
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM6:[^ ]+]]
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM6]]
define i8* @test_fnaddr() nounwind {
entry:
%func = alloca i32 (i32)*, align 8
store i32 (i32)* @foo, i32 (i32)** %func, align 8
%0 = load i32 (i32)*, i32 (i32)** %func, align 8
%1 = bitcast i32 (i32)* %0 to i8*
ret i8* %1
}
declare signext i32 @foo(i32 signext)
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for
; accessing function address foo.
;
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM7:[^ ]+]]
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM7]]
;
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM7:[^ ]+]]
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM7]]
define signext i32 @test_jump_table(i32 signext %i) nounwind {
entry:
%i.addr = alloca i32, align 4
@@ -139,47 +185,12 @@ sw.epilog: ; preds = %sw.bb3, %sw.default
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for
; accessing a jump table address.
;
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM5:[^ ]+]]
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM5]]
; MEDIUM-JT: Relocations [
; MEDIUM-JT: Section (2) .rela.text {
; MEDIUM-JT-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM:[^ ]+]]
; MEDIUM-JT-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM]]
;
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM5:[^ ]+]]
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM5]]
@ti = common global i32 0, align 4
define signext i32 @test_tentative() nounwind {
entry:
%0 = load i32, i32* @ti, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* @ti, align 4
ret i32 %0
}
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for
; accessing tentatively declared variable ti.
;
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM6:[^ ]+]]
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM6]]
;
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM6:[^ ]+]]
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM6]]
define i8* @test_fnaddr() nounwind {
entry:
%func = alloca i32 (i32)*, align 8
store i32 (i32)* @foo, i32 (i32)** %func, align 8
%0 = load i32 (i32)*, i32 (i32)** %func, align 8
%1 = bitcast i32 (i32)* %0 to i8*
ret i8* %1
}
declare signext i32 @foo(i32 signext)
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for
; accessing function address foo.
;
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM7:[^ ]+]]
; MEDIUM-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM7]]
;
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM7:[^ ]+]]
; LARGE-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM7]]
; LARGE-JT: Relocations [
; LARGE-JT: Section (2) .rela.text {
; LARGE-JT-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM:[^ ]+]]
; LARGE-JT-NEXT: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM]]


@@ -55,13 +55,15 @@ entry:
]
bb: ; preds = %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry
call void @_Z3bari( i32 0 )
br label %bb1
bb1: ; preds = %bb, %entry
call void @_Z3bari( i32 1 )
br label %bb2
bb2: ; preds = %bb1, %entry
call void @_Z3bari( i32 1 )
call void @_Z3bari( i32 2 )
br label %bb11
bb3: ; preds = %entry


@@ -140,19 +140,17 @@ sw.epilog:
; The balanced binary switch here would start with a comparison against 39, but
; it is currently starting with 29 because of the density-sum heuristic.
; CHECK: cmpl $29
; CHECK: cmpl $39
; CHECK: jg
; CHECK: cmpl $10
; CHECK: jne
; CHECK: cmpl $49
; CHECK: jg
; CHECK: cmpl $30
; CHECK: jne
; CHECK: je
; CHECK: cmpl $20
; CHECK: jne
; CHECK: cmpl $40
; CHECK: je
; CHECK: cmpl $50
; CHECK: jne
; CHECK: cmpl $40
; CHECK: cmpl $30
; CHECK: jne
; CHECK: cmpl $60
; CHECK: jne


@@ -0,0 +1,288 @@
; RUN: llc -march=x86-64 %s -o - | FileCheck %s
; RUN: llc -march=x86-64 %s -o - -O0 | FileCheck --check-prefix=NOOPT %s
declare void @g(i32)
define void @basic(i32 %x) {
entry:
switch i32 %x, label %return [
i32 3, label %bb0
i32 1, label %bb1
i32 4, label %bb1
i32 5, label %bb0
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
return: ret void
; Should be lowered as straight compares in -O0 mode.
; NOOPT-LABEL: basic
; NOOPT: subl $3, %eax
; NOOPT: je
; NOOPT: subl $1, %eax
; NOOPT: je
; NOOPT: subl $4, %eax
; NOOPT: je
; NOOPT: subl $5, %eax
; NOOPT: je
; Jump table otherwise.
; CHECK-LABEL: basic
; CHECK: decl
; CHECK: cmpl $4
; CHECK: ja
; CHECK: jmpq *.LJTI
}
define void @simple_ranges(i32 %x) {
entry:
switch i32 %x, label %return [
i32 0, label %bb0
i32 1, label %bb0
i32 2, label %bb0
i32 3, label %bb0
i32 100, label %bb1
i32 101, label %bb1
i32 102, label %bb1
i32 103, label %bb1
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
return: ret void
; Should be lowered to two range checks.
; CHECK-LABEL: simple_ranges
; CHECK: leal -100
; CHECK: cmpl $4
; CHECK: jae
; CHECK: cmpl $3
; CHECK: ja
}
define void @jt_is_better(i32 %x) {
entry:
switch i32 %x, label %return [
i32 0, label %bb0
i32 2, label %bb0
i32 4, label %bb0
i32 1, label %bb1
i32 3, label %bb1
i32 5, label %bb1
i32 6, label %bb2
i32 7, label %bb3
i32 8, label %bb4
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
bb2: tail call void @g(i32 2) br label %return
bb3: tail call void @g(i32 3) br label %return
bb4: tail call void @g(i32 4) br label %return
return: ret void
; Cases 0-5 could be lowered with two bit tests,
; but with 6-8, the whole switch is suitable for a jump table.
; CHECK-LABEL: jt_is_better
; CHECK: cmpl $8
; CHECK: jbe
; CHECK: jmpq *.LJTI
}
define void @bt_is_better(i32 %x) {
entry:
switch i32 %x, label %return [
i32 0, label %bb0
i32 3, label %bb0
i32 6, label %bb0
i32 1, label %bb1
i32 4, label %bb1
i32 7, label %bb1
i32 2, label %bb2
i32 5, label %bb2
i32 8, label %bb2
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
bb2: tail call void @g(i32 2) br label %return
return: ret void
; This could be lowered as a jump table, but bit tests is more efficient.
; CHECK-LABEL: bt_is_better
; 73 = 2^0 + 2^3 + 2^6
; CHECK: movl $73
; CHECK: btl
; 146 = 2^1 + 2^4 + 2^7
; CHECK: movl $146
; CHECK: btl
; 292 = 2^2 + 2^5 + 2^8
; CHECK: movl $292
; CHECK: btl
}
define void @optimal_pivot1(i32 %x) {
entry:
switch i32 %x, label %return [
i32 100, label %bb0
i32 200, label %bb1
i32 300, label %bb0
i32 400, label %bb1
i32 500, label %bb0
i32 600, label %bb1
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
return: ret void
; Should pivot around 400 for two subtrees of equal size.
; CHECK-LABEL: optimal_pivot1
; CHECK-NOT: cmpl
; CHECK: cmpl $399
}
define void @optimal_pivot2(i32 %x) {
entry:
switch i32 %x, label %return [
i32 100, label %bb0 i32 101, label %bb1 i32 102, label %bb2 i32 103, label %bb3
i32 200, label %bb0 i32 201, label %bb1 i32 202, label %bb2 i32 203, label %bb3
i32 300, label %bb0 i32 301, label %bb1 i32 302, label %bb2 i32 303, label %bb3
i32 400, label %bb0 i32 401, label %bb1 i32 402, label %bb2 i32 403, label %bb3
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
bb2: tail call void @g(i32 2) br label %return
bb3: tail call void @g(i32 3) br label %return
return: ret void
; Should pivot around 300 for two subtrees with two jump tables each.
; CHECK-LABEL: optimal_pivot2
; CHECK-NOT: cmpl
; CHECK: cmpl $299
; CHECK: jmpq *.LJTI
; CHECK: jmpq *.LJTI
; CHECK: jmpq *.LJTI
; CHECK: jmpq *.LJTI
}
define void @optimal_jump_table1(i32 %x) {
entry:
switch i32 %x, label %return [
i32 0, label %bb0
i32 5, label %bb1
i32 6, label %bb2
i32 12, label %bb3
i32 13, label %bb4
i32 15, label %bb5
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
bb2: tail call void @g(i32 2) br label %return
bb3: tail call void @g(i32 3) br label %return
bb4: tail call void @g(i32 4) br label %return
bb5: tail call void @g(i32 5) br label %return
return: ret void
; Splitting in the largest gap (between 6 and 12) would yield suboptimal result.
; Expecting a jump table from 5 to 15.
; CHECK-LABEL: optimal_jump_table1
; CHECK: leal -5
; CHECK: cmpl $10
; CHECK: jmpq *.LJTI
}
define void @optimal_jump_table2(i32 %x) {
entry:
switch i32 %x, label %return [
i32 0, label %bb0
i32 1, label %bb1
i32 2, label %bb2
i32 9, label %bb3
i32 14, label %bb4
i32 15, label %bb5
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
bb2: tail call void @g(i32 2) br label %return
bb3: tail call void @g(i32 3) br label %return
bb4: tail call void @g(i32 4) br label %return
bb5: tail call void @g(i32 5) br label %return
return: ret void
; Partitioning the cases to the minimum number of dense sets is not good enough.
; This can be partitioned as {0,1,2,9},{14,15} or {0,1,2},{9,14,15}. The former
; should be preferred. Expecting a table from 0-9.
; CHECK-LABEL: optimal_jump_table2
; CHECK: cmpl $9
; CHECK: jmpq *.LJTI
}
define void @optimal_jump_table3(i32 %x) {
entry:
switch i32 %x, label %return [
i32 1, label %bb0
i32 2, label %bb1
i32 3, label %bb2
i32 10, label %bb3
i32 13, label %bb0
i32 14, label %bb1
i32 15, label %bb2
i32 20, label %bb3
i32 25, label %bb4
]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
bb2: tail call void @g(i32 2) br label %return
bb3: tail call void @g(i32 3) br label %return
bb4: tail call void @g(i32 4) br label %return
return: ret void
; Splitting to maximize left-right density sum and gap size would split this
; between 3 and 10, and then between 20 and 25. It's better to build a table
; from 1-20.
; CHECK-LABEL: optimal_jump_table3
; CHECK: leal -1
; CHECK: cmpl $19
; CHECK: jmpq *.LJTI
}
%struct.S = type { %struct.S*, i32 }
define void @phi_node_trouble(%struct.S* %s) {
entry:
br label %header
header:
%ptr = phi %struct.S* [ %s, %entry ], [ %next, %loop ]
%bool = icmp eq %struct.S* %ptr, null
br i1 %bool, label %exit, label %loop
loop:
%nextptr = getelementptr inbounds %struct.S, %struct.S* %ptr, i64 0, i32 0
%next = load %struct.S*, %struct.S** %nextptr
%xptr = getelementptr inbounds %struct.S, %struct.S* %next, i64 0, i32 1
%x = load i32, i32* %xptr
switch i32 %x, label %exit [
i32 4, label %header
i32 36, label %exit2
i32 69, label %exit2
i32 25, label %exit2
]
exit:
ret void
exit2:
ret void
; This will be lowered to a comparison with 4 and then bit tests. Make sure
; that the phi node in %header gets a value from the comparison block.
; CHECK-LABEL: phi_node_trouble
; CHECK: movq (%[[REG1:[a-z]+]]), %[[REG1]]
; CHECK: movl 8(%[[REG1]]), %[[REG2:[a-z]+]]
; CHECK: cmpl $4, %[[REG2]]
}


@@ -1,8 +1,8 @@
;; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort=1 \
;; RUN: llc -verify-machineinstrs \
;; RUN: -mtriple=armv7-linux-gnueabi -filetype=obj %s -o - | \
;; RUN: llvm-readobj -t | FileCheck -check-prefix=ARM %s
;; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort=1 \
;; RUN: llc -verify-machineinstrs \
;; RUN: -mtriple=thumbv7-linux-gnueabi -filetype=obj %s -o - | \
;; RUN: llvm-readobj -t | FileCheck -check-prefix=TMB %s
@@ -11,102 +11,25 @@
define void @foo(i32* %ptr) nounwind ssp {
%tmp = load i32, i32* %ptr, align 4
switch i32 %tmp, label %default [
i32 11, label %bb0
i32 10, label %bb1
i32 8, label %bb2
i32 4, label %bb3
i32 2, label %bb4
i32 6, label %bb5
i32 9, label %bb6
i32 15, label %bb7
i32 1, label %bb8
i32 3, label %bb9
i32 5, label %bb10
i32 30, label %bb11
i32 31, label %bb12
i32 13, label %bb13
i32 14, label %bb14
i32 20, label %bb15
i32 19, label %bb16
i32 17, label %bb17
i32 18, label %bb18
i32 21, label %bb19
i32 22, label %bb20
i32 16, label %bb21
i32 24, label %bb22
i32 25, label %bb23
i32 26, label %bb24
i32 27, label %bb25
i32 28, label %bb26
i32 23, label %bb27
i32 12, label %bb28
switch i32 %tmp, label %exit [
i32 0, label %bb0
i32 1, label %bb1
i32 2, label %bb2
i32 3, label %bb3
]
default:
br label %exit
bb0:
store i32 0, i32* %ptr, align 4
br label %exit
bb1:
store i32 1, i32* %ptr, align 4
br label %exit
bb2:
store i32 2, i32* %ptr, align 4
br label %exit
bb3:
store i32 3, i32* %ptr, align 4
br label %exit
bb4:
br label %exit
bb5:
br label %exit
bb6:
br label %exit
bb7:
br label %exit
bb8:
br label %exit
bb9:
br label %exit
bb10:
br label %exit
bb11:
br label %exit
bb12:
br label %exit
bb13:
br label %exit
bb14:
br label %exit
bb15:
br label %exit
bb16:
br label %exit
bb17:
br label %exit
bb18:
br label %exit
bb19:
br label %exit
bb20:
br label %exit
bb21:
br label %exit
bb22:
br label %exit
bb23:
br label %exit
bb24:
br label %exit
bb25:
br label %exit
bb26:
br label %exit
bb27:
br label %exit
bb28:
br label %exit
exit:
ret void
}