From 9e85980658ad017c5c916beab05eba5a71d83d5e Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Fri, 17 Jul 2015 23:18:30 +0000
Subject: [PATCH] ARM: Enable MachineScheduler and disable PostRAScheduler for
 swift.

Reapply r242500 now that the swift schedmodel includes LDRLIT.

This is mostly done to disable the PostRAScheduler, which optimizes for
instruction latencies and is therefore not a good fit for out-of-order
architectures. It also allows us to leave out the itinerary table for
swift in favor of the SchedModel definitions.

This change leads to performance improvements/regressions by as much as
10% in some benchmarks; in fact, we lose 0.4% performance over the
llvm-testsuite for reasons that appear to be unknown or out of the
compiler's control. rdar://20803802 documents the investigation of
these effects.

While it is probably a good idea to perform the same switch for the
other ARM out-of-order CPUs, I limited this change to swift as I cannot
perform the benchmark verification on the other CPUs.

Differential Revision: http://reviews.llvm.org/D10513

llvm-svn: 242588
---
 llvm/include/llvm/MC/MCSchedule.h       |    3 +
 llvm/lib/Target/ARM/ARMScheduleSwift.td | 1038 -----------------------
 llvm/lib/Target/ARM/ARMSubtarget.cpp    |   11 +
 llvm/lib/Target/ARM/ARMSubtarget.h      |    3 +
 llvm/test/CodeGen/ARM/adv-copy-opt.ll   |   14 +-
 llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll |   16 +-
 llvm/test/CodeGen/ARM/cmpxchg-idioms.ll |    6 +-
 llvm/test/CodeGen/ARM/test-sharedidx.ll |   16 +-
 llvm/test/CodeGen/ARM/vector-load.ll    |    4 +-
 llvm/test/CodeGen/ARM/vector-store.ll   |    6 +-
 10 files changed, 48 insertions(+), 1069 deletions(-)

diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h
index c09791631056..a8b20570103e 100644
--- a/llvm/include/llvm/MC/MCSchedule.h
+++ b/llvm/include/llvm/MC/MCSchedule.h
@@ -206,6 +206,9 @@ struct MCSchedModel {
   /// scheduling class (itinerary class or SchedRW list).
   bool isComplete() const { return CompleteModel; }
 
+  /// Return true if machine supports out of order execution.
+  bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
+
   unsigned getNumProcResourceKinds() const {
     return NumProcResourceKinds;
   }
diff --git a/llvm/lib/Target/ARM/ARMScheduleSwift.td b/llvm/lib/Target/ARM/ARMScheduleSwift.td
index ff82ca419089..6f5740fd1305 100644
--- a/llvm/lib/Target/ARM/ARMScheduleSwift.td
+++ b/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -37,1050 +37,12 @@ def SW_FDIV : FuncUnit;
 // FIXME: Add preload instruction when it is documented.
 // FIXME: Model non-pipelined nature of FP div / sqrt unit.
 
-def SwiftItineraries : ProcessorItineraries<
-  [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
-  //
-  // Move instructions, unconditional
-  InstrItinData<IIC_iMOVi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iMOVr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [2]>,
-  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1]>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1]>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                                 [3]>,
-  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
-                               InstrStage<1, [SW_LS]>],
-                              [5]>,
-  //
-  // MVN instructions
-  InstrItinData<IIC_iMVNi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iMVNr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  //
-  // No operand cycles
-  InstrItinData<IIC_iALUx   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
-  //
-  // Binary Instructions that produce a result
-  InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1]>,
-  InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1, 1]>,
-  InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [2, 1, 1]>,
-  InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [2, 1, 1]>,
-  InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [2, 1, 1, 1]>,
-  //
-  // Bitwise Instructions that produce a result
-  InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1]>,
-  InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1, 1]>,
-  InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [2, 1, 1]>,
-  InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [2, 1, 1, 1]>,
-  //
-  // Unary Instructions that produce a result
-
-  // CLZ, RBIT, etc.
-  InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1]>,
-
-  // BFC, BFI, UBFX, SBFX
-  InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [2, 1]>,
-
-  //
-  // Zero and sign extension instructions
-  InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1]>,
-  InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1, 1]>,
-  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                            [1, 1, 1, 1]>,
-  //
-  // Compare instructions
-  InstrItinData<IIC_iCMPi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iCMPr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1, 1]>,
-  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
-                              [1, 1]>,
-  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
-                              [1, 1, 1]>,
-  //
-  // Test instructions
-  InstrItinData<IIC_iTSTi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iTSTr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1, 1]>,
-  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
-                              [1, 1]>,
-  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
-                              [1, 1, 1]>,
-  //
-  // Move instructions, conditional
-  // FIXME: Correctly model the extra input dep on the destination.
-  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1]>,
-  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1, 1]>,
-  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [1, 1]>,
-  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [2, 1, 1]>,
-  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [2]>,
-
-  // Integer multiply pipeline
-  //
-  InstrItinData<IIC_iMUL16  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [3, 1, 1]>,
-  InstrItinData<IIC_iMAC16  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [3, 1, 1, 1]>,
-  InstrItinData<IIC_iMUL32  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  InstrItinData<IIC_iMAC32  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1, 1]>,
-  InstrItinData<IIC_iMUL64  , [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0], 1>,
-                               InstrStage<1, [SW_ALU0], 3>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [5, 5, 1, 1]>,
-  InstrItinData<IIC_iMAC64  , [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0], 1>,
-                               InstrStage<1, [SW_ALU0], 1>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [5, 6, 1, 1]>,
-  //
-  // Integer divide
-  InstrItinData<IIC_iDIV  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                             InstrStage<1, [SW_ALU0], 0>,
-                             InstrStage<14, [SW_IDIV]>],
-                            [14, 1, 1]>,
-
-  // Integer load pipeline
-  // FIXME: The timings are some rough approximations
-  //
-  // Immediate offset
-  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1]>,
-  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1]>,
-  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_LS], 1>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 4, 1]>,
-  //
-  // Register offset
-  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1, 1]>,
-  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1, 1]>,
-  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS], 1>,
-                                 InstrStage<1, [SW_LS], 3>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                                [3, 4, 1, 1]>,
-  //
-  // Scaled register offset
-  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                 InstrStage<1, [SW_LS]>],
-                                [5, 1, 1]>,
-  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                 InstrStage<1, [SW_LS]>],
-                                [5, 1, 1]>,
-  //
-  // Immediate offset with update
-  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1, 1]>,
-  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1, 1]>,
-  //
-  // Register offset with update
-  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_ALU0], 1>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1, 1, 1]>,
-  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_ALU0], 1>,
-                                 InstrStage<1, [SW_LS]>],
-                                [3, 1, 1, 1]>,
-  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_DIS2], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
-                                 InstrStage<1, [SW_LS], 3>,
-                                 InstrStage<1, [SW_LS], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                                [3, 4, 1, 1]>,
-  //
-  // Scaled register offset with update
-  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_DIS2], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                 InstrStage<1, [SW_LS], 3>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                                [5, 3, 1, 1]>,
-  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_DIS2], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                  InstrStage<1, [SW_LS], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                                [5, 3, 1, 1]>,
-  //
-  // Load multiple, def is the 5th operand.
-  // FIXME: This assumes 3 to 4 registers.
-  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_DIS2], 0>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS]>],
-                               [1, 1, 1, 1, 3], [], -1>, // dynamic uops
-
-  //
-  // Load multiple + update, defs are the 1st and 5th operands.
-  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_DIS2], 0>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
-                                InstrStage<1, [SW_LS], 3>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                               [2, 1, 1, 1, 3], [], -1>, // dynamic uops
-  //
-  // Load multiple plus branch
-  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_DIS2], 0>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS]>],
-                               [1, 1, 1, 1, 3], [], -1>, // dynamic uops
-  //
-  // Pop, def is the 3rd operand.
-  InstrItinData<IIC_iPop  ,    [InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS]>],
-                               [1, 1, 3], [], -1>, // dynamic uops
-  //
-  // Pop + branch, def is the 3rd operand.
-  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS]>],
-                               [1, 1, 3], [], -1>, // dynamic uops
-
-  //
-  // iLoadi + iALUr for t2LDRpci_pic.
-  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                InstrStage<1, [SW_LS], 3>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                               [4, 1]>,
-
-  // Integer store pipeline
-  ///
-  // Immediate offset
-  InstrItinData<IIC_iStore_i  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [1, 1]>,
-  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [1, 1]>,
-  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                 InstrStage<1, [SW_LS]>],
-                                [1, 1]>,
-  //
-  // Register offset
-  InstrItinData<IIC_iStore_r  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [1, 1, 1]>,
-  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS]>],
-                                [1, 1, 1]>,
-  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>,
-                                 InstrStage<1, [SW_DIS1], 0>,
-                                 InstrStage<1, [SW_DIS2], 0>,
-                                 InstrStage<1, [SW_LS], 0>,
-                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                 InstrStage<1, [SW_LS]>],
-                                [1, 1, 1]>,
-  //
-  // Scaled register offset
-  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                  InstrStage<1, [SW_LS]>],
-                                 [1, 1, 1]>,
-  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                  InstrStage<1, [SW_LS]>],
-                                 [1, 1, 1]>,
-  //
-  // Immediate offset with update
-  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                  InstrStage<1, [SW_LS]>],
-                                 [1, 1, 1]>,
-  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                  InstrStage<1, [SW_LS]>],
-                                 [1, 1, 1]>,
-  //
-  // Register offset with update
-  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                  InstrStage<1, [SW_LS]>],
-                                 [1, 1, 1, 1]>,
-  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                  InstrStage<1, [SW_LS]>],
-                                 [1, 1, 1, 1]>,
-  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>,
-                                  InstrStage<1, [SW_DIS1], 0>,
-                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                  InstrStage<1, [SW_LS]>],
-                                 [1, 1, 1, 1]>,
-  //
-  // Scaled register offset with update
-  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [SW_DIS0], 0>,
-                                    InstrStage<1, [SW_DIS1], 0>,
-                                    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                    InstrStage<1, [SW_LS], 0>,
-                                    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
-                                   [3, 1, 1, 1]>,
-  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>,
-                                    InstrStage<1, [SW_DIS1], 0>,
-                                    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
-                                    InstrStage<1, [SW_LS], 0>,
-                                    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
-                                   [3, 1, 1, 1]>,
-  //
-  // Store multiple
-  InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_DIS2], 0>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS], 1>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS], 1>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS]>],
-                                [], [], -1>, // dynamic uops
-  //
-  // Store multiple + update
-  InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_DIS2], 0>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS], 1>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS], 1>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
-                                InstrStage<1, [SW_LS]>],
-                               [2], [], -1>, // dynamic uops
-
-  //
-  // Preload
-  InstrItinData<IIC_Preload,   [InstrStage<1, [SW_DIS0], 0>], [1, 1]>,
-
-  // Branch
-  //
-  // no delay slots, so the latency of a branch is unimportant
-  InstrItinData<IIC_Br       , [InstrStage<1, [SW_DIS0], 0>]>,
-
-  // FP Special Register to Integer Register File Move
-  InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                              InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                             [1]>,
-  //
-  // Single-precision FP Unary
-  //
-  // Most floating-point moves get issued on ALU0.
-  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1]>,
-  //
-  // Double-precision FP Unary
-  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1]>,
-
-  //
-  // Single-precision FP Compare
-  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [1, 1]>,
-  //
-  // Double-precision FP Compare
-  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [1, 1]>,
-  //
-  // Single to Double FP Convert
-  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1]>,
-  //
-  // Double to Single FP Convert
-  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1]>,
-
-  //
-  // Single to Half FP Convert
-  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_ALU1], 4>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [6, 1]>,
-  //
-  // Half to Single FP Convert
-  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1]>,
-
-  //
-  // Single-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1]>,
-  //
-  // Double-Precision FP to Integer Convert
-  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1]>,
-  //
-  // Integer to Single-Precision FP Convert
-  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1]>,
-  //
-  // Integer to Double-Precision FP Convert
-  InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1]>,
-  //
-  // Single-precision FP ALU
-  InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Double-precision FP ALU
-  InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Single-precision FP Multiply
-  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1]>,
-  //
-  // Double-precision FP Multiply
-  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [6, 1, 1]>,
-  //
-  // Single-precision FP MAC
-  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Double-precision FP MAC
-  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [12, 1, 1]>,
-  //
-  // Single-precision Fused FP MAC
-  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Double-precision Fused FP MAC
-  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [12, 1, 1]>,
-  //
-  // Single-precision FP DIV
-  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 0>,
-                               InstrStage<15, [SW_FDIV]>],
-                              [17, 1, 1]>,
-  //
-  // Double-precision FP DIV
-  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 0>,
-                               InstrStage<30, [SW_FDIV]>],
-                              [32, 1, 1]>,
-  //
-  // Single-precision FP SQRT
-  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 0>,
-                               InstrStage<15, [SW_FDIV]>],
-                              [17, 1]>,
-  //
-  // Double-precision FP SQRT
-  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 0>,
-                               InstrStage<30, [SW_FDIV]>],
-                              [32, 1, 1]>,
-
-  //
-  // Integer to Single-precision Move
-  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_LS], 4>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [6, 1]>,
-  //
-  // Integer to Double-precision Move
-  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [4, 1]>,
-  //
-  // Single-precision to Integer Move
-  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [3, 1]>,
-  //
-  // Double-precision to Integer Move
-  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_LS], 3>,
-                               InstrStage<1, [SW_LS]>],
-                              [3, 4, 1]>,
-  //
-  // Single-precision FP Load
-  InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [4, 1]>,
-  //
-  // Double-precision FP Load
-  InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [4, 1]>,
-  //
-  // FP Load Multiple
-  // FIXME: Assumes a single Q register.
-  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [1, 1, 1, 4], [], -1>, // dynamic uops
-  //
-  // FP Load Multiple + update
-  // FIXME: Assumes a single Q register.
-  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_LS], 4>,
-                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                              [2, 1, 1, 1, 4], [], -1>, // dynamic uops
-  //
-  // Single-precision FP Store
-  InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [1, 1]>,
-  //
-  // Double-precision FP Store
-  InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [1, 1]>,
-  //
-  // FP Store Multiple
-  // FIXME: Assumes a single Q register.
-  InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [1, 1, 1], [], -1>, // dynamic uops
-  //
-  // FP Store Multiple + update
-  // FIXME: Assumes a single Q register.
-  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>,
-                                InstrStage<1, [SW_DIS1], 0>,
-                                InstrStage<1, [SW_LS], 4>,
-                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
-                               [2, 1, 1, 1], [], -1>, // dynamic uops
-  // NEON
-  //
-  // Double-register Integer Unary
-  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1]>,
-  //
-  // Quad-register Integer Unary
-  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1]>,
-  //
-  // Double-register Integer Q-Unary
-  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1]>,
-  //
-  // Quad-register Integer CountQ-Unary
-  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1]>,
-  //
-  // Double-register Integer Binary
-  InstrItinData<IIC_VBINiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Quad-register Integer Binary
-  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Double-register Integer Subtract
-  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Quad-register Integer Subtract
-  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Double-register Integer Shift
-  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Quad-register Integer Shift
-  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Double-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register Integer Shift (4 cycle)
-  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Double-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register Integer Binary (4 cycle)
-  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Double-register Integer Subtract (4 cycle)
-  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register Integer Subtract (4 cycle)
-  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-
-  //
-  // Double-register Integer Count
-  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Quad-register Integer Count
-  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1, 1]>,
-  //
-  // Double-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1, 1]>,
-  //
-  // Quad-register Absolute Difference and Accumulate
-  InstrItinData<IIC_VABAQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1, 1]>,
-  //
-  // Double-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register Integer Pair Add Long
-  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-
-  //
-  // Double-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register Integer Multiply (.8, .16)
-  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1]>,
-
-  //
-  // Double-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register Integer Multiply (.32)
-  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1]>,
-  //
-  // Double-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1, 1]>,
-  //
-  // Double-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1, 1]>,
-  //
-  // Quad-register Integer Multiply-Accumulate (.8, .16)
-  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1, 1]>,
-  //
-  // Quad-register Integer Multiply-Accumulate (.32)
-  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1, 1]>,
-
-  //
-  // Move
-  InstrItinData<IIC_VMOV,     [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1]>,
-  //
-  // Move Immediate
-  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2]>,
-  //
-  // Double-register Permute Move
-  InstrItinData<IIC_VMOVD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [2, 1]>,
-  //
-  // Quad-register Permute Move
-  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [2, 1]>,
-  //
-  // Integer to Single-precision Move
-  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_LS], 4>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [6, 1]>,
-  //
-  // Integer to Double-precision Move
-  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [4, 1, 1]>,
-  //
-  // Single-precision to Integer Move
-  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_LS]>],
-                              [3, 1]>,
-  //
-  // Double-precision to Integer Move
-  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_LS], 3>,
-                               InstrStage<1, [SW_LS]>],
-                              [3, 4, 1]>,
-  //
-  // Integer to Lane Move
-  // FIXME: I think this is correct, but it is not clear from the tuning guide.
-  InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_LS], 4>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [6, 1]>,
-
-  //
-  // Vector narrow move
-  InstrItinData<IIC_VMOVN,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [2, 1]>,
-  //
-  // Double-register FP Unary
-  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
-  //        and they issue on a different pipeline.
-  InstrItinData<IIC_VUNAD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1]>,
-  //
-  // Quad-register FP Unary
-  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
-  //        and they issue on a different pipeline.
-  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [2, 1]>,
-  //
-  // Double-register FP Binary
-  // FIXME: We're using this itin for many instructions.
-  InstrItinData<IIC_VBIND,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-
-  //
-  // VPADD, etc.
-  InstrItinData<IIC_VPBIND,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Double-register FP VMUL
-  InstrItinData<IIC_VFMULD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register FP Binary
-  InstrItinData<IIC_VBINQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU0]>],
-                              [4, 1, 1]>,
-  //
-  // Quad-register FP VMUL
-  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 1]>,
-  //
-  // Double-register FP Multiple-Accumulate
-  InstrItinData<IIC_VMACD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Quad-register FP Multiple-Accumulate
-  InstrItinData<IIC_VMACQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Double-register Fused FP Multiple-Accumulate
-  InstrItinData<IIC_VFMACD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Quad-register FusedF P Multiple-Accumulate
-  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Double-register Reciprical Step
-  InstrItinData<IIC_VRECSD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Quad-register Reciprical Step
-  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 1]>,
-  //
-  // Double-register Permute
-  // FIXME: The latencies are unclear from the documentation.
-  InstrItinData<IIC_VPERMD,   [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [3, 4, 3, 4]>,
-  //
-  // Quad-register Permute
-  // FIXME: The latencies are unclear from the documentation.
-  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [3, 4, 3, 4]>,
-  //
-  // Quad-register Permute (3 cycle issue on A9)
-  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [3, 4, 3, 4]>,
-
-  //
-  // Double-register VEXT
-  InstrItinData<IIC_VEXTD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [2, 1, 1]>,
-  //
-  // Quad-register VEXT
-  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [2, 1, 1]>,
-  //
-  // VTB
-  InstrItinData<IIC_VTB1,     [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [2, 1, 1]>,
-  InstrItinData<IIC_VTB2,     [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 3, 3]>,
-  InstrItinData<IIC_VTB3,     [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [6, 1, 3, 5, 5]>,
-  InstrItinData<IIC_VTB4,     [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 3, 5, 7, 7]>,
-  //
-  // VTBX
-  InstrItinData<IIC_VTBX1,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [2, 1, 1]>,
-  InstrItinData<IIC_VTBX2,    [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [4, 1, 3, 3]>,
-  InstrItinData<IIC_VTBX3,    [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [6, 1, 3, 5, 5]>,
-  InstrItinData<IIC_VTBX4,    [InstrStage<1, [SW_DIS0], 0>,
-                               InstrStage<1, [SW_DIS1], 0>,
-                               InstrStage<1, [SW_DIS2], 0>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1], 2>,
-                               InstrStage<1, [SW_ALU1]>],
-                              [8, 1, 3, 5, 7, 7]>
-]>;
-
-// ===---------------------------------------------------------------------===//
-// This following definitions describe the simple machine model which
-// will replace itineraries.
-
 // Swift machine model for scheduling and other instruction cost heuristics.
 def SwiftModel : SchedMachineModel {
   let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
   let MicroOpBufferSize = 45; // Based on NEON renamed registers.
   let LoadLatency = 3;
   let MispredictPenalty = 14; // A branch direction mispredict.
-
-  let Itineraries = SwiftItineraries;
 }
 
 // Swift predicates.
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 31b65f2bfec4..3180480986d6 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -319,8 +319,19 @@ bool ARMSubtarget::hasSinCos() const {
   return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
 }
 
+bool ARMSubtarget::enableMachineScheduler() const {
+  // Run the MachineScheduler before register allocation on out-of-order cores,
+  // where the PostRA scheduler is turned off (see enablePostRAScheduler()). For
+  // now this is restricted to swift.
+  return getSchedModel().isOutOfOrder() && isSwift();
+}
+
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {
+  // PostRA scheduling targets instruction latencies, which is a poor fit for
+  // out-of-order CPUs; skip it there (for now restricted to swift).
+  if (getSchedModel().isOutOfOrder() && isSwift())
+    return false;
   return (!isThumb() || hasThumb2());
 }
 
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 75425890a283..4f9bc372e4b1 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -433,6 +433,9 @@ public:
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
 
+  /// Returns true if the pre-RA MachineScheduler pass should be enabled.
+  bool enableMachineScheduler() const override;
+
   /// True for some subtargets at > -O0.
   bool enablePostRAScheduler() const override;
 
diff --git a/llvm/test/CodeGen/ARM/adv-copy-opt.ll b/llvm/test/CodeGen/ARM/adv-copy-opt.ll
index f71bf78b62c4..395be3457203 100644
--- a/llvm/test/CodeGen/ARM/adv-copy-opt.ll
+++ b/llvm/test/CodeGen/ARM/adv-copy-opt.ll
@@ -11,25 +11,25 @@
 ; r0 = r0 / r2
 ; r1 = r1 / r3
 ;
-; NOOPT: vmov	[[B:d[0-9]+]], r2, r3
-; NOOPT-NEXT: vmov	[[A:d[0-9]+]], r0, r1
+; NOOPT: vmov	[[A:d[0-9]+]], r0, r1
+; NOOPT-NEXT: vmov	[[B:d[0-9]+]], r2, r3
 ; Move the low part of B into a register.
 ; Unfortunately, we cannot express that the 's' register is the low
 ; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
 ; NOOPT-NEXT: vmov	[[B_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: vmov	[[A_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: udiv	[[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
 ; NOOPT-NEXT: vmov	[[B_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov	[[A_LOW:r[0-9]+]], s{{[0-9]+}}
 ; NOOPT-NEXT: vmov	[[A_HIGH:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: udiv	[[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
+; NOOPT-NEXT: udiv	[[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
 ; NOOPT-NEXT: vmov.32	[[RES:d[0-9]+]][0], [[RES_LOW]]
+; NOOPT-NEXT: udiv	[[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
 ; NOOPT-NEXT: vmov.32	[[RES]][1], [[RES_HIGH]]
 ; NOOPT-NEXT: vmov	r0, r1, [[RES]]
 ; NOOPT-NEXT: bx	lr
 ;
 ; OPT-NOT: vmov
-; OPT: 	udiv	r0, r0, r2
-; OPT-NEXT: udiv	r1, r1, r3
+; OPT: udiv	r1, r1, r3
+; OPT-NEXT: 	udiv	r0, r0, r2
 ; OPT-NEXT: bx	lr
 define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
 entry:
diff --git a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index c3de07e03b6b..79e8e68e2f57 100644
--- a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift     | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift     | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
 ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
 ; dependency) when it isn't dependent on last CPSR defining instruction.
 ; rdar://8928208
@@ -7,8 +7,10 @@
 define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
  entry:
 ; CHECK-LABEL: t1:
-; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
-; CHECK-NEXT: mul  [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
+; CHECK-CORTEX-NEXT: mul  [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-SWIFT: muls  [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
 ; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
   %0 = mul nsw i32 %a, %b
   %1 = mul nsw i32 %c, %d
@@ -21,8 +23,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
 define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
 entry:
 ; CHECK-LABEL: t2:
-  %tobool7 = icmp eq i32* %ptr2, null
-  br i1 %tobool7, label %while.end, label %while.body
+  br label %while.body
 
 while.body:
 ; CHECK: while.body
@@ -55,8 +56,7 @@ while.end:
 define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
 entry:
 ; CHECK-LABEL: t3:
-  %tobool7 = icmp eq i32* %ptr2, null
-  br i1 %tobool7, label %while.end, label %while.body
+  br label %while.body
 
 while.body:
 ; CHECK: while.body
diff --git a/llvm/test/CodeGen/ARM/cmpxchg-idioms.ll b/llvm/test/CodeGen/ARM/cmpxchg-idioms.ll
index fb88575cab3b..81e05acfef79 100644
--- a/llvm/test/CodeGen/ARM/cmpxchg-idioms.ll
+++ b/llvm/test/CodeGen/ARM/cmpxchg-idioms.ll
@@ -15,14 +15,14 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
 ; CHECK: bne [[LOOP]]
 
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: movs r0, #1
 ; CHECK: dmb ish
+; CHECK: movs r0, #1
 ; CHECK: bx lr
 
 ; CHECK: [[FAILED]]:
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: movs r0, #0
 ; CHECK: dmb ish
+; CHECK: movs r0, #0
 ; CHECK: bx lr
 
   %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,8 +34,8 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
 define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
 ; CHECK-LABEL: test_return_bool:
 
-; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
 ; CHECK: dmb ishst
+; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
 
 ; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
diff --git a/llvm/test/CodeGen/ARM/test-sharedidx.ll b/llvm/test/CodeGen/ARM/test-sharedidx.ll
index 377996c4c3c8..77d0f30485df 100644
--- a/llvm/test/CodeGen/ARM/test-sharedidx.ll
+++ b/llvm/test/CodeGen/ARM/test-sharedidx.ll
@@ -20,8 +20,8 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.body.3
 ; CHECK: %for.body
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
   %0 = load i8, i8* %arrayidx, align 1
@@ -42,8 +42,8 @@ for.end:                                          ; preds = %for.body, %for.body
 
 for.body.1:                                       ; preds = %for.body
 ; CHECK: %for.body.1
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
   %2 = load i8, i8* %arrayidx.1, align 1
   %conv6.1 = zext i8 %2 to i32
@@ -60,8 +60,8 @@ for.body.1:                                       ; preds = %for.body
 
 for.body.2:                                       ; preds = %for.body.1
 ; CHECK: %for.body.2
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
   %4 = load i8, i8* %arrayidx.2, align 1
   %conv6.2 = zext i8 %4 to i32
@@ -78,8 +78,8 @@ for.body.2:                                       ; preds = %for.body.1
 
 for.body.3:                                       ; preds = %for.body.2
 ; CHECK: %for.body.3
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
   %6 = load i8, i8* %arrayidx.3, align 1
   %conv6.3 = zext i8 %6 to i32
diff --git a/llvm/test/CodeGen/ARM/vector-load.ll b/llvm/test/CodeGen/ARM/vector-load.ll
index 17f134f458a2..a638c2bdb9be 100644
--- a/llvm/test/CodeGen/ARM/vector-load.ll
+++ b/llvm/test/CodeGen/ARM/vector-load.ll
@@ -238,12 +238,12 @@ define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) {
 
 define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
 ;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
-;CHECK: ldr.w   r[[PTRREG:[0-9]+]], [r0]
+;CHECK: ldr   r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
 ;CHECK: add.w   r[[INCREG:[0-9]+]], r[[PTRREG]], #16
-;CHECK: str.w   r[[INCREG]], [r0]
 ;CHECK: vmovl.u8        {{q[0-9]+}}, {{d[0-9]+}}
 ;CHECK: vmovl.u16       {{q[0-9]+}}, {{d[0-9]+}}
+;CHECK: str   r[[INCREG]], [r0]
 	%A = load <4 x i8>*, <4 x i8>** %ptr
 	%lA = load <4 x i8>, <4 x i8>* %A, align 4
 	%inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4
diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll
index 30baa9a20ddc..161bbf1d0fde 100644
--- a/llvm/test/CodeGen/ARM/vector-store.ll
+++ b/llvm/test/CodeGen/ARM/vector-store.ll
@@ -228,9 +228,9 @@ define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) {
 ;CHECK: ldr.w   r9, [sp]
 ;CHECK: vmov    {{d[0-9]+}}, r3, r9
 ;CHECK: vmov    {{d[0-9]+}}, r1, r2
+;CHECK: ldr     r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vmovn.i32       [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8  [[VECLO]], {{d[0-9]+}}
-;CHECK: ldr     r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
 	%A = load <4 x i8>*, <4 x i8>** %ptr
         %trunc = trunc <4 x i32> %val to <4 x i8>
@@ -243,10 +243,10 @@ define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val)
 ;CHECK: ldr.w   r9, [sp]
 ;CHECK: vmov    {{d[0-9]+}}, r3, r9
 ;CHECK: vmov    {{d[0-9]+}}, r1, r2
-;CHECK: movs    [[IMM16:r[0-9]+]], #16
+;CHECK: ldr     r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vmovn.i32       [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8  [[VECLO]], {{d[0-9]+}}
-;CHECK: ldr     r[[PTRREG:[0-9]+]], [r0]
+;CHECK: movs    [[IMM16:r[0-9]+]], #16
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
 ;CHECK: str     r[[PTRREG]], [r0]
 	%A = load <4 x i8>*, <4 x i8>** %ptr