ARM64: add patterns for more lane-wise ld1/st1 operations.

llvm-svn: 205294
2014-04-01 10:37:09 +00:00 · 2014-04-01 10:37:09 +00:00 · ff179ba3d3
parent d8d613b979
commit ff179ba3d3
4 changed files with 268 additions and 139 deletions
--- a/llvm/lib/Target/ARM64/ARM64InstrFormats.td
+++ b/llvm/lib/Target/ARM64/ARM64InstrFormats.td
@ -7971,8 +7971,7 @@ multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
 }
 let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
 multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
-                         RegisterOperand listtype,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
  def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
                            (outs listtype:$dst),
                            (ins listtype:$Vt, VectorIndexD:$idx,
@ -7985,12 +7984,10 @@ multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
  def i8 : SIMDLdStSingleB<0, R, opcode, asm,
                           (outs), (ins listtype:$Vt, VectorIndexB:$idx,
-                                        am_simdnoindex:$vaddr),
-                           pattern>;
+                                        am_simdnoindex:$vaddr), []>;

  def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
                            (outs), (ins listtype:$Vt, VectorIndexB:$idx,
@ -7998,12 +7995,10 @@ multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
  def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
                            (outs), (ins listtype:$Vt, VectorIndexH:$idx,
-                                         am_simdnoindex:$vaddr),
-                        pattern>;
+                                         am_simdnoindex:$vaddr), []>;

  def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
                            (outs), (ins listtype:$Vt, VectorIndexH:$idx,
@ -8011,12 +8006,10 @@ multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
  def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
                            (outs), (ins listtype:$Vt, VectorIndexS:$idx,
-                                         am_simdnoindex:$vaddr),
-                            pattern>;
+                                         am_simdnoindex:$vaddr), []>;

  def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
                            (outs), (ins listtype:$Vt, VectorIndexS:$idx,
@ -8024,11 +8017,10 @@ multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
  def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
                            (outs), (ins listtype:$Vt, VectorIndexD:$idx,
-                                         am_simdnoindex:$vaddr), pattern>;
+                                         am_simdnoindex:$vaddr), []>;

  def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
                            (outs), (ins listtype:$Vt, VectorIndexD:$idx,
--- a/llvm/lib/Target/ARM64/ARM64InstrInfo.td
+++ b/llvm/lib/Target/ARM64/ARM64InstrInfo.td
@ -4087,18 +4087,32 @@ def : Pat<(v2f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))),
 def : Pat<(v1f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))),
          (LD1Rv1d am_simdnoindex:$vaddr)>;

-def : Pat<(vector_insert (v16i8 VecListOne128:$Rd),
-            (i32 (extloadi8 am_simdnoindex:$vaddr)), VectorIndexB:$idx),
-          (LD1i8 VecListOne128:$Rd, VectorIndexB:$idx, am_simdnoindex:$vaddr)>;
-def : Pat<(vector_insert (v8i16 VecListOne128:$Rd),
-            (i32 (extloadi16 am_simdnoindex:$vaddr)), VectorIndexH:$idx),
-          (LD1i16 VecListOne128:$Rd, VectorIndexH:$idx, am_simdnoindex:$vaddr)>;
-def : Pat<(vector_insert (v4i32 VecListOne128:$Rd),
-            (i32 (load am_simdnoindex:$vaddr)), VectorIndexS:$idx),
-          (LD1i32 VecListOne128:$Rd, VectorIndexS:$idx, am_simdnoindex:$vaddr)>;
-def : Pat<(vector_insert (v2i64 VecListOne128:$Rd),
-            (i64 (load am_simdnoindex:$vaddr)), VectorIndexD:$idx),
-          (LD1i64 VecListOne128:$Rd, VectorIndexD:$idx, am_simdnoindex:$vaddr)>;
+class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne128:$Rd),
+           (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
+        (LD1 VecListOne128:$Rd, VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : Ld1Lane128Pat<extloadi8,  VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
+
+class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne64:$Rd),
+           (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
+        (EXTRACT_SUBREG
+            (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+                          VecIndex:$idx, am_simdnoindex:$vaddr),
+            dsub)>;
+
+def : Ld1Lane64Pat<extloadi8,  VectorIndexB, v8i8,  i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;


 defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
@ -4107,38 +4121,53 @@ defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
 defm LD4 : SIMDLdSt4SingleAliases<"ld4">;

 // Stores
-let AddedComplexity = 8 in {
-defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb,
-  [(truncstorei8
-      (i32 (vector_extract (v16i8 VecListOneb:$Vt), VectorIndexB:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi1>;
-defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh,
-  [(truncstorei16
-      (i32 (vector_extract (v8i16 VecListOneh:$Vt), VectorIndexH:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi2>;
-defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes,
-  [(store
-      (i32 (vector_extract (v4i32 VecListOnes:$Vt), VectorIndexS:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi4>;
-defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned,
-  [(store
-      (i64 (vector_extract (v2i64 VecListOned:$Vt), VectorIndexD:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi8>;
-}
+defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+let AddedComplexity = 8 in
+class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+             am_simdnoindex:$vaddr),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : St1Lane128Pat<truncstorei8,  VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store,         VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
+
+let AddedComplexity = 8 in
+class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+             am_simdnoindex:$vaddr),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : St1Lane64Pat<truncstorei8,  VectorIndexB, v8i8, i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;

 let mayStore = 1, neverHasSideEffects = 1 in {
-defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   [], GPR64pi2>;
-defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   [], GPR64pi4>;
-defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   [], GPR64pi8>;
-defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   [], GPR64pi16>;
-defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, [], GPR64pi3>;
-defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, [], GPR64pi6>;
-defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, [], GPR64pi12>;
-defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, [], GPR64pi24>;
-defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  [], GPR64pi4>;
-defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  [], GPR64pi8>;
-defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  [], GPR64pi16>;
-defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  [], GPR64pi32>;
+defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  GPR64pi32>;
 }

 defm ST1 : SIMDLdSt1SingleAliases<"st1">;
--- a/llvm/test/CodeGen/ARM64/ld1.ll
+++ b/llvm/test/CodeGen/ARM64/ld1.ll
@ -5,7 +5,7 @@
 %struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>, <8 x i8>,  <8 x i8> }

 define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
-; CHECK: ld2_8b
+; CHECK-LABEL: ld2_8b
 ; Make sure we are loading into the results defined by the ABI (i.e., v0, v1)
 ; and from the argument of the function also defined by ABI (i.e., x0)
 ; CHECK ld2.8b { v0, v1 }, [x0]
@ -15,7 +15,7 @@ define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
 }

 define %struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind {
-; CHECK: ld3_8b
+; CHECK-LABEL: ld3_8b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.8b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -24,7 +24,7 @@ define %struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind {
 }

 define %struct.__neon_int8x8x4_t @ld4_8b(i8* %A) nounwind {
-; CHECK: ld4_8b
+; CHECK-LABEL: ld4_8b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.8b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -41,7 +41,7 @@ declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) nounwind r
 %struct.__neon_int8x16x4_t = type { <16 x i8>,  <16 x i8>, <16 x i8>,  <16 x i8> }

 define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind {
-; CHECK: ld2_16b
+; CHECK-LABEL: ld2_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.16b { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -50,7 +50,7 @@ define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind {
 }

 define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind {
-; CHECK: ld3_16b
+; CHECK-LABEL: ld3_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.16b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -59,7 +59,7 @@ define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind {
 }

 define %struct.__neon_int8x16x4_t @ld4_16b(i8* %A) nounwind {
-; CHECK: ld4_16b
+; CHECK-LABEL: ld4_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.16b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -76,7 +76,7 @@ declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) nounwind
 %struct.__neon_int16x4x4_t = type { <4 x i16>,  <4 x i16>, <4 x i16>,  <4 x i16> }

 define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind {
-; CHECK: ld2_4h
+; CHECK-LABEL: ld2_4h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.4h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -85,7 +85,7 @@ define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind {
 }

 define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind {
-; CHECK: ld3_4h
+; CHECK-LABEL: ld3_4h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.4h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -94,7 +94,7 @@ define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind {
 }

 define %struct.__neon_int16x4x4_t @ld4_4h(i16* %A) nounwind {
-; CHECK: ld4_4h
+; CHECK-LABEL: ld4_4h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.4h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -111,7 +111,7 @@ declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) nounwi
 %struct.__neon_int16x8x4_t = type { <8 x i16>,  <8 x i16>, <8 x i16>,  <8 x i16> }

 define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind {
-; CHECK: ld2_8h
+; CHECK-LABEL: ld2_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.8h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -120,7 +120,7 @@ define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind {
 }

 define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind {
-; CHECK: ld3_8h
+; CHECK-LABEL: ld3_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.8h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -129,7 +129,7 @@ define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind {
 }

 define %struct.__neon_int16x8x4_t @ld4_8h(i16* %A) nounwind {
-; CHECK: ld4_8h
+; CHECK-LABEL: ld4_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.8h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -146,7 +146,7 @@ declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) nounwi
 %struct.__neon_int32x2x4_t = type { <2 x i32>,  <2 x i32>, <2 x i32>,  <2 x i32> }

 define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind {
-; CHECK: ld2_2s
+; CHECK-LABEL: ld2_2s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.2s { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -155,7 +155,7 @@ define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind {
 }

 define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind {
-; CHECK: ld3_2s
+; CHECK-LABEL: ld3_2s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.2s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -164,7 +164,7 @@ define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind {
 }

 define %struct.__neon_int32x2x4_t @ld4_2s(i32* %A) nounwind {
-; CHECK: ld4_2s
+; CHECK-LABEL: ld4_2s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.2s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -181,7 +181,7 @@ declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) nounwi
 %struct.__neon_int32x4x4_t = type { <4 x i32>,  <4 x i32>, <4 x i32>,  <4 x i32> }

 define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind {
-; CHECK: ld2_4s
+; CHECK-LABEL: ld2_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.4s { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -190,7 +190,7 @@ define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind {
 }

 define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind {
-; CHECK: ld3_4s
+; CHECK-LABEL: ld3_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.4s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -199,7 +199,7 @@ define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind {
 }

 define %struct.__neon_int32x4x4_t @ld4_4s(i32* %A) nounwind {
-; CHECK: ld4_4s
+; CHECK-LABEL: ld4_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.4s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -216,7 +216,7 @@ declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) nounwi
 %struct.__neon_int64x2x4_t = type { <2 x i64>,  <2 x i64>, <2 x i64>,  <2 x i64> }

 define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind {
-; CHECK: ld2_2d
+; CHECK-LABEL: ld2_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.2d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -225,7 +225,7 @@ define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind {
 }

 define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind {
-; CHECK: ld3_2d
+; CHECK-LABEL: ld3_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.2d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -234,7 +234,7 @@ define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind {
 }

 define %struct.__neon_int64x2x4_t @ld4_2d(i64* %A) nounwind {
-; CHECK: ld4_2d
+; CHECK-LABEL: ld4_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.2d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -252,7 +252,7 @@ declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) nounwi


 define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind {
-; CHECK: ld2_1di64
+; CHECK-LABEL: ld2_1di64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -261,7 +261,7 @@ define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind {
 }

 define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind {
-; CHECK: ld3_1di64
+; CHECK-LABEL: ld3_1di64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -270,7 +270,7 @@ define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind {
 }

 define %struct.__neon_int64x1x4_t @ld4_1di64(i64* %A) nounwind {
-; CHECK: ld4_1di64
+; CHECK-LABEL: ld4_1di64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -289,7 +289,7 @@ declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) nounwi


 define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind {
-; CHECK: ld2_1df64
+; CHECK-LABEL: ld2_1df64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@ -298,7 +298,7 @@ define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind {
 }

 define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind {
-; CHECK: ld3_1df64
+; CHECK-LABEL: ld3_1df64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@ -307,7 +307,7 @@ define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind {
 }

 define %struct.__neon_float64x1x4_t @ld4_1df64(double* %A) nounwind {
-; CHECK: ld4_1df64
+; CHECK-LABEL: ld4_1df64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@ -800,7 +800,7 @@ declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounw
 declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly

 define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
-; CHECK: ld1_16b
+; CHECK-LABEL: ld1_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.b { v0 }[0], [x0]
 ; CHECK-NEXT ret
@ -810,7 +810,7 @@ define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
 }

 define <8 x i16> @ld1_8h(<8 x i16> %V, i16* %bar) {
-; CHECK: ld1_8h
+; CHECK-LABEL: ld1_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.h { v0 }[0], [x0]
 ; CHECK-NEXT ret
@ -820,7 +820,7 @@ define <8 x i16> @ld1_8h(<8 x i16> %V, i16* %bar) {
 }

 define <4 x i32> @ld1_4s(<4 x i32> %V, i32* %bar) {
-; CHECK: ld1_4s
+; CHECK-LABEL: ld1_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.s { v0 }[0], [x0]
 ; CHECK-NEXT ret
@ -829,8 +829,18 @@ define <4 x i32> @ld1_4s(<4 x i32> %V, i32* %bar) {
  ret <4 x i32> %tmp2
 }

+define <4 x float> @ld1_4s_float(<4 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_4s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <4 x float> %V, float %tmp1, i32 0
+  ret <4 x float> %tmp2
+}
+
 define <2 x i64> @ld1_2d(<2 x i64> %V, i64* %bar) {
-; CHECK: ld1_2d
+; CHECK-LABEL: ld1_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.d { v0 }[0], [x0]
 ; CHECK-NEXT ret
@ -839,8 +849,18 @@ define <2 x i64> @ld1_2d(<2 x i64> %V, i64* %bar) {
  ret <2 x i64> %tmp2
 }

+define <2 x double> @ld1_2d_double(<2 x double> %V, double* %bar) {
+; CHECK-LABEL: ld1_2d_double:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load double* %bar
+  %tmp2 = insertelement <2 x double> %V, double %tmp1, i32 0
+  ret <2 x double> %tmp2
+}
+
 define <1 x i64> @ld1_1d(<1 x i64>* %p) {
-; CHECK: ld1_1d
+; CHECK-LABEL: ld1_1d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ldr [[REG:d[0-9]+]], [x0]
 ; CHECK-NEXT: ret
@ -848,6 +868,46 @@ define <1 x i64> @ld1_1d(<1 x i64>* %p) {
  ret <1 x i64> %tmp
 }

+define <8 x i8> @ld1_8b(<8 x i8> %V, i8* %bar) {
+; CHECK-LABEL: ld1_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <8 x i8> %V, i8 %tmp1, i32 0
+  ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @ld1_4h(<4 x i16> %V, i16* %bar) {
+; CHECK-LABEL: ld1_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <4 x i16> %V, i16 %tmp1, i32 0
+  ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @ld1_2s(<2 x i32> %V, i32* %bar) {
+; CHECK-LABEL: ld1_2s:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <2 x i32> %V, i32 %tmp1, i32 0
+  ret <2 x i32> %tmp2
+}
+
+define <2 x float> @ld1_2s_float(<2 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_2s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <2 x float> %V, float %tmp1, i32 0
+  ret <2 x float> %tmp2
+}
+

 ; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
 define void @ld1r_2s_from_dup(i8* nocapture %a, i8* nocapture %b, i16* nocapture %diff) nounwind ssp {
@ -882,7 +942,7 @@ entry:
 ; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
 define <4 x float> @ld1r_4s_float(float* nocapture %x) {
 entry:
-; CHECK: ld1r_4s_float
+; CHECK-LABEL: ld1r_4s_float
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.4s { v0 }, [x0]
 ; CHECK-NEXT ret
@ -896,7 +956,7 @@ entry:

 define <2 x float> @ld1r_2s_float(float* nocapture %x) {
 entry:
-; CHECK: ld1r_2s_float
+; CHECK-LABEL: ld1r_2s_float
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2s { v0 }, [x0]
 ; CHECK-NEXT ret
@ -908,7 +968,7 @@ entry:

 define <2 x double> @ld1r_2d_double(double* nocapture %x) {
 entry:
-; CHECK: ld1r_2d_double
+; CHECK-LABEL: ld1r_2d_double
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2d { v0 }, [x0]
 ; CHECK-NEXT ret
@ -920,7 +980,7 @@ entry:

 define <1 x double> @ld1r_1d_double(double* nocapture %x) {
 entry:
-; CHECK: ld1r_1d_double
+; CHECK-LABEL: ld1r_1d_double
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ldr d0, [x0]
 ; CHECK-NEXT ret
@ -931,7 +991,7 @@ entry:

 define <4 x float> @ld1r_4s_float_shuff(float* nocapture %x) {
 entry:
-; CHECK: ld1r_4s_float_shuff
+; CHECK-LABEL: ld1r_4s_float_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.4s { v0 }, [x0]
 ; CHECK-NEXT ret
@ -943,7 +1003,7 @@ entry:

 define <2 x float> @ld1r_2s_float_shuff(float* nocapture %x) {
 entry:
-; CHECK: ld1r_2s_float_shuff
+; CHECK-LABEL: ld1r_2s_float_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2s { v0 }, [x0]
 ; CHECK-NEXT ret
@ -955,7 +1015,7 @@ entry:

 define <2 x double> @ld1r_2d_double_shuff(double* nocapture %x) {
 entry:
-; CHECK: ld1r_2d_double_shuff
+; CHECK-LABEL: ld1r_2d_double_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2d { v0 }, [x0]
 ; CHECK-NEXT ret
@ -967,7 +1027,7 @@ entry:

 define <1 x double> @ld1r_1d_double_shuff(double* nocapture %x) {
 entry:
-; CHECK: ld1r_1d_double_shuff
+; CHECK-LABEL: ld1r_1d_double_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ldr d0, [x0]
 ; CHECK-NEXT ret
--- a/llvm/test/CodeGen/ARM64/st1.ll
+++ b/llvm/test/CodeGen/ARM64/st1.ll
@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s

 define void @st1lane_16b(<16 x i8> %A, i8* %D) {
-; CHECK: st1lane_16b
+; CHECK-LABEL: st1lane_16b
 ; CHECK: st1.b
  %tmp = extractelement <16 x i8> %A, i32 1
  store i8 %tmp, i8* %D
@ -9,7 +9,7 @@ define void @st1lane_16b(<16 x i8> %A, i8* %D) {
 }

 define void @st1lane_8h(<8 x i16> %A, i16* %D) {
-; CHECK: st1lane_8h
+; CHECK-LABEL: st1lane_8h
 ; CHECK: st1.h
  %tmp = extractelement <8 x i16> %A, i32 1
  store i16 %tmp, i16* %D
@ -17,44 +17,92 @@ define void @st1lane_8h(<8 x i16> %A, i16* %D) {
 }

 define void @st1lane_4s(<4 x i32> %A, i32* %D) {
-; CHECK: st1lane_4s
+; CHECK-LABEL: st1lane_4s
 ; CHECK: st1.s
  %tmp = extractelement <4 x i32> %A, i32 1
  store i32 %tmp, i32* %D
  ret void
 }

+define void @st1lane_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_4s_float
+; CHECK: st1.s
+  %tmp = extractelement <4 x float> %A, i32 1
+  store float %tmp, float* %D
+  ret void
+}
+
 define void @st1lane_2d(<2 x i64> %A, i64* %D) {
-; CHECK: st1lane_2d
+; CHECK-LABEL: st1lane_2d
 ; CHECK: st1.d
  %tmp = extractelement <2 x i64> %A, i32 1
  store i64 %tmp, i64* %D
  ret void
 }

+define void @st1lane_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane_2d_double
+; CHECK: st1.d
+  %tmp = extractelement <2 x double> %A, i32 1
+  store double %tmp, double* %D
+  ret void
+}
+
+define void @st1lane_8b(<8 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_8b
+; CHECK: st1.b
+  %tmp = extractelement <8 x i8> %A, i32 1
+  store i8 %tmp, i8* %D
+  ret void
+}
+
+define void @st1lane_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_4h
+; CHECK: st1.h
+  %tmp = extractelement <4 x i16> %A, i32 1
+  store i16 %tmp, i16* %D
+  ret void
+}
+
+define void @st1lane_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_2s
+; CHECK: st1.s
+  %tmp = extractelement <2 x i32> %A, i32 1
+  store i32 %tmp, i32* %D
+  ret void
+}
+
+define void @st1lane_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_2s_float
+; CHECK: st1.s
+  %tmp = extractelement <2 x float> %A, i32 1
+  store float %tmp, float* %D
+  ret void
+}
+
 define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
-; CHECK: st2lane_16b
+; CHECK-LABEL: st2lane_16b
 ; CHECK: st2.b
  call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
  ret void
 }

 define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
-; CHECK: st2lane_8h
+; CHECK-LABEL: st2lane_8h
 ; CHECK: st2.h
  call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
  ret void
 }

 define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
-; CHECK: st2lane_4s
+; CHECK-LABEL: st2lane_4s
 ; CHECK: st2.s
  call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
  ret void
 }

 define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
-; CHECK: st2lane_2d
+; CHECK-LABEL: st2lane_2d
 ; CHECK: st2.d
  call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
  ret void
@ -66,28 +114,28 @@ declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32
 declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone

 define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
-; CHECK: st3lane_16b
+; CHECK-LABEL: st3lane_16b
 ; CHECK: st3.b
  call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
  ret void
 }

 define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
-; CHECK: st3lane_8h
+; CHECK-LABEL: st3lane_8h
 ; CHECK: st3.h
  call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
  ret void
 }

 define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
-; CHECK: st3lane_4s
+; CHECK-LABEL: st3lane_4s
 ; CHECK: st3.s
  call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
  ret void
 }

 define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
-; CHECK: st3lane_2d
+; CHECK-LABEL: st3lane_2d
 ; CHECK: st3.d
  call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
  ret void
@ -99,28 +147,28 @@ declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32
 declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone

 define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
-; CHECK: st4lane_16b
+; CHECK-LABEL: st4lane_16b
 ; CHECK: st4.b
  call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
  ret void
 }

 define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
-; CHECK: st4lane_8h
+; CHECK-LABEL: st4lane_8h
 ; CHECK: st4.h
  call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
  ret void
 }

 define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
-; CHECK: st4lane_4s
+; CHECK-LABEL: st4lane_4s
 ; CHECK: st4.s
  call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
  ret void
 }

 define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
-; CHECK: st4lane_2d
+; CHECK-LABEL: st4lane_2d
 ; CHECK: st4.d
  call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
  ret void
@ -133,21 +181,21 @@ declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64


 define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
-; CHECK: st2_8b
+; CHECK-LABEL: st2_8b
 ; CHECK st2.8b
 	call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
 	ret void
 }

 define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
-; CHECK: st3_8b
+; CHECK-LABEL: st3_8b
 ; CHECK st3.8b
 	call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
 	ret void
 }

 define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
-; CHECK: st4_8b
+; CHECK-LABEL: st4_8b
 ; CHECK st4.8b
 	call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
 	ret void
@ -158,21 +206,21 @@ declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) n
 declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly

 define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
-; CHECK: st2_16b
+; CHECK-LABEL: st2_16b
 ; CHECK st2.16b
 	call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
 	ret void
 }

 define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
-; CHECK: st3_16b
+; CHECK-LABEL: st3_16b
 ; CHECK st3.16b
 	call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
 	ret void
 }

 define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
-; CHECK: st4_16b
+; CHECK-LABEL: st4_16b
 ; CHECK st4.16b
 	call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
 	ret void
@ -183,21 +231,21 @@ declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8
 declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly

 define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
-; CHECK: st2_4h
+; CHECK-LABEL: st2_4h
 ; CHECK st2.4h
 	call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
 	ret void
 }

 define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
-; CHECK: st3_4h
+; CHECK-LABEL: st3_4h
 ; CHECK st3.4h
 	call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
 	ret void
 }

 define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
-; CHECK: st4_4h
+; CHECK-LABEL: st4_4h
 ; CHECK st4.4h
 	call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
 	ret void
@ -208,21 +256,21 @@ declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i
 declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly

 define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
-; CHECK: st2_8h
+; CHECK-LABEL: st2_8h
 ; CHECK st2.8h
 	call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
 	ret void
 }

 define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
-; CHECK: st3_8h
+; CHECK-LABEL: st3_8h
 ; CHECK st3.8h
 	call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
 	ret void
 }

 define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
-; CHECK: st4_8h
+; CHECK-LABEL: st4_8h
 ; CHECK st4.8h
 	call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
 	ret void
@ -233,21 +281,21 @@ declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i
 declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly

 define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
-; CHECK: st2_2s
+; CHECK-LABEL: st2_2s
 ; CHECK st2.2s
 	call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
 	ret void
 }

 define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
-; CHECK: st3_2s
+; CHECK-LABEL: st3_2s
 ; CHECK st3.2s
 	call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
 	ret void
 }

 define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
-; CHECK: st4_2s
+; CHECK-LABEL: st4_2s
 ; CHECK st4.2s
 	call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
 	ret void
@ -258,21 +306,21 @@ declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i
 declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly

 define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
-; CHECK: st2_4s
+; CHECK-LABEL: st2_4s
 ; CHECK st2.4s
 	call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
 	ret void
 }

 define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
-; CHECK: st3_4s
+; CHECK-LABEL: st3_4s
 ; CHECK st3.4s
 	call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
 	ret void
 }

 define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
-; CHECK: st4_4s
+; CHECK-LABEL: st4_4s
 ; CHECK st4.4s
 	call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
 	ret void
@ -283,21 +331,21 @@ declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i
 declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly

 define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
-; CHECK: st2_1d
+; CHECK-LABEL: st2_1d
 ; CHECK st1.2d
 	call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
 	ret void
 }

 define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
-; CHECK: st3_1d
+; CHECK-LABEL: st3_1d
 ; CHECK st1.3d
 	call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
 	ret void
 }

 define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
-; CHECK: st4_1d
+; CHECK-LABEL: st4_1d
 ; CHECK st1.4d
 	call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
 	ret void
@ -308,21 +356,21 @@ declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i
 declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly

 define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
-; CHECK: st2_2d
+; CHECK-LABEL: st2_2d
 ; CHECK st2.2d
 	call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
 	ret void
 }

 define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
-; CHECK: st3_2d
+; CHECK-LABEL: st3_2d
 ; CHECK st2.3d
 	call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
 	ret void
 }

 define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
-; CHECK: st4_2d
+; CHECK-LABEL: st4_2d
 ; CHECK st2.4d
 	call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
 	ret void