[ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics
This patch adjusts the following ARM/AArch64 LLVM IR intrinsics: - neon_bfmmla - neon_bfmlalb - neon_bfmlalt so that they take and return bf16 and float types. Previously these intrinsics used <8 x i8> and <4 x i8> vectors (a rudiment from implementation lacking bf16 IR type). The neon_vbfdot[q] intrinsics are adjusted similarly. This change required some additional selection patterns for vbfdot itself and also for vector shuffles (in a previous patch) because of SelectionDAG transformations kicking in and mangling the original code. This patch makes the generated IR cleaner (less useless bitcasts are produced), but it does not affect the final assembly. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D86146
This commit is contained in:
parent
ba852e1e19
commit
ae1396c7d4
|
@ -6241,28 +6241,10 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
|
|||
case NEON::BI__builtin_neon_vbfdot_v:
|
||||
case NEON::BI__builtin_neon_vbfdotq_v: {
|
||||
llvm::Type *InputTy =
|
||||
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
|
||||
llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
|
||||
llvm::Type *Tys[2] = { Ty, InputTy };
|
||||
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
|
||||
}
|
||||
case NEON::BI__builtin_neon_vbfmmlaq_v: {
|
||||
llvm::Type *InputTy =
|
||||
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
|
||||
llvm::Type *Tys[2] = { Ty, InputTy };
|
||||
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla");
|
||||
}
|
||||
case NEON::BI__builtin_neon_vbfmlalbq_v: {
|
||||
llvm::Type *InputTy =
|
||||
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
|
||||
llvm::Type *Tys[2] = { Ty, InputTy };
|
||||
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb");
|
||||
}
|
||||
case NEON::BI__builtin_neon_vbfmlaltq_v: {
|
||||
llvm::Type *InputTy =
|
||||
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
|
||||
llvm::Type *Tys[2] = { Ty, InputTy };
|
||||
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt");
|
||||
}
|
||||
case NEON::BI__builtin_neon___a32_vcvt_bf16_v: {
|
||||
llvm::Type *Tys[1] = { Ty };
|
||||
Function *F = CGM.getIntrinsic(Int, Tys);
|
||||
|
|
|
@ -1,146 +1,138 @@
|
|||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
||||
// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
|
||||
// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck %s
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
// CHECK-LABEL: test_vbfdot_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <4 x bfloat> %b to <8 x i8>
|
||||
// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
||||
// CHECK-NEXT ret <2 x float> %vbfdot1.i
|
||||
// CHECK-LABEL: @test_vbfdot_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]]
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
|
||||
return vbfdot_f32(r, a, b);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfdotq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfdot1.i
|
||||
// CHECK-LABEL: @test_vbfdotq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
|
||||
return vbfdotq_f32(r, a, b);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfdot_lane_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>
|
||||
// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
|
||||
// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>
|
||||
// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
||||
// CHECK-NEXT ret <2 x float> %vbfdot1.i
|
||||
// CHECK-LABEL: @test_vbfdot_lane_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
|
||||
return vbfdot_lane_f32(r, a, b, 0);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfdotq_laneq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>
|
||||
// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>
|
||||
// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
||||
// CHECK-NEXT ret <4 x float> %vbfdot1.i
|
||||
// CHECK-LABEL: @test_vbfdotq_laneq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfdotq_laneq_f32(r, a, b, 3);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfdot_laneq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>
|
||||
// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||
// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>
|
||||
// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
||||
// CHECK-NEXT ret <2 x float> %vbfdot1.i
|
||||
// CHECK-LABEL: @test_vbfdot_laneq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
|
||||
return vbfdot_laneq_f32(r, a, b, 3);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfdotq_lane_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>
|
||||
// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>
|
||||
// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
||||
// CHECK-NEXT ret <4 x float> %vbfdot1.i
|
||||
// CHECK-LABEL: @test_vbfdotq_lane_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
|
||||
return vbfdotq_lane_f32(r, a, b, 0);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfmmlaq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
// CHECK-NEXT %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfmmla1.i
|
||||
// CHECK-LABEL: @test_vbfmmlaq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmmlaq_f32(r, a, b);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfmlalbq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
|
||||
// CHECK-LABEL: @test_vbfmlalbq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlalbq_f32(r, a, b);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfmlaltq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
|
||||
// CHECK-LABEL: @test_vbfmlaltq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlaltq_f32(r, a, b);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfmlalbq_lane_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
|
||||
// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
|
||||
return vbfmlalbq_lane_f32(r, a, b, 0);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfmlalbq_laneq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
|
||||
// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlalbq_laneq_f32(r, a, b, 3);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfmlaltq_lane_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
|
||||
// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
|
||||
return vbfmlaltq_lane_f32(r, a, b, 0);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_vbfmlaltq_laneq_f32
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
|
||||
// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlaltq_laneq_f32(r, a, b, 3);
|
||||
}
|
||||
|
|
|
@ -12,10 +12,8 @@
|
|||
|
||||
// CHECK-LABEL: @test_vbfdot_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
|
||||
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]]
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
|
||||
return vbfdot_f32(r, a, b);
|
||||
|
@ -23,10 +21,8 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
|
|||
|
||||
// CHECK-LABEL: @test_vbfdotq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
|
||||
return vbfdotq_f32(r, a, b);
|
||||
|
@ -36,10 +32,9 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
|
|||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
|
||||
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
|
||||
return vbfdot_lane_f32(r, a, b, 0);
|
||||
|
@ -49,10 +44,9 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
|
|||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfdotq_laneq_f32(r, a, b, 3);
|
||||
|
@ -62,10 +56,9 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
|
|||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
|
||||
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
|
||||
return vbfdot_laneq_f32(r, a, b, 3);
|
||||
|
@ -75,10 +68,9 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
|
|||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
|
||||
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
|
||||
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
|
||||
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
|
||||
//
|
||||
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
|
||||
return vbfdotq_lane_f32(r, a, b, 0);
|
||||
|
@ -86,10 +78,8 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
|
|||
|
||||
// CHECK-LABEL: @test_vbfmmlaq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFMMLA1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]]
|
||||
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmmlaq_f32(r, a, b);
|
||||
|
@ -97,10 +87,8 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
|||
|
||||
// CHECK-LABEL: @test_vbfmlalbq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
|
||||
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlalbq_f32(r, a, b);
|
||||
|
@ -108,10 +96,8 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
|||
|
||||
// CHECK-LABEL: @test_vbfmlaltq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
|
||||
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlaltq_f32(r, a, b);
|
||||
|
@ -120,10 +106,8 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
|||
// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
|
||||
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
|
||||
return vbfmlalbq_lane_f32(r, a, b, 0);
|
||||
|
@ -132,10 +116,8 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
|
|||
// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
|
||||
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlalbq_laneq_f32(r, a, b, 3);
|
||||
|
@ -144,10 +126,8 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
|
|||
// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
|
||||
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
|
||||
return vbfmlaltq_lane_f32(r, a, b, 0);
|
||||
|
@ -156,10 +136,8 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
|
|||
// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
|
||||
// CHECK-NEXT: entry:
|
||||
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
|
||||
// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
|
||||
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
|
||||
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
|
||||
//
|
||||
float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
|
||||
return vbfmlaltq_laneq_f32(r, a, b, 3);
|
||||
|
|
|
@ -184,6 +184,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
|
|||
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
|
||||
[IntrNoMem]>;
|
||||
|
||||
class AdvSIMD_BF16FML_Intrinsic
|
||||
: Intrinsic<[llvm_v4f32_ty],
|
||||
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Arithmetic ops
|
||||
|
@ -466,9 +470,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
|
|||
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
|
||||
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
|
||||
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
|
||||
def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic;
|
||||
def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic;
|
||||
def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;
|
||||
def int_aarch64_neon_bfmmla
|
||||
: Intrinsic<[llvm_v4f32_ty],
|
||||
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
|
||||
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
|
||||
|
||||
|
||||
// v8.6-A Bfloat Intrinsics
|
||||
|
|
|
@ -791,14 +791,17 @@ def int_arm_neon_vcvtbfp2bf
|
|||
: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
|
||||
def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;
|
||||
def int_arm_neon_bfmmla
|
||||
: Intrinsic<[llvm_v4f32_ty],
|
||||
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
class Neon_FML_Intrinsic
|
||||
: Intrinsic<[llvm_anyvector_ty],
|
||||
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
|
||||
[IntrNoMem]>;
|
||||
def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;
|
||||
def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;
|
||||
class Neon_BF16FML_Intrinsic
|
||||
: Intrinsic<[llvm_v4f32_ty],
|
||||
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic;
|
||||
def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic;
|
||||
|
||||
def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
|
|
|
@ -632,6 +632,63 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Changed in 12.0: bfdot accept v4bf16 and v8bf16 instead of v8i8 and v16i8
|
||||
// respectively
|
||||
if ((Name.startswith("arm.neon.bfdot.") ||
|
||||
Name.startswith("aarch64.neon.bfdot.")) &&
|
||||
Name.endswith("i8")) {
|
||||
Intrinsic::ID IID =
|
||||
StringSwitch<Intrinsic::ID>(Name)
|
||||
.Cases("arm.neon.bfdot.v2f32.v8i8",
|
||||
"arm.neon.bfdot.v4f32.v16i8",
|
||||
Intrinsic::arm_neon_bfdot)
|
||||
.Cases("aarch64.neon.bfdot.v2f32.v8i8",
|
||||
"aarch64.neon.bfdot.v4f32.v16i8",
|
||||
Intrinsic::aarch64_neon_bfdot)
|
||||
.Default(Intrinsic::not_intrinsic);
|
||||
if (IID == Intrinsic::not_intrinsic)
|
||||
break;
|
||||
|
||||
size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits();
|
||||
assert((OperandWidth == 64 || OperandWidth == 128) &&
|
||||
"Unexpected operand width");
|
||||
LLVMContext &Ctx = F->getParent()->getContext();
|
||||
std::array<Type *, 2> Tys {{
|
||||
F->getReturnType(),
|
||||
FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)
|
||||
}};
|
||||
NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore
|
||||
// and accept v8bf16 instead of v16i8
|
||||
if ((Name.startswith("arm.neon.bfm") ||
|
||||
Name.startswith("aarch64.neon.bfm")) &&
|
||||
Name.endswith(".v4f32.v16i8")) {
|
||||
Intrinsic::ID IID =
|
||||
StringSwitch<Intrinsic::ID>(Name)
|
||||
.Case("arm.neon.bfmmla.v4f32.v16i8",
|
||||
Intrinsic::arm_neon_bfmmla)
|
||||
.Case("arm.neon.bfmlalb.v4f32.v16i8",
|
||||
Intrinsic::arm_neon_bfmlalb)
|
||||
.Case("arm.neon.bfmlalt.v4f32.v16i8",
|
||||
Intrinsic::arm_neon_bfmlalt)
|
||||
.Case("aarch64.neon.bfmmla.v4f32.v16i8",
|
||||
Intrinsic::aarch64_neon_bfmmla)
|
||||
.Case("aarch64.neon.bfmlalb.v4f32.v16i8",
|
||||
Intrinsic::aarch64_neon_bfmlalb)
|
||||
.Case("aarch64.neon.bfmlalt.v4f32.v16i8",
|
||||
Intrinsic::aarch64_neon_bfmlalt)
|
||||
.Default(Intrinsic::not_intrinsic);
|
||||
if (IID == Intrinsic::not_intrinsic)
|
||||
break;
|
||||
|
||||
std::array<Type *, 0> Tys;
|
||||
NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -3618,6 +3675,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_bfdot:
|
||||
case Intrinsic::arm_neon_bfmmla:
|
||||
case Intrinsic::arm_neon_bfmlalb:
|
||||
case Intrinsic::arm_neon_bfmlalt:
|
||||
case Intrinsic::aarch64_neon_bfdot:
|
||||
case Intrinsic::aarch64_neon_bfmmla:
|
||||
case Intrinsic::aarch64_neon_bfmlalb:
|
||||
case Intrinsic::aarch64_neon_bfmlalt: {
|
||||
SmallVector<Value *, 3> Args;
|
||||
assert(CI->getNumArgOperands() == 3 &&
|
||||
"Mismatch between function args and call args");
|
||||
size_t OperandWidth =
|
||||
CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits();
|
||||
assert((OperandWidth == 64 || OperandWidth == 128) &&
|
||||
"Unexpected operand width");
|
||||
Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16);
|
||||
auto Iter = CI->arg_operands().begin();
|
||||
Args.push_back(*Iter++);
|
||||
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
|
||||
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
|
||||
NewCall = Builder.CreateCall(NewFn, Args);
|
||||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::bitreverse:
|
||||
NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
|
||||
break;
|
||||
|
|
|
@ -7841,9 +7841,9 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
|
|||
|
||||
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
|
||||
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
|
||||
v2f32, v8i8>;
|
||||
v2f32, v4bf16>;
|
||||
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
|
||||
v4f32, v16i8>;
|
||||
v4f32, v8bf16>;
|
||||
}
|
||||
|
||||
class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
|
||||
|
@ -7861,7 +7861,7 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
|
|||
(InputType RegType:$Rn),
|
||||
(InputType (bitconvert (AccumType
|
||||
(AArch64duplane32 (v4f32 V128:$Rm),
|
||||
VectorIndexH:$idx)))))))]> {
|
||||
VectorIndexS:$idx)))))))]> {
|
||||
|
||||
bits<2> idx;
|
||||
let Inst{21} = idx{0}; // L
|
||||
|
@ -7871,16 +7871,16 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
|
|||
multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
|
||||
|
||||
def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
|
||||
".2h", V64, v2f32, v8i8>;
|
||||
".2h", V64, v2f32, v4bf16>;
|
||||
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
|
||||
".2h", V128, v4f32, v16i8>;
|
||||
".2h", V128, v4f32, v8bf16>;
|
||||
}
|
||||
|
||||
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
|
||||
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
|
||||
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
|
||||
(v16i8 V128:$Rn),
|
||||
(v16i8 V128:$Rm)))]> {
|
||||
(v8bf16 V128:$Rn),
|
||||
(v8bf16 V128:$Rm)))]> {
|
||||
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
|
||||
}
|
||||
|
||||
|
@ -7890,10 +7890,10 @@ class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
|
|||
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
|
||||
[(set (v4f32 V128:$dst),
|
||||
(v4f32 (OpNode (v4f32 V128:$Rd),
|
||||
(v16i8 V128:$Rn),
|
||||
(v16i8 (bitconvert (v8bf16
|
||||
(v8bf16 V128:$Rn),
|
||||
(v8bf16
|
||||
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
|
||||
VectorIndexH:$idx)))))))]>,
|
||||
VectorIndexH:$idx)))))]>,
|
||||
Sched<[WriteV]> {
|
||||
bits<5> Rd;
|
||||
bits<5> Rn;
|
||||
|
@ -7917,8 +7917,8 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
|
|||
V128, asm, ".4s",
|
||||
[(set (v4f32 V128:$dst),
|
||||
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
|
||||
(v16i8 V128:$Rn),
|
||||
(v16i8 V128:$Rm)))]> {
|
||||
(v8bf16 V128:$Rn),
|
||||
(v8bf16 V128:$Rm)))]> {
|
||||
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
|
||||
", $Rm", ".8h", "}");
|
||||
}
|
||||
|
|
|
@ -798,6 +798,23 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
|
|||
def BFCVTN : SIMD_BFCVTN;
|
||||
def BFCVTN2 : SIMD_BFCVTN2;
|
||||
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
|
||||
|
||||
// Vector-scalar BFDOT:
|
||||
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
|
||||
// register (the instruction uses a single 32-bit lane from it), so the pattern
|
||||
// is a bit tricky.
|
||||
def : Pat<(v2f32 (int_aarch64_neon_bfdot
|
||||
(v2f32 V64:$Rd), (v4bf16 V64:$Rn),
|
||||
(v4bf16 (bitconvert
|
||||
(v2i32 (AArch64duplane32
|
||||
(v4i32 (bitconvert
|
||||
(v8bf16 (insert_subvector undef,
|
||||
(v4bf16 V64:$Rm),
|
||||
(i64 0))))),
|
||||
VectorIndexS:$idx)))))),
|
||||
(BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
|
||||
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
|
||||
VectorIndexS:$idx)>;
|
||||
}
|
||||
|
||||
// ARMv8.6A AArch64 matrix multiplication
|
||||
|
|
|
@ -9079,11 +9079,11 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
|
|||
(!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
|
||||
}
|
||||
|
||||
def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
|
||||
def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
|
||||
def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>;
|
||||
def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>;
|
||||
|
||||
defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
|
||||
defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
|
||||
defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>;
|
||||
defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
|
||||
|
||||
class BF16MM<bit Q, RegisterClass RegTy,
|
||||
string opc>
|
||||
|
@ -9091,8 +9091,8 @@ class BF16MM<bit Q, RegisterClass RegTy,
|
|||
(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
|
||||
N3RegFrm, IIC_VDOTPROD, "", "",
|
||||
[(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
|
||||
(v16i8 QPR:$Vn),
|
||||
(v16i8 QPR:$Vm)))]> {
|
||||
(v8bf16 QPR:$Vn),
|
||||
(v8bf16 QPR:$Vm)))]> {
|
||||
let Constraints = "$dst = $Vd";
|
||||
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
|
||||
let DecoderNamespace = "VFPV8";
|
||||
|
@ -9106,8 +9106,8 @@ class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
|
|||
NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
|
||||
[(set (v4f32 QPR:$dst),
|
||||
(OpNode (v4f32 QPR:$Vd),
|
||||
(v16i8 QPR:$Vn),
|
||||
(v16i8 QPR:$Vm)))]> {
|
||||
(v8bf16 QPR:$Vn),
|
||||
(v8bf16 QPR:$Vm)))]> {
|
||||
let Constraints = "$dst = $Vd";
|
||||
let DecoderNamespace = "VFPV8";
|
||||
}
|
||||
|
@ -9128,9 +9128,9 @@ multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
|
|||
|
||||
def : Pat<
|
||||
(v4f32 (OpNode (v4f32 QPR:$Vd),
|
||||
(v16i8 QPR:$Vn),
|
||||
(v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
|
||||
VectorIndex16:$lane)))))),
|
||||
(v8bf16 QPR:$Vn),
|
||||
(v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
|
||||
VectorIndex16:$lane)))),
|
||||
(!cast<Instruction>(NAME) QPR:$Vd,
|
||||
QPR:$Vn,
|
||||
(EXTRACT_SUBREG QPR:$Vm,
|
||||
|
|
|
@ -0,0 +1,76 @@
|
|||
; RUN: llvm-dis < %s.bc | FileCheck %s
|
||||
|
||||
; Bitcode was generated from file below
|
||||
|
||||
define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfdot_f32
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
||||
; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
|
||||
; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
|
||||
%vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfdotq_f32
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
%vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfmmlaq_f32
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
ret <4 x float> %vbfmmla1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
}
|
||||
|
||||
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
|
||||
; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
|
@ -0,0 +1,76 @@
|
|||
; RUN: llvm-dis < %s.bc | FileCheck %s
|
||||
|
||||
; Bitcode was generated from file below
|
||||
|
||||
define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfdot_f32
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
||||
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
||||
; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
|
||||
; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfdotq_f32
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfmmlaq_f32
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
ret <4 x float> %vbfmmla1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
|
||||
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
|
||||
; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
}
|
||||
|
||||
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
|
||||
; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
|
@ -7,10 +7,8 @@ define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat
|
|||
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
||||
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b)
|
||||
ret <2 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -19,24 +17,22 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
|
|||
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <4 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; CHECK-LABEL: test_vbfdot_lane_f32:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: bfdot v0.2s, v1.4h, v2.2h[0]
|
||||
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
|
||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
||||
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
|
||||
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
|
||||
ret <2 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -45,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
|
|||
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
||||
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
|
||||
ret <4 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -59,26 +54,25 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
|
|||
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
||||
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
|
||||
ret <2 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; CHECK-LABEL: test_vbfdotq_lane_f32:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: bfdot v0.4s, v1.8h, v2.2h[0]
|
||||
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
|
||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
||||
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
|
||||
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
|
||||
ret <4 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -87,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
|
|||
; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmmla1.i
|
||||
%vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <4 x float> %vbfmmlaq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -99,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||
; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <4 x float> %vbfmlalbq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -111,23 +101,20 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||
; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <4 x float> %vbfmlaltq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; CHECK-LABEL: test_vbfmlalbq_lane_f32:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0]
|
||||
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||
; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlalbq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -137,23 +124,20 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlalbq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; CHECK-LABEL: test_vbfmlaltq_lane_f32:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0]
|
||||
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||
; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlaltq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -163,14 +147,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlaltq_v3.i
|
||||
}
|
||||
|
||||
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
|
||||
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
|
||||
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
|
|
|
@ -7,10 +7,8 @@ define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat>
|
|||
; CHECK-NEXT: vdot.bf16 d0, d1, d2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%1 = bitcast <4 x bfloat> %b to <8 x i8>
|
||||
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3
|
||||
ret <2 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -19,10 +17,8 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
|
|||
; CHECK-NEXT: vdot.bf16 q0, q1, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) #3
|
||||
ret <4 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
|
@ -31,12 +27,11 @@ define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x b
|
|||
; CHECK-NEXT: vdot.bf16 d0, d1, d2[0]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
|
||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
||||
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
|
||||
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
|
||||
ret <2 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -46,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
|
|||
; CHECK-NEXT: vdot.bf16 q0, q1, q8
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
||||
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
|
||||
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
|
||||
ret <4 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -60,12 +54,11 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
|
|||
; CHECK-NEXT: vdot.bf16 d0, d1, d3[1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||
%1 = bitcast <4 x bfloat> %a to <8 x i8>
|
||||
%2 = bitcast <2 x float> %shuffle to <8 x i8>
|
||||
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
|
||||
ret <2 x float> %vbfdot1.i
|
||||
%.cast = bitcast <8 x bfloat> %b to <4 x float>
|
||||
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
|
||||
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
|
||||
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
|
||||
ret <2 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||
|
@ -75,12 +68,11 @@ define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x
|
|||
; CHECK-NEXT: vdot.bf16 q0, q1, d4[0]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
|
||||
%1 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%2 = bitcast <4 x float> %shuffle to <16 x i8>
|
||||
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
|
||||
ret <4 x float> %vbfdot1.i
|
||||
%.cast = bitcast <4 x bfloat> %b to <2 x float>
|
||||
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
|
||||
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
|
||||
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
|
||||
ret <4 x float> %vbfdot3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -89,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
|
|||
; CHECK-NEXT: vmmla.bf16 q0, q1, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmmla1.i
|
||||
%vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <4 x float> %vbfmmlaq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -101,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||
; CHECK-NEXT: vfmab.bf16 q0, q1, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <4 x float> %vbfmlalbq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -113,10 +101,8 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
|
|||
; CHECK-NEXT: vfmat.bf16 q0, q1, q2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %b to <16 x i8>
|
||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <4 x float> %vbfmlaltq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||
|
@ -127,10 +113,8 @@ define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
|
|||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlalbq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -140,10 +124,8 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalb1.i
|
||||
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlalbq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
|
||||
|
@ -154,10 +136,8 @@ define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
|
|||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlaltq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -167,10 +147,8 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
|
|||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlaltq_v3.i
|
||||
}
|
||||
|
||||
define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
|
@ -181,14 +159,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a,
|
|||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
|
||||
%0 = bitcast <8 x bfloat> %a to <16 x i8>
|
||||
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
|
||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
|
||||
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
|
||||
ret <4 x float> %vbfmlalt1.i
|
||||
}
|
||||
|
||||
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
|
||||
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
|
||||
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
|
||||
|
|
Loading…
Reference in New Issue