[ARM][BFloat16] Change types of some Arm and AArch64 bf16 intrinsics

This patch adjusts the following ARM/AArch64 LLVM IR intrinsics:
- neon_bfmmla
- neon_bfmlalb
- neon_bfmlalt
so that they take and return bf16 and float types. Previously these
intrinsics used <8 x i8> and <4 x i8> vectors (a rudiment from
implementation lacking bf16 IR type).

The neon_vbfdot[q] intrinsics are adjusted similarly. This change
required some additional selection patterns for vbfdot itself and
also for vector shuffles (in a previous patch) because of SelectionDAG
transformations kicking in and mangling the original code.

This patch makes the generated IR cleaner (less useless bitcasts are
produced), but it does not affect the final assembly.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D86146
This commit is contained in:
Mikhail Maltsev 2020-08-27 18:43:16 +01:00
parent ba852e1e19
commit ae1396c7d4
15 changed files with 501 additions and 331 deletions

View File

@ -6241,28 +6241,10 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
case NEON::BI__builtin_neon_vbfdot_v:
case NEON::BI__builtin_neon_vbfdotq_v: {
llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
}
case NEON::BI__builtin_neon_vbfmmlaq_v: {
llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla");
}
case NEON::BI__builtin_neon_vbfmlalbq_v: {
llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb");
}
case NEON::BI__builtin_neon_vbfmlaltq_v: {
llvm::Type *InputTy =
llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
llvm::Type *Tys[2] = { Ty, InputTy };
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt");
}
case NEON::BI__builtin_neon___a32_vcvt_bf16_v: {
llvm::Type *Tys[1] = { Ty };
Function *F = CGM.getIntrinsic(Int, Tys);

View File

@ -1,146 +1,138 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck %s
#include <arm_neon.h>
// CHECK-LABEL: test_vbfdot_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <4 x bfloat> %a to <8 x i8>
// CHECK-NEXT %1 = bitcast <4 x bfloat> %b to <8 x i8>
// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
// CHECK-NEXT ret <2 x float> %vbfdot1.i
// CHECK-LABEL: @test_vbfdot_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]]
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
return vbfdot_f32(r, a, b);
}
// CHECK-LABEL: test_vbfdotq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfdot1.i
// CHECK-LABEL: @test_vbfdotq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
return vbfdotq_f32(r, a, b);
}
// CHECK-LABEL: test_vbfdot_lane_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>
// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>
// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>
// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
// CHECK-NEXT ret <2 x float> %vbfdot1.i
// CHECK-LABEL: @test_vbfdot_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
return vbfdot_lane_f32(r, a, b, 0);
}
// CHECK-LABEL: test_vbfdotq_laneq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>
// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>
// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
// CHECK-NEXT ret <4 x float> %vbfdot1.i
// CHECK-LABEL: @test_vbfdotq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfdotq_laneq_f32(r, a, b, 3);
}
// CHECK-LABEL: test_vbfdot_laneq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>
// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>
// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>
// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
// CHECK-NEXT ret <2 x float> %vbfdot1.i
// CHECK-LABEL: @test_vbfdot_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
return vbfdot_laneq_f32(r, a, b, 3);
}
// CHECK-LABEL: test_vbfdotq_lane_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>
// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>
// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
// CHECK-NEXT ret <4 x float> %vbfdot1.i
// CHECK-LABEL: @test_vbfdotq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
return vbfdotq_lane_f32(r, a, b, 0);
}
// CHECK-LABEL: test_vbfmmlaq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
// CHECK-NEXT %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfmmla1.i
// CHECK-LABEL: @test_vbfmmlaq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
//
float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmmlaq_f32(r, a, b);
}
// CHECK-LABEL: test_vbfmlalbq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
// CHECK-LABEL: @test_vbfmlalbq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlalbq_f32(r, a, b);
}
// CHECK-LABEL: test_vbfmlaltq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
// CHECK-LABEL: @test_vbfmlaltq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlaltq_f32(r, a, b);
}
// CHECK-LABEL: test_vbfmlalbq_lane_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
return vbfmlalbq_lane_f32(r, a, b, 0);
}
// CHECK-LABEL: test_vbfmlalbq_laneq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlalbq_laneq_f32(r, a, b, 3);
}
// CHECK-LABEL: test_vbfmlaltq_lane_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
return vbfmlaltq_lane_f32(r, a, b, 0);
}
// CHECK-LABEL: test_vbfmlaltq_laneq_f32
// CHECK-NEXT: entry:
// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlaltq_laneq_f32(r, a, b, 3);
}

View File

@ -12,10 +12,8 @@
// CHECK-LABEL: @test_vbfdot_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]]
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
return vbfdot_f32(r, a, b);
@ -23,10 +21,8 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-LABEL: @test_vbfdotq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
return vbfdotq_f32(r, a, b);
@ -36,10 +32,9 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
return vbfdot_lane_f32(r, a, b, 0);
@ -49,10 +44,9 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfdotq_laneq_f32(r, a, b, 3);
@ -62,10 +56,9 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
return vbfdot_laneq_f32(r, a, b, 3);
@ -75,10 +68,9 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
return vbfdotq_lane_f32(r, a, b, 0);
@ -86,10 +78,8 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
// CHECK-LABEL: @test_vbfmmlaq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMMLA1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]]
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
//
float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmmlaq_f32(r, a, b);
@ -97,10 +87,8 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlalbq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlalbq_f32(r, a, b);
@ -108,10 +96,8 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlaltq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlaltq_f32(r, a, b);
@ -120,10 +106,8 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
return vbfmlalbq_lane_f32(r, a, b, 0);
@ -132,10 +116,8 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlalbq_laneq_f32(r, a, b, 3);
@ -144,10 +126,8 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
return vbfmlaltq_lane_f32(r, a, b, 0);
@ -156,10 +136,8 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlaltq_laneq_f32(r, a, b, 3);

View File

@ -184,6 +184,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;
class AdvSIMD_BF16FML_Intrinsic
: Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
}
// Arithmetic ops
@ -466,9 +470,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic;
def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;
def int_aarch64_neon_bfmmla
: Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
// v8.6-A Bfloat Intrinsics

View File

@ -791,14 +791,17 @@ def int_arm_neon_vcvtbfp2bf
: Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;
def int_arm_neon_bfmmla
: Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
class Neon_FML_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;
def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;
def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;
class Neon_BF16FML_Intrinsic
: Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic;
def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic;
def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;

View File

@ -632,6 +632,63 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return true;
}
}
// Changed in 12.0: bfdot accept v4bf16 and v8bf16 instead of v8i8 and v16i8
// respectively
if ((Name.startswith("arm.neon.bfdot.") ||
Name.startswith("aarch64.neon.bfdot.")) &&
Name.endswith("i8")) {
Intrinsic::ID IID =
StringSwitch<Intrinsic::ID>(Name)
.Cases("arm.neon.bfdot.v2f32.v8i8",
"arm.neon.bfdot.v4f32.v16i8",
Intrinsic::arm_neon_bfdot)
.Cases("aarch64.neon.bfdot.v2f32.v8i8",
"aarch64.neon.bfdot.v4f32.v16i8",
Intrinsic::aarch64_neon_bfdot)
.Default(Intrinsic::not_intrinsic);
if (IID == Intrinsic::not_intrinsic)
break;
size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits();
assert((OperandWidth == 64 || OperandWidth == 128) &&
"Unexpected operand width");
LLVMContext &Ctx = F->getParent()->getContext();
std::array<Type *, 2> Tys {{
F->getReturnType(),
FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)
}};
NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
return true;
}
// Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore
// and accept v8bf16 instead of v16i8
if ((Name.startswith("arm.neon.bfm") ||
Name.startswith("aarch64.neon.bfm")) &&
Name.endswith(".v4f32.v16i8")) {
Intrinsic::ID IID =
StringSwitch<Intrinsic::ID>(Name)
.Case("arm.neon.bfmmla.v4f32.v16i8",
Intrinsic::arm_neon_bfmmla)
.Case("arm.neon.bfmlalb.v4f32.v16i8",
Intrinsic::arm_neon_bfmlalb)
.Case("arm.neon.bfmlalt.v4f32.v16i8",
Intrinsic::arm_neon_bfmlalt)
.Case("aarch64.neon.bfmmla.v4f32.v16i8",
Intrinsic::aarch64_neon_bfmmla)
.Case("aarch64.neon.bfmlalb.v4f32.v16i8",
Intrinsic::aarch64_neon_bfmlalb)
.Case("aarch64.neon.bfmlalt.v4f32.v16i8",
Intrinsic::aarch64_neon_bfmlalt)
.Default(Intrinsic::not_intrinsic);
if (IID == Intrinsic::not_intrinsic)
break;
std::array<Type *, 0> Tys;
NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys);
return true;
}
break;
}
@ -3618,6 +3675,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
break;
}
case Intrinsic::arm_neon_bfdot:
case Intrinsic::arm_neon_bfmmla:
case Intrinsic::arm_neon_bfmlalb:
case Intrinsic::arm_neon_bfmlalt:
case Intrinsic::aarch64_neon_bfdot:
case Intrinsic::aarch64_neon_bfmmla:
case Intrinsic::aarch64_neon_bfmlalb:
case Intrinsic::aarch64_neon_bfmlalt: {
SmallVector<Value *, 3> Args;
assert(CI->getNumArgOperands() == 3 &&
"Mismatch between function args and call args");
size_t OperandWidth =
CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits();
assert((OperandWidth == 64 || OperandWidth == 128) &&
"Unexpected operand width");
Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16);
auto Iter = CI->arg_operands().begin();
Args.push_back(*Iter++);
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
NewCall = Builder.CreateCall(NewFn, Args);
break;
}
case Intrinsic::bitreverse:
NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
break;

View File

@ -7841,9 +7841,9 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
v2f32, v8i8>;
v2f32, v4bf16>;
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
v4f32, v16i8>;
v4f32, v8bf16>;
}
class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
@ -7861,7 +7861,7 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
(InputType RegType:$Rn),
(InputType (bitconvert (AccumType
(AArch64duplane32 (v4f32 V128:$Rm),
VectorIndexH:$idx)))))))]> {
VectorIndexS:$idx)))))))]> {
bits<2> idx;
let Inst{21} = idx{0}; // L
@ -7871,16 +7871,16 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
".2h", V64, v2f32, v8i8>;
".2h", V64, v2f32, v4bf16>;
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
".2h", V128, v4f32, v16i8>;
".2h", V128, v4f32, v8bf16>;
}
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
(v8bf16 V128:$Rn),
(v8bf16 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
}
@ -7890,10 +7890,10 @@ class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
[(set (v4f32 V128:$dst),
(v4f32 (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 (bitconvert (v8bf16
(v8bf16 V128:$Rn),
(v8bf16
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
VectorIndexH:$idx)))))))]>,
VectorIndexH:$idx)))))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
@ -7917,8 +7917,8 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
V128, asm, ".4s",
[(set (v4f32 V128:$dst),
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
(v8bf16 V128:$Rn),
(v8bf16 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
", $Rm", ".8h", "}");
}

View File

@ -798,6 +798,23 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
// Vector-scalar BFDOT:
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
// register (the instruction uses a single 32-bit lane from it), so the pattern
// is a bit tricky.
def : Pat<(v2f32 (int_aarch64_neon_bfdot
(v2f32 V64:$Rd), (v4bf16 V64:$Rn),
(v4bf16 (bitconvert
(v2i32 (AArch64duplane32
(v4i32 (bitconvert
(v8bf16 (insert_subvector undef,
(v4bf16 V64:$Rm),
(i64 0))))),
VectorIndexS:$idx)))))),
(BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
VectorIndexS:$idx)>;
}
// ARMv8.6A AArch64 matrix multiplication

View File

@ -9079,11 +9079,11 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
(!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
}
def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>;
def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>;
defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>;
defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
class BF16MM<bit Q, RegisterClass RegTy,
string opc>
@ -9091,8 +9091,8 @@ class BF16MM<bit Q, RegisterClass RegTy,
(outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
N3RegFrm, IIC_VDOTPROD, "", "",
[(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
(v16i8 QPR:$Vn),
(v16i8 QPR:$Vm)))]> {
(v8bf16 QPR:$Vn),
(v8bf16 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";
let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
let DecoderNamespace = "VFPV8";
@ -9106,8 +9106,8 @@ class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
[(set (v4f32 QPR:$dst),
(OpNode (v4f32 QPR:$Vd),
(v16i8 QPR:$Vn),
(v16i8 QPR:$Vm)))]> {
(v8bf16 QPR:$Vn),
(v8bf16 QPR:$Vm)))]> {
let Constraints = "$dst = $Vd";
let DecoderNamespace = "VFPV8";
}
@ -9128,9 +9128,9 @@ multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
def : Pat<
(v4f32 (OpNode (v4f32 QPR:$Vd),
(v16i8 QPR:$Vn),
(v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
VectorIndex16:$lane)))))),
(v8bf16 QPR:$Vn),
(v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
VectorIndex16:$lane)))),
(!cast<Instruction>(NAME) QPR:$Vd,
QPR:$Vn,
(EXTRACT_SUBREG QPR:$Vm,

View File

@ -0,0 +1,76 @@
; RUN: llvm-dis < %s.bc | FileCheck %s
; Bitcode was generated from file below
define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: @test_vbfdot_f32
entry:
%0 = bitcast <4 x bfloat> %a to <8 x i8>
%1 = bitcast <4 x bfloat> %b to <8 x i8>
; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
%vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
ret <2 x float> %vbfdot1.i
}
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfdotq_f32
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
%vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfdot1.i
}
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmmlaq_f32
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmmla1.i
}
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmlalb1.i
}
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmlalt1.i
}
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)

View File

@ -0,0 +1,76 @@
; RUN: llvm-dis < %s.bc | FileCheck %s
; Bitcode was generated from file below
define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: @test_vbfdot_f32
entry:
%0 = bitcast <4 x bfloat> %a to <8 x i8>
%1 = bitcast <4 x bfloat> %b to <8 x i8>
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat>
; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat>
; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3)
ret <2 x float> %vbfdot1.i
}
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfdotq_f32
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfdot1.i
}
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmmlaq_f32
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmmla1.i
}
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmlalbq_laneq_f32
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmlalb1.i
}
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: @test_vbfmlaltq_laneq_f32
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat>
; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat>
; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3)
ret <4 x float> %vbfmlalt1.i
}
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)

View File

View File

@ -7,10 +7,8 @@ define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x bfloat> %a to <8 x i8>
%1 = bitcast <4 x bfloat> %b to <8 x i8>
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
ret <2 x float> %vbfdot1.i
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b)
ret <2 x float> %vbfdot3.i
}
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -19,24 +17,22 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfdot1.i
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
ret <4 x float> %vbfdot3.i
}
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_lane_f32:
; CHECK: // %bb.0: // %entry
; CHECK: bfdot v0.2s, v1.4h, v2.2h[0]
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[0]
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x bfloat> %b to <2 x float>
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
%1 = bitcast <4 x bfloat> %a to <8 x i8>
%2 = bitcast <2 x float> %shuffle to <8 x i8>
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
ret <2 x float> %vbfdot1.i
%.cast = bitcast <4 x bfloat> %b to <2 x float>
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
ret <2 x float> %vbfdot3.i
}
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -45,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3]
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x bfloat> %b to <4 x float>
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%1 = bitcast <8 x bfloat> %a to <16 x i8>
%2 = bitcast <4 x float> %shuffle to <16 x i8>
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
ret <4 x float> %vbfdot1.i
%.cast = bitcast <8 x bfloat> %b to <4 x float>
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
ret <4 x float> %vbfdot3.i
}
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
@ -59,26 +54,25 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x bfloat> %b to <4 x float>
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%1 = bitcast <4 x bfloat> %a to <8 x i8>
%2 = bitcast <2 x float> %shuffle to <8 x i8>
%vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
ret <2 x float> %vbfdot1.i
%.cast = bitcast <8 x bfloat> %b to <4 x float>
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
%vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1)
ret <2 x float> %vbfdot3.i
}
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_lane_f32:
; CHECK: // %bb.0: // %entry
; CHECK: bfdot v0.4s, v1.8h, v2.2h[0]
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[0]
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x bfloat> %b to <2 x float>
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
%1 = bitcast <8 x bfloat> %a to <16 x i8>
%2 = bitcast <4 x float> %shuffle to <16 x i8>
%vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
ret <4 x float> %vbfdot1.i
%.cast = bitcast <4 x bfloat> %b to <2 x float>
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
%vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1)
ret <4 x float> %vbfdot3.i
}
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -87,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmmla1.i
%vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
ret <4 x float> %vbfmmlaq_v3.i
}
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -99,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalb1.i
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
ret <4 x float> %vbfmlalbq_v3.i
}
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -111,23 +101,20 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalt1.i
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
ret <4 x float> %vbfmlaltq_v3.i
}
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_lane_f32:
; CHECK: // %bb.0: // %entry
; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalb1.i
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlalbq_v3.i
}
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -137,23 +124,20 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
; CHECK-NEXT: ret
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalb1.i
%vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlalbq_v3.i
}
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_lane_f32:
; CHECK: // %bb.0: // %entry
; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalt1.i
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlaltq_v3.i
}
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -163,14 +147,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
; CHECK-NEXT: ret
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalt1.i
%vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlaltq_v3.i
}
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)

View File

@ -7,10 +7,8 @@ define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat>
; CHECK-NEXT: vdot.bf16 d0, d1, d2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <4 x bfloat> %a to <8 x i8>
%1 = bitcast <4 x bfloat> %b to <8 x i8>
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
ret <2 x float> %vbfdot1.i
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3
ret <2 x float> %vbfdot3.i
}
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -19,10 +17,8 @@ define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloa
; CHECK-NEXT: vdot.bf16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfdot1.i
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) #3
ret <4 x float> %vbfdot3.i
}
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
@ -31,12 +27,11 @@ define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x b
; CHECK-NEXT: vdot.bf16 d0, d1, d2[0]
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <4 x bfloat> %b to <2 x float>
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
%1 = bitcast <4 x bfloat> %a to <8 x i8>
%2 = bitcast <2 x float> %shuffle to <8 x i8>
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
ret <2 x float> %vbfdot1.i
%.cast = bitcast <4 x bfloat> %b to <2 x float>
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
ret <2 x float> %vbfdot3.i
}
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -46,12 +41,11 @@ define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x
; CHECK-NEXT: vdot.bf16 q0, q1, q8
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x bfloat> %b to <4 x float>
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%1 = bitcast <8 x bfloat> %a to <16 x i8>
%2 = bitcast <4 x float> %shuffle to <16 x i8>
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
ret <4 x float> %vbfdot1.i
%.cast = bitcast <8 x bfloat> %b to <4 x float>
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
ret <4 x float> %vbfdot3.i
}
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
@ -60,12 +54,11 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x
; CHECK-NEXT: vdot.bf16 d0, d1, d3[1]
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x bfloat> %b to <4 x float>
%shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%1 = bitcast <4 x bfloat> %a to <8 x i8>
%2 = bitcast <2 x float> %shuffle to <8 x i8>
%vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
ret <2 x float> %vbfdot1.i
%.cast = bitcast <8 x bfloat> %b to <4 x float>
%lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
%vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
ret <2 x float> %vbfdot3.i
}
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@ -75,12 +68,11 @@ define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x
; CHECK-NEXT: vdot.bf16 q0, q1, d4[0]
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <4 x bfloat> %b to <2 x float>
%shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
%1 = bitcast <8 x bfloat> %a to <16 x i8>
%2 = bitcast <4 x float> %shuffle to <16 x i8>
%vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
ret <4 x float> %vbfdot1.i
%.cast = bitcast <4 x bfloat> %b to <2 x float>
%lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
%.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
%vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
ret <4 x float> %vbfdot3.i
}
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -89,10 +81,8 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
; CHECK-NEXT: vmmla.bf16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmmla1.i
%vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
ret <4 x float> %vbfmmlaq_v3.i
}
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -101,10 +91,8 @@ define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
; CHECK-NEXT: vfmab.bf16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalb1.i
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
ret <4 x float> %vbfmlalbq_v3.i
}
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -113,10 +101,8 @@ define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfl
; CHECK-NEXT: vfmat.bf16 q0, q1, q2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %b to <16 x i8>
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalt1.i
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
ret <4 x float> %vbfmlaltq_v3.i
}
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@ -127,10 +113,8 @@ define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
; CHECK-NEXT: bx lr
entry:
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalb1.i
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlalbq_v3.i
}
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -140,10 +124,8 @@ define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
; CHECK-NEXT: bx lr
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalb1.i
%vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlalbq_v3.i
}
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@ -154,10 +136,8 @@ define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4
; CHECK-NEXT: bx lr
entry:
%vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalt1.i
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlaltq_v3.i
}
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -167,10 +147,8 @@ define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8
; CHECK-NEXT: bx lr
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
ret <4 x float> %vbfmlalt1.i
%vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlaltq_v3.i
}
define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@ -181,14 +159,12 @@ define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a,
; CHECK-NEXT: bx lr
entry:
%vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
%0 = bitcast <8 x bfloat> %a to <16 x i8>
%1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
%vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
ret <4 x float> %vbfmlalt1.i
}
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)