[MS] Add more 128bit cmpxchg intrinsics for AArch64

The MSVC STL requires this on ARM64.
Requested in https://llvm.org/pr47099

Depends on D92061

Differential Revision: https://reviews.llvm.org/D92062
Reid Kleckner, 2020-11-24 11:50:33 -08:00
commit 1e843a987d (parent 3bd0672726)
4 changed files with 163 additions and 62 deletions
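The new AArch64 variants take the same operands as the existing x64 _InterlockedCompareExchange128; a minimal usage sketch (illustrative helper, not part of this commit):

#include <intrin.h>

// Compare the 16-byte-aligned 128-bit value at Dest with *Expected; on
// success, store ExchangeHigh:ExchangeLow with acquire ordering and return 1.
// The value observed at Dest is always written back to *Expected.
unsigned char try_swap128(__int64 volatile *Dest, __int64 *Expected) {
  return _InterlockedCompareExchange128_acq(Dest, /*ExchangeHigh=*/1,
                                            /*ExchangeLow=*/2, Expected);
}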

clang/include/clang/Basic/BuiltinsAArch64.def

@@ -153,6 +153,11 @@ TARGET_HEADER_BUILTIN(_InterlockedCompareExchange64_acq, "LLiLLiD*LLiLLi", "nh",
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange64_nf, "LLiLLiD*LLiLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange64_rel, "LLiLLiD*LLiLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128_acq, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128_nf, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128_rel, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedOr8_acq, "ccD*c", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedOr8_nf, "ccD*c", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedOr8_rel, "ccD*c", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
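For reference, the type string "UcLLiD*LLiLLiLLi*" in the new 128-bit entries decodes as: Uc = unsigned char (return type), LLi = long long (__int64), D* = volatile pointer, and the trailing LLi* = __int64 * out-parameter, i.e. the prototype intrin.h declares below:

unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
                                                 __int64 _ExchangeHigh,
                                                 __int64 _ExchangeLow,
                                                 __int64 *_ComparandResult);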

clang/lib/CodeGen/CGBuiltin.cpp

@@ -304,6 +304,10 @@ Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
AtomicOrdering::Monotonic :
SuccessOrdering;
// The atomic instruction is marked volatile for consistency with MSVC. This
// blocks the few atomics optimizations that LLVM has. If we want to optimize
// _Interlocked* operations in the future, we will have to remove the volatile
// marker.
auto *Result = CGF.Builder.CreateAtomicCmpXchg(
Destination, Comparand, Exchange,
SuccessOrdering, FailureOrdering);
@@ -311,6 +315,68 @@ Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
return CGF.Builder.CreateExtractValue(Result, 0);
}
// 64-bit Microsoft platforms support 128-bit cmpxchg operations. They are
// prototyped like this:
//
// unsigned char _InterlockedCompareExchange128...(
// __int64 volatile * _Destination,
// __int64 _ExchangeHigh,
// __int64 _ExchangeLow,
// __int64 * _ComparandResult);
static Value *EmitAtomicCmpXchg128ForMSIntrin(CodeGenFunction &CGF,
const CallExpr *E,
AtomicOrdering SuccessOrdering) {
assert(E->getNumArgs() == 4);
llvm::Value *Destination = CGF.EmitScalarExpr(E->getArg(0));
llvm::Value *ExchangeHigh = CGF.EmitScalarExpr(E->getArg(1));
llvm::Value *ExchangeLow = CGF.EmitScalarExpr(E->getArg(2));
llvm::Value *ComparandPtr = CGF.EmitScalarExpr(E->getArg(3));
assert(Destination->getType()->isPointerTy());
assert(!ExchangeHigh->getType()->isPointerTy());
assert(!ExchangeLow->getType()->isPointerTy());
assert(ComparandPtr->getType()->isPointerTy());
// For Release ordering, the failure ordering should be Monotonic.
auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release
? AtomicOrdering::Monotonic
: SuccessOrdering;
// Convert to i128 pointers and values.
llvm::Type *Int128Ty = llvm::IntegerType::get(CGF.getLLVMContext(), 128);
llvm::Type *Int128PtrTy = Int128Ty->getPointerTo();
Destination = CGF.Builder.CreateBitCast(Destination, Int128PtrTy);
Address ComparandResult(CGF.Builder.CreateBitCast(ComparandPtr, Int128PtrTy),
CGF.getContext().toCharUnitsFromBits(128));
// (((i128)hi) << 64) | ((i128)lo)
ExchangeHigh = CGF.Builder.CreateZExt(ExchangeHigh, Int128Ty);
ExchangeLow = CGF.Builder.CreateZExt(ExchangeLow, Int128Ty);
ExchangeHigh =
CGF.Builder.CreateShl(ExchangeHigh, llvm::ConstantInt::get(Int128Ty, 64));
llvm::Value *Exchange = CGF.Builder.CreateOr(ExchangeHigh, ExchangeLow);
// Load the comparand for the instruction.
llvm::Value *Comparand = CGF.Builder.CreateLoad(ComparandResult);
auto *CXI = CGF.Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
SuccessOrdering, FailureOrdering);
// The atomic instruction is marked volatile for consistency with MSVC. This
// blocks the few atomics optimizations that LLVM has. If we want to optimize
// _Interlocked* operations in the future, we will have to remove the volatile
// marker.
CXI->setVolatile(true);
// Store the result as an outparameter.
CGF.Builder.CreateStore(CGF.Builder.CreateExtractValue(CXI, 0),
ComparandResult);
// Get the success boolean and zero extend it to i8.
Value *Success = CGF.Builder.CreateExtractValue(CXI, 1);
return CGF.Builder.CreateZExt(Success, CGF.Int8Ty);
}
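For reference, the zext/shl/or sequence above computes (hi << 64) | lo in 128 bits; a minimal standalone C sketch (not part of this commit, assuming a compiler with unsigned __int128 support), where hi = 1 and lo = 2 pack to 0x00000000000000010000000000000002:

static unsigned __int128 pack128(unsigned long long hi, unsigned long long lo) {
  // Zero-extend both 64-bit halves, shift the high half into the top 64 bits,
  // then OR in the low half -- the same sequence the builder emits above.
  return ((unsigned __int128)hi << 64) | (unsigned __int128)lo;
}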
static Value *EmitAtomicIncrementValue(CodeGenFunction &CGF, const CallExpr *E,
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
assert(E->getArg(0)->getType()->isPointerType());
@@ -993,6 +1059,10 @@ enum class CodeGenFunction::MSVCIntrin {
_InterlockedCompareExchange_acq,
_InterlockedCompareExchange_rel,
_InterlockedCompareExchange_nf,
_InterlockedCompareExchange128,
_InterlockedCompareExchange128_acq,
_InterlockedCompareExchange128_rel,
_InterlockedCompareExchange128_nf,
_InterlockedOr_acq,
_InterlockedOr_rel,
_InterlockedOr_nf,
@@ -1230,6 +1300,14 @@ translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
case AArch64::BI_InterlockedCompareExchange_nf:
case AArch64::BI_InterlockedCompareExchange64_nf:
return MSVCIntrin::_InterlockedCompareExchange_nf;
case AArch64::BI_InterlockedCompareExchange128:
return MSVCIntrin::_InterlockedCompareExchange128;
case AArch64::BI_InterlockedCompareExchange128_acq:
return MSVCIntrin::_InterlockedCompareExchange128_acq;
case AArch64::BI_InterlockedCompareExchange128_nf:
return MSVCIntrin::_InterlockedCompareExchange128_nf;
case AArch64::BI_InterlockedCompareExchange128_rel:
return MSVCIntrin::_InterlockedCompareExchange128_rel;
case AArch64::BI_InterlockedOr8_acq:
case AArch64::BI_InterlockedOr16_acq:
case AArch64::BI_InterlockedOr_acq:
@@ -1317,6 +1395,8 @@ translateX86ToMsvcIntrin(unsigned BuiltinID) {
return MSVCIntrin::_BitScanReverse;
case clang::X86::BI_InterlockedAnd64:
return MSVCIntrin::_InterlockedAnd;
case clang::X86::BI_InterlockedCompareExchange128:
return MSVCIntrin::_InterlockedCompareExchange128;
case clang::X86::BI_InterlockedExchange64:
return MSVCIntrin::_InterlockedExchange;
case clang::X86::BI_InterlockedExchangeAdd64:
@@ -1423,6 +1503,15 @@ Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Release);
case MSVCIntrin::_InterlockedCompareExchange_nf:
return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Monotonic);
case MSVCIntrin::_InterlockedCompareExchange128:
return EmitAtomicCmpXchg128ForMSIntrin(
*this, E, AtomicOrdering::SequentiallyConsistent);
case MSVCIntrin::_InterlockedCompareExchange128_acq:
return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Acquire);
case MSVCIntrin::_InterlockedCompareExchange128_rel:
return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Release);
case MSVCIntrin::_InterlockedCompareExchange128_nf:
return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Monotonic);
case MSVCIntrin::_InterlockedOr_acq:
return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
AtomicOrdering::Acquire);
@@ -14032,42 +14121,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
llvm::SyncScope::SingleThread);
}
case X86::BI_InterlockedCompareExchange128: {
// InterlockedCompareExchange128 doesn't directly refer to 128bit ints,
// instead it takes pointers to 64bit ints for Destination and
// ComparandResult, and exchange is taken as two 64bit ints (high & low).
// The previous value is written to ComparandResult, and success is
// returned.
llvm::Type *Int128Ty = Builder.getInt128Ty();
llvm::Type *Int128PtrTy = Int128Ty->getPointerTo();
Value *Destination =
Builder.CreateBitCast(Ops[0], Int128PtrTy);
Value *ExchangeHigh128 = Builder.CreateZExt(Ops[1], Int128Ty);
Value *ExchangeLow128 = Builder.CreateZExt(Ops[2], Int128Ty);
Address ComparandResult(Builder.CreateBitCast(Ops[3], Int128PtrTy),
getContext().toCharUnitsFromBits(128));
Value *Exchange = Builder.CreateOr(
Builder.CreateShl(ExchangeHigh128, 64, "", false, false),
ExchangeLow128);
Value *Comparand = Builder.CreateLoad(ComparandResult);
AtomicCmpXchgInst *CXI =
Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
AtomicOrdering::SequentiallyConsistent,
AtomicOrdering::SequentiallyConsistent);
CXI->setVolatile(true);
// Write the result back to the inout pointer.
Builder.CreateStore(Builder.CreateExtractValue(CXI, 0), ComparandResult);
// Get the success boolean and zero extend it to i8.
Value *Success = Builder.CreateExtractValue(CXI, 1);
return Builder.CreateZExt(Success, ConvertType(E->getType()));
}
case X86::BI_AddressOfReturnAddress: {
Function *F =

clang/lib/Headers/intrin.h

@@ -214,10 +214,6 @@ unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
long _Comparand);
unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_CompareandResult);
unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
@@ -427,6 +423,26 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand);
#endif
#if defined(__x86_64__) || defined(__aarch64__)
unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
#endif
#if defined(__aarch64__)
unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
#endif
/*----------------------------------------------------------------------------*\
|* movs, stos

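Callers such as the MSVC STL typically wrap these intrinsics in a retry loop; a minimal sketch of a 128-bit atomic store built on the declarations above (hypothetical helper, not part of this commit; assumes a little-endian layout for the two __int64 halves):

static void atomic_store128(__int64 volatile *_Destination, __int64 _High,
                            __int64 _Low) {
  __declspec(align(16)) __int64 Expected[2];
  Expected[0] = _Destination[0]; /* low half on little-endian targets */
  Expected[1] = _Destination[1]; /* high half */
  /* On failure the intrinsic writes the observed value into Expected, so the
     loop simply retries with the refreshed comparand until it succeeds. */
  while (!_InterlockedCompareExchange128(_Destination, _High, _Low, Expected))
    ;
}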
clang/test/CodeGen/ms-intrinsics.c

@@ -6,10 +6,10 @@
// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-ARM,CHECK-ARM-ARM64,CHECK-ARM-X64
// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
// RUN: -triple x86_64--windows -Oz -emit-llvm -target-feature +cx16 %s -o - \
// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-X64,CHECK-ARM-X64,CHECK-INTEL
// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-X64,CHECK-ARM-X64,CHECK-INTEL,CHECK-64
// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
// RUN: -triple aarch64-windows -Oz -emit-llvm %s -o - \
// RUN: | FileCheck %s --check-prefixes CHECK-ARM-ARM64,CHECK-ARM-X64,CHECK-ARM64
// RUN: | FileCheck %s --check-prefixes CHECK-ARM-ARM64,CHECK-ARM-X64,CHECK-ARM64,CHECK-64
// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
// stddef.h. Work around it with this typedef.
@@ -432,32 +432,59 @@ __int64 test_InterlockedCompareExchange64(__int64 volatile *Destination, __int64
// CHECK: ret i64 [[RESULT]]
// CHECK: }
#if defined(__x86_64__)
#if defined(__x86_64__) || defined(__aarch64__)
unsigned char test_InterlockedCompareExchange128(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128(++Destination, ++ExchangeHigh,
++ExchangeLow, ++ComparandResult);
}
// CHECK-X64: define{{.*}}i8 @test_InterlockedCompareExchange128(i64*{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, i64*{{[a-z_ ]*}}%ComparandResult){{.*}}{
// CHECK-X64: %incdec.ptr = getelementptr inbounds i64, i64* %Destination, i64 1
// CHECK-X64: %inc = add nsw i64 %ExchangeHigh, 1
// CHECK-X64: %inc1 = add nsw i64 %ExchangeLow, 1
// CHECK-X64: %incdec.ptr2 = getelementptr inbounds i64, i64* %ComparandResult, i64 1
// CHECK-X64: [[DST:%[0-9]+]] = bitcast i64* %incdec.ptr to i128*
// CHECK-X64: [[EH:%[0-9]+]] = zext i64 %inc to i128
// CHECK-X64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128
// CHECK-X64: [[CNR:%[0-9]+]] = bitcast i64* %incdec.ptr2 to i128*
// CHECK-X64: [[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64
// CHECK-X64: [[EXP:%[0-9]+]] = or i128 [[EHS]], [[EL]]
// CHECK-X64: [[ORG:%[0-9]+]] = load i128, i128* [[CNR]], align 16
// CHECK-X64: [[RES:%[0-9]+]] = cmpxchg volatile i128* [[DST]], i128 [[ORG]], i128 [[EXP]] seq_cst seq_cst
// CHECK-X64: [[OLD:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 0
// CHECK-X64: store i128 [[OLD]], i128* [[CNR]], align 16
// CHECK-X64: [[SUC1:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 1
// CHECK-X64: [[SUC8:%[0-9]+]] = zext i1 [[SUC1]] to i8
// CHECK-X64: ret i8 [[SUC8]]
// CHECK-X64: }
// CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(i64*{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, i64*{{[a-z_ ]*}}%ComparandResult){{.*}}{
// CHECK-64: %incdec.ptr = getelementptr inbounds i64, i64* %Destination, i64 1
// CHECK-64: %inc = add nsw i64 %ExchangeHigh, 1
// CHECK-64: %inc1 = add nsw i64 %ExchangeLow, 1
// CHECK-64: %incdec.ptr2 = getelementptr inbounds i64, i64* %ComparandResult, i64 1
// CHECK-64: [[DST:%[0-9]+]] = bitcast i64* %incdec.ptr to i128*
// CHECK-64: [[CNR:%[0-9]+]] = bitcast i64* %incdec.ptr2 to i128*
// CHECK-64: [[EH:%[0-9]+]] = zext i64 %inc to i128
// CHECK-64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128
// CHECK-64: [[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64
// CHECK-64: [[EXP:%[0-9]+]] = or i128 [[EHS]], [[EL]]
// CHECK-64: [[ORG:%[0-9]+]] = load i128, i128* [[CNR]], align 16
// CHECK-64: [[RES:%[0-9]+]] = cmpxchg volatile i128* [[DST]], i128 [[ORG]], i128 [[EXP]] seq_cst seq_cst
// CHECK-64: [[OLD:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 0
// CHECK-64: store i128 [[OLD]], i128* [[CNR]], align 16
// CHECK-64: [[SUC1:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 1
// CHECK-64: [[SUC8:%[0-9]+]] = zext i1 [[SUC1]] to i8
// CHECK-64: ret i8 [[SUC8]]
// CHECK-64: }
#endif
#if defined(__aarch64__)
unsigned char test_InterlockedCompareExchange128_acq(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128_acq(Destination, ExchangeHigh,
ExchangeLow, ComparandResult);
}
unsigned char test_InterlockedCompareExchange128_nf(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128_nf(Destination, ExchangeHigh,
ExchangeLow, ComparandResult);
}
unsigned char test_InterlockedCompareExchange128_rel(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128_rel(Destination, ExchangeHigh,
ExchangeLow, ComparandResult);
}
// CHECK-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange128_acq({{.*}})
// CHECK-ARM64: cmpxchg volatile i128* %{{.*}}, i128 %{{.*}}, i128 %{{.*}} acquire acquire
// CHECK-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange128_nf({{.*}})
// CHECK-ARM64: cmpxchg volatile i128* %{{.*}}, i128 %{{.*}}, i128 %{{.*}} monotonic monotonic
// CHECK-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange128_rel({{.*}})
// CHECK-ARM64: cmpxchg volatile i128* %{{.*}}, i128 %{{.*}}, i128 %{{.*}} release monotonic
#endif
short test_InterlockedIncrement16(short volatile *Addend) {