[MS] Add more 128bit cmpxchg intrinsics for AArch64

The MSVC STL requires this on ARM64.
Requested in https://llvm.org/pr47099

Depends on D92061

Differential Revision: https://reviews.llvm.org/D92062
Reid Kleckner, 2020-11-24 11:50:33 -08:00
commit 1e843a987d (parent 3bd0672726)
4 changed files with 163 additions and 62 deletions
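The new AArch64 variants take the same operands as the existing x64 _InterlockedCompareExchange128; a minimal usage sketch (illustrative helper, not part of this commit):

#include <intrin.h>

// Compare the 16-byte-aligned 128-bit value at Dest with *Expected; on
// success, store ExchangeHigh:ExchangeLow with acquire ordering and return 1.
// The value observed at Dest is always written back to *Expected.
unsigned char try_swap128(__int64 volatile *Dest, __int64 *Expected) {
  return _InterlockedCompareExchange128_acq(Dest, /*ExchangeHigh=*/1,
                                            /*ExchangeLow=*/2, Expected);
}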

clang/include/clang/Basic/BuiltinsAArch64.def

@@ -153,6 +153,11 @@ TARGET_HEADER_BUILTIN(_InterlockedCompareExchange64_acq, "LLiLLiD*LLiLLi", "nh",
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange64_nf, "LLiLLiD*LLiLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange64_rel, "LLiLLiD*LLiLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128_acq, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128_nf, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128_rel, "UcLLiD*LLiLLiLLi*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedOr8_acq, "ccD*c", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedOr8_nf, "ccD*c", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_InterlockedOr8_rel, "ccD*c", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
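For reference, the type string "UcLLiD*LLiLLiLLi*" in the new 128-bit entries decodes as: Uc = unsigned char (return type), LLi = long long (__int64), D* = volatile pointer, and the trailing LLi* = __int64 * out-parameter, i.e. the prototype intrin.h declares below:

unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
                                                 __int64 _ExchangeHigh,
                                                 __int64 _ExchangeLow,
                                                 __int64 *_ComparandResult);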

clang/lib/CodeGen/CGBuiltin.cpp

@@ -304,6 +304,10 @@ Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
AtomicOrdering::Monotonic :
SuccessOrdering;
// The atomic instruction is marked volatile for consistency with MSVC. This
// blocks the few atomics optimizations that LLVM has. If we want to optimize
// _Interlocked* operations in the future, we will have to remove the volatile
// marker.
auto *Result = CGF.Builder.CreateAtomicCmpXchg(
Destination, Comparand, Exchange,
SuccessOrdering, FailureOrdering);
@@ -311,6 +315,68 @@ Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
return CGF.Builder.CreateExtractValue(Result, 0);
}
// 64-bit Microsoft platforms support 128-bit cmpxchg operations. They are
// prototyped like this:
//
// unsigned char _InterlockedCompareExchange128...(
// __int64 volatile * _Destination,
// __int64 _ExchangeHigh,
// __int64 _ExchangeLow,
// __int64 * _ComparandResult);
static Value *EmitAtomicCmpXchg128ForMSIntrin(CodeGenFunction &CGF,
const CallExpr *E,
AtomicOrdering SuccessOrdering) {
assert(E->getNumArgs() == 4);
llvm::Value *Destination = CGF.EmitScalarExpr(E->getArg(0));
llvm::Value *ExchangeHigh = CGF.EmitScalarExpr(E->getArg(1));
llvm::Value *ExchangeLow = CGF.EmitScalarExpr(E->getArg(2));
llvm::Value *ComparandPtr = CGF.EmitScalarExpr(E->getArg(3));
assert(Destination->getType()->isPointerTy());
assert(!ExchangeHigh->getType()->isPointerTy());
assert(!ExchangeLow->getType()->isPointerTy());
assert(ComparandPtr->getType()->isPointerTy());
// For Release ordering, the failure ordering should be Monotonic.
auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release
? AtomicOrdering::Monotonic
: SuccessOrdering;
// Convert to i128 pointers and values.
llvm::Type *Int128Ty = llvm::IntegerType::get(CGF.getLLVMContext(), 128);
llvm::Type *Int128PtrTy = Int128Ty->getPointerTo();
Destination = CGF.Builder.CreateBitCast(Destination, Int128PtrTy);
Address ComparandResult(CGF.Builder.CreateBitCast(ComparandPtr, Int128PtrTy),
CGF.getContext().toCharUnitsFromBits(128));
// (((i128)hi) << 64) | ((i128)lo)
ExchangeHigh = CGF.Builder.CreateZExt(ExchangeHigh, Int128Ty);
ExchangeLow = CGF.Builder.CreateZExt(ExchangeLow, Int128Ty);
ExchangeHigh =
CGF.Builder.CreateShl(ExchangeHigh, llvm::ConstantInt::get(Int128Ty, 64));
llvm::Value *Exchange = CGF.Builder.CreateOr(ExchangeHigh, ExchangeLow);
// Load the comparand for the instruction.
llvm::Value *Comparand = CGF.Builder.CreateLoad(ComparandResult);
auto *CXI = CGF.Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
SuccessOrdering, FailureOrdering);
// The atomic instruction is marked volatile for consistency with MSVC. This
// blocks the few atomics optimizations that LLVM has. If we want to optimize
// _Interlocked* operations in the future, we will have to remove the volatile
// marker.
CXI->setVolatile(true);
// Store the result as an outparameter.
CGF.Builder.CreateStore(CGF.Builder.CreateExtractValue(CXI, 0),
ComparandResult);
// Get the success boolean and zero extend it to i8.
Value *Success = CGF.Builder.CreateExtractValue(CXI, 1);
return CGF.Builder.CreateZExt(Success, CGF.Int8Ty);
}
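For reference, the zext/shl/or sequence above computes (hi << 64) | lo in 128 bits; a minimal standalone C sketch (not part of this commit, assuming a compiler with unsigned __int128 support), where hi = 1 and lo = 2 pack to 0x00000000000000010000000000000002:

static unsigned __int128 pack128(unsigned long long hi, unsigned long long lo) {
  // Zero-extend both 64-bit halves, shift the high half into the top 64 bits,
  // then OR in the low half -- the same sequence the builder emits above.
  return ((unsigned __int128)hi << 64) | (unsigned __int128)lo;
}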
static Value *EmitAtomicIncrementValue(CodeGenFunction &CGF, const CallExpr *E,
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
assert(E->getArg(0)->getType()->isPointerType());
@@ -993,6 +1059,10 @@ enum class CodeGenFunction::MSVCIntrin {
_InterlockedCompareExchange_acq,
_InterlockedCompareExchange_rel,
_InterlockedCompareExchange_nf,
_InterlockedCompareExchange128,
_InterlockedCompareExchange128_acq,
_InterlockedCompareExchange128_rel,
_InterlockedCompareExchange128_nf,
_InterlockedOr_acq,
_InterlockedOr_rel,
_InterlockedOr_nf,
@@ -1230,6 +1300,14 @@ translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
case AArch64::BI_InterlockedCompareExchange_nf:
case AArch64::BI_InterlockedCompareExchange64_nf:
return MSVCIntrin::_InterlockedCompareExchange_nf;
case AArch64::BI_InterlockedCompareExchange128:
return MSVCIntrin::_InterlockedCompareExchange128;
case AArch64::BI_InterlockedCompareExchange128_acq:
return MSVCIntrin::_InterlockedCompareExchange128_acq;
case AArch64::BI_InterlockedCompareExchange128_nf:
return MSVCIntrin::_InterlockedCompareExchange128_nf;
case AArch64::BI_InterlockedCompareExchange128_rel:
return MSVCIntrin::_InterlockedCompareExchange128_rel;
case AArch64::BI_InterlockedOr8_acq:
case AArch64::BI_InterlockedOr16_acq:
case AArch64::BI_InterlockedOr_acq:
@@ -1317,6 +1395,8 @@ translateX86ToMsvcIntrin(unsigned BuiltinID) {
return MSVCIntrin::_BitScanReverse;
case clang::X86::BI_InterlockedAnd64:
return MSVCIntrin::_InterlockedAnd;
case clang::X86::BI_InterlockedCompareExchange128:
return MSVCIntrin::_InterlockedCompareExchange128;
case clang::X86::BI_InterlockedExchange64:
return MSVCIntrin::_InterlockedExchange;
case clang::X86::BI_InterlockedExchangeAdd64:
@@ -1423,6 +1503,15 @@ Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Release);
case MSVCIntrin::_InterlockedCompareExchange_nf:
return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Monotonic);
case MSVCIntrin::_InterlockedCompareExchange128:
return EmitAtomicCmpXchg128ForMSIntrin(
*this, E, AtomicOrdering::SequentiallyConsistent);
case MSVCIntrin::_InterlockedCompareExchange128_acq:
return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Acquire);
case MSVCIntrin::_InterlockedCompareExchange128_rel:
return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Release);
case MSVCIntrin::_InterlockedCompareExchange128_nf:
return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Monotonic);
case MSVCIntrin::_InterlockedOr_acq:
return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
AtomicOrdering::Acquire);
@@ -14032,42 +14121,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
llvm::SyncScope::SingleThread);
}
case X86::BI_InterlockedCompareExchange128: {
// InterlockedCompareExchange128 doesn't directly refer to 128bit ints,
// instead it takes pointers to 64bit ints for Destination and
// ComparandResult, and exchange is taken as two 64bit ints (high & low).
// The previous value is written to ComparandResult, and success is
// returned.
llvm::Type *Int128Ty = Builder.getInt128Ty();
llvm::Type *Int128PtrTy = Int128Ty->getPointerTo();
Value *Destination =
Builder.CreateBitCast(Ops[0], Int128PtrTy);
Value *ExchangeHigh128 = Builder.CreateZExt(Ops[1], Int128Ty);
Value *ExchangeLow128 = Builder.CreateZExt(Ops[2], Int128Ty);
Address ComparandResult(Builder.CreateBitCast(Ops[3], Int128PtrTy),
getContext().toCharUnitsFromBits(128));
Value *Exchange = Builder.CreateOr(
Builder.CreateShl(ExchangeHigh128, 64, "", false, false),
ExchangeLow128);
Value *Comparand = Builder.CreateLoad(ComparandResult);
AtomicCmpXchgInst *CXI =
Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
AtomicOrdering::SequentiallyConsistent,
AtomicOrdering::SequentiallyConsistent);
CXI->setVolatile(true);
// Write the result back to the inout pointer.
Builder.CreateStore(Builder.CreateExtractValue(CXI, 0), ComparandResult);
// Get the success boolean and zero extend it to i8.
Value *Success = Builder.CreateExtractValue(CXI, 1);
return Builder.CreateZExt(Success, ConvertType(E->getType()));
}
case X86::BI_AddressOfReturnAddress: {
Function *F =

clang/lib/Headers/intrin.h

@@ -214,10 +214,6 @@ unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
long _Comparand);
unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_CompareandResult);
unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
@@ -427,6 +423,26 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand);
#endif
#if defined(__x86_64__) || defined(__aarch64__)
unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
#endif
#if defined(__aarch64__)
unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
__int64 _ExchangeHigh,
__int64 _ExchangeLow,
__int64 *_ComparandResult);
#endif
/*----------------------------------------------------------------------------*\
|* movs, stos

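Callers such as the MSVC STL typically wrap these intrinsics in a retry loop; a minimal sketch of a 128-bit atomic store built on the declarations above (hypothetical helper, not part of this commit; assumes a little-endian layout for the two __int64 halves):

static void atomic_store128(__int64 volatile *_Destination, __int64 _High,
                            __int64 _Low) {
  __declspec(align(16)) __int64 Expected[2];
  Expected[0] = _Destination[0]; /* low half on little-endian targets */
  Expected[1] = _Destination[1]; /* high half */
  /* On failure the intrinsic writes the observed value into Expected, so the
     loop simply retries with the refreshed comparand until it succeeds. */
  while (!_InterlockedCompareExchange128(_Destination, _High, _Low, Expected))
    ;
}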
clang/test/CodeGen/ms-intrinsics.c

@@ -6,10 +6,10 @@
// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-ARM,CHECK-ARM-ARM64,CHECK-ARM-X64
// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
// RUN: -triple x86_64--windows -Oz -emit-llvm -target-feature +cx16 %s -o - \
// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-X64,CHECK-ARM-X64,CHECK-INTEL
// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-X64,CHECK-ARM-X64,CHECK-INTEL,CHECK-64
// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
// RUN: -triple aarch64-windows -Oz -emit-llvm %s -o - \
// RUN: | FileCheck %s --check-prefixes CHECK-ARM-ARM64,CHECK-ARM-X64,CHECK-ARM64
// RUN: | FileCheck %s --check-prefixes CHECK-ARM-ARM64,CHECK-ARM-X64,CHECK-ARM64,CHECK-64
// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
// stddef.h. Work around it with this typedef.
@@ -432,32 +432,59 @@ __int64 test_InterlockedCompareExchange64(__int64 volatile *Destination, __int64
// CHECK: ret i64 [[RESULT]]
// CHECK: }
#if defined(__x86_64__)
#if defined(__x86_64__) || defined(__aarch64__)
unsigned char test_InterlockedCompareExchange128(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128(++Destination, ++ExchangeHigh,
++ExchangeLow, ++ComparandResult);
}
// CHECK-X64: define{{.*}}i8 @test_InterlockedCompareExchange128(i64*{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, i64*{{[a-z_ ]*}}%ComparandResult){{.*}}{
// CHECK-X64: %incdec.ptr = getelementptr inbounds i64, i64* %Destination, i64 1
// CHECK-X64: %inc = add nsw i64 %ExchangeHigh, 1
// CHECK-X64: %inc1 = add nsw i64 %ExchangeLow, 1
// CHECK-X64: %incdec.ptr2 = getelementptr inbounds i64, i64* %ComparandResult, i64 1
// CHECK-X64: [[DST:%[0-9]+]] = bitcast i64* %incdec.ptr to i128*
// CHECK-X64: [[EH:%[0-9]+]] = zext i64 %inc to i128
// CHECK-X64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128
// CHECK-X64: [[CNR:%[0-9]+]] = bitcast i64* %incdec.ptr2 to i128*
// CHECK-X64: [[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64
// CHECK-X64: [[EXP:%[0-9]+]] = or i128 [[EHS]], [[EL]]
// CHECK-X64: [[ORG:%[0-9]+]] = load i128, i128* [[CNR]], align 16
// CHECK-X64: [[RES:%[0-9]+]] = cmpxchg volatile i128* [[DST]], i128 [[ORG]], i128 [[EXP]] seq_cst seq_cst
// CHECK-X64: [[OLD:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 0
// CHECK-X64: store i128 [[OLD]], i128* [[CNR]], align 16
// CHECK-X64: [[SUC1:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 1
// CHECK-X64: [[SUC8:%[0-9]+]] = zext i1 [[SUC1]] to i8
// CHECK-X64: ret i8 [[SUC8]]
// CHECK-X64: }
// CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(i64*{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, i64*{{[a-z_ ]*}}%ComparandResult){{.*}}{
// CHECK-64: %incdec.ptr = getelementptr inbounds i64, i64* %Destination, i64 1
// CHECK-64: %inc = add nsw i64 %ExchangeHigh, 1
// CHECK-64: %inc1 = add nsw i64 %ExchangeLow, 1
// CHECK-64: %incdec.ptr2 = getelementptr inbounds i64, i64* %ComparandResult, i64 1
// CHECK-64: [[DST:%[0-9]+]] = bitcast i64* %incdec.ptr to i128*
// CHECK-64: [[CNR:%[0-9]+]] = bitcast i64* %incdec.ptr2 to i128*
// CHECK-64: [[EH:%[0-9]+]] = zext i64 %inc to i128
// CHECK-64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128
// CHECK-64: [[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64
// CHECK-64: [[EXP:%[0-9]+]] = or i128 [[EHS]], [[EL]]
// CHECK-64: [[ORG:%[0-9]+]] = load i128, i128* [[CNR]], align 16
// CHECK-64: [[RES:%[0-9]+]] = cmpxchg volatile i128* [[DST]], i128 [[ORG]], i128 [[EXP]] seq_cst seq_cst
// CHECK-64: [[OLD:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 0
// CHECK-64: store i128 [[OLD]], i128* [[CNR]], align 16
// CHECK-64: [[SUC1:%[0-9]+]] = extractvalue { i128, i1 } [[RES]], 1
// CHECK-64: [[SUC8:%[0-9]+]] = zext i1 [[SUC1]] to i8
// CHECK-64: ret i8 [[SUC8]]
// CHECK-64: }
#endif
#if defined(__aarch64__)
unsigned char test_InterlockedCompareExchange128_acq(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128_acq(Destination, ExchangeHigh,
ExchangeLow, ComparandResult);
}
unsigned char test_InterlockedCompareExchange128_nf(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128_nf(Destination, ExchangeHigh,
ExchangeLow, ComparandResult);
}
unsigned char test_InterlockedCompareExchange128_rel(
__int64 volatile *Destination, __int64 ExchangeHigh,
__int64 ExchangeLow, __int64 *ComparandResult) {
return _InterlockedCompareExchange128_rel(Destination, ExchangeHigh,
ExchangeLow, ComparandResult);
}
// CHECK-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange128_acq({{.*}})
// CHECK-ARM64: cmpxchg volatile i128* %{{.*}}, i128 %{{.*}}, i128 %{{.*}} acquire acquire
// CHECK-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange128_nf({{.*}})
// CHECK-ARM64: cmpxchg volatile i128* %{{.*}}, i128 %{{.*}}, i128 %{{.*}} monotonic monotonic
// CHECK-ARM64: define{{.*}}i8 @test_InterlockedCompareExchange128_rel({{.*}})
// CHECK-ARM64: cmpxchg volatile i128* %{{.*}}, i128 %{{.*}}, i128 %{{.*}} release monotonic
#endif
short test_InterlockedIncrement16(short volatile *Addend) {