[x86] use VPMOVMSK to replace memcmp libcalls for 32-byte equality

Follow-up to:
https://reviews.llvm.org/rL298775

llvm-svn: 298933
Sanjay Patel 2017-03-28 17:23:49 +00:00
3 changed files with 52 additions and 28 deletions
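
For context, the 32-byte equality pattern this commit emits (see the AVX2 check
lines in the updated test below) is equivalent to the following AVX2 intrinsics
sketch. This is an illustration only, not code from the commit; the helper name
bytes_eq32 is hypothetical, and it must be built with AVX2 enabled (e.g. -mavx2).

    #include <immintrin.h>

    // Compare two 32-byte buffers for equality, mirroring the emitted
    // vmovdqu + vpcmpeqb + vpmovmskb + cmpl $-1 sequence.
    static bool bytes_eq32(const char *x, const char *y) {
      __m256i a = _mm256_loadu_si256((const __m256i *)x);  // vmovdqu (%rdi), %ymm0
      __m256i b = _mm256_loadu_si256((const __m256i *)y);
      __m256i eq = _mm256_cmpeq_epi8(a, b);                // vpcmpeqb (%rsi), %ymm0, %ymm0
      // vpmovmskb gathers the high bit of each byte lane into a 32-bit mask;
      // all 32 bytes match exactly when the mask is all-ones, i.e. -1.
      return _mm256_movemask_epi8(eq) == -1;               // cmpl $-1, %eax / sete %al
    }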

lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

@@ -6069,20 +6069,20 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   // supports the MVT we'll be loading or if it is small enough (<= 4) that
   // we'll only produce a small number of byte loads.
   MVT LoadVT;
-  switch (CSize->getZExtValue()) {
+  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+  switch (NumBitsToCompare) {
   default:
     return false;
-  case 2:
+  case 16:
     LoadVT = MVT::i16;
     break;
-  case 4:
+  case 32:
     LoadVT = MVT::i32;
     break;
-  case 8:
-    LoadVT = hasFastLoadsAndCompare(64);
-    break;
-  case 16:
-    LoadVT = hasFastLoadsAndCompare(128);
+  case 64:
+  case 128:
+  case 256:
+    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
     break;
   }
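
Worked through for the new case: a 32-byte memcmp gives NumBitsToCompare = 32 * 8 = 256, which now lands on the shared case 64/128/256 arm and asks the target via hasFastLoadsAndCompare(256). On x86 (next file), that query succeeds when MVT::v32i8 is legal, i.e. when AVX2 is available.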

lib/Target/X86/X86ISelLowering.cpp

@@ -4646,8 +4646,12 @@ MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
     return MVT::v16i8;
 
+  // VPMOVMSKB can handle this.
+  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+    return MVT::v32i8;
+
   // TODO: Allow 64-bit type for 32-bit target.
-  // TODO: 256- and 512-bit types should be allowed, but make sure that those
+  // TODO: 512-bit types should be allowed, but make sure that those
   // cases are handled in combineVectorSizedSetCCEquality().
 
   return MVT::INVALID_SIMPLE_VALUE_TYPE;

test/CodeGen/X86/memcmp.ll

@@ -249,15 +249,25 @@ define i1 @length32(i8* %x, i8* %y) nounwind {
 ; X32-NEXT:    sete %al
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: length32:
-; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
+; SSE2-LABEL: length32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -276,16 +286,26 @@ define i1 @length32_const(i8* %X, i32* nocapture %P) nounwind {
 ; X32-NEXT:    setne %al
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: length32_const:
-; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
+; SSE2-LABEL: length32_const:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $.L.str, %esi
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32_const:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c