From 3afc797e42f8c745cbd4e2049c5e81c64fef5001 Mon Sep 17 00:00:00 2001
From: Kostya Kortchinsky <kostyak@google.com>
Date: Tue, 14 Aug 2018 18:34:52 +0000
Subject: [PATCH] [scudo] Fix race condition in deallocation path when
 Quarantine is bypassed

Summary:
There is a race window in the deallocation path when the Quarantine is bypassed.
Initially we would just erase the header of a chunk if we were not to use the
Quarantine, as opposed to using a compare-exchange primitive, to make things
faster.

It turned out to be a poor decision, as 2 threads (or more) could simultaneously
deallocate the same pointer, and if the checks were to done before the header
got erased, this would result in the pointer being added twice (or more) to
distinct thread caches, and eventually be reused.

Winning the race is not trivial but can happen with enough control over the
allocation primitives. The repro added attempts to trigger the bug, with a
moderate success rate, but it should be enough to notice if the bug ever make
its way back into the code.

Since I am changing things in this file, there are 2 smaller changes tagging
along, marking a variable `const`, and improving the Quarantine bypass test at
runtime.

Reviewers: alekseyshl, eugenis, kcc, vitalybuka

Reviewed By: eugenis, vitalybuka

Subscribers: delcypher, #sanitizers, llvm-commits

Differential Revision: https://reviews.llvm.org/D50655

llvm-svn: 339705
---
 compiler-rt/lib/scudo/scudo_allocator.cpp | 12 ++--
 compiler-rt/test/scudo/dealloc-race.c     | 69 +++++++++++++++++++++++
 2 files changed, 76 insertions(+), 5 deletions(-)
 create mode 100644 compiler-rt/test/scudo/dealloc-race.c
diff --git a/compiler-rt/lib/scudo/scudo_allocator.cpp b/compiler-rt/lib/scudo/scudo_allocator.cpp
index 4a11bf5fcc21..df91fdc80cf5 100644
--- a/compiler-rt/lib/scudo/scudo_allocator.cpp
+++ b/compiler-rt/lib/scudo/scudo_allocator.cpp
@@ -264,7 +264,8 @@ struct Allocator {
     Quarantine.Init(
         static_cast<uptr>(getFlags()->QuarantineSizeKb) << 10,
         static_cast<uptr>(getFlags()->ThreadLocalQuarantineSizeKb) << 10);
-    QuarantineChunksUpToSize = getFlags()->QuarantineChunksUpToSize;
+    QuarantineChunksUpToSize = (Quarantine.GetCacheSize() == 0) ? 0 :
+        getFlags()->QuarantineChunksUpToSize;
     DeallocationTypeMismatch = getFlags()->DeallocationTypeMismatch;
     DeleteSizeMismatch = getFlags()->DeleteSizeMismatch;
     ZeroContents = getFlags()->ZeroContents;
@@ -389,10 +390,11 @@ struct Allocator {
   // quarantine chunk size threshold.
   void quarantineOrDeallocateChunk(void *Ptr, UnpackedHeader *Header,
                                    uptr Size) {
-    const bool BypassQuarantine = (Quarantine.GetCacheSize() == 0) ||
-        (Size > QuarantineChunksUpToSize);
+    const bool BypassQuarantine = !Size || (Size > QuarantineChunksUpToSize);
     if (BypassQuarantine) {
-      Chunk::eraseHeader(Ptr);
+      UnpackedHeader NewHeader = *Header;
+      NewHeader.State = ChunkAvailable;
+      Chunk::compareExchangeHeader(Ptr, &NewHeader, Header);
       void *BackendPtr = Chunk::getBackendPtr(Ptr, Header);
       if (Header->ClassId) {
         bool UnlockRequired;
@@ -675,7 +677,7 @@ void *scudoValloc(uptr Size) {
 }
 
 void *scudoPvalloc(uptr Size) {
-  uptr PageSize = GetPageSizeCached();
+  const uptr PageSize = GetPageSizeCached();
   if (UNLIKELY(CheckForPvallocOverflow(Size, PageSize))) {
     errno = ENOMEM;
     if (Instance.canReturnNull())
diff --git a/compiler-rt/test/scudo/dealloc-race.c b/compiler-rt/test/scudo/dealloc-race.c
new file mode 100644
index 000000000000..326e22f27373
--- /dev/null
+++ b/compiler-rt/test/scudo/dealloc-race.c
@@ -0,0 +1,69 @@
+// RUN: %clang_scudo %s -O2 -o %t
+// RUN: %env_scudo_opts="QuarantineChunksUpToSize=0" %run %t 2>&1
+
+// This test attempts to reproduce a race condition in the deallocation path
+// when bypassing the Quarantine. The old behavior was to zero-out the chunk
+// header after checking its checksum, state & various other things, but that
+// left a window during which 2 (or more) threads could deallocate the same
+// chunk, with a net result of having said chunk present in those distinct
+// thread caches.
+
+// A passing test means all the children died with an error. The failing
+// scenario involves winning a race, so repro can be scarce.
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+const int kNumThreads = 2;
+pthread_t tid[kNumThreads];
+
+pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+char go = 0;
+
+// Frees the pointer passed when signaled to.
+void *thread_free(void *p) {
+  pthread_mutex_lock(&mutex);
+  while (!go)
+    pthread_cond_wait(&cond, &mutex);
+  pthread_mutex_unlock(&mutex);
+  free(p);
+  return 0;
+}
+
+// Allocates a chunk, and attempts to free it "simultaneously" by 2 threads.
+void child(void) {
+  void *p = malloc(16);
+  for (int i = 0; i < kNumThreads; i++)
+    pthread_create(&tid[i], 0, thread_free, p);
+  pthread_mutex_lock(&mutex);
+  go = 1;
+  pthread_cond_broadcast(&cond);
+  pthread_mutex_unlock(&mutex);
+  for (int i = 0; i < kNumThreads; i++)
+    pthread_join(tid[i], 0);
+}
+
+int main(int argc, char** argv) {
+  const int kChildren = 40;
+  pid_t pid;
+  for (int i = 0; i < kChildren; ++i) {
+    pid = fork();
+    if (pid < 0) {
+      exit(1);
+    } else if (pid == 0) {
+      child();
+      exit(0);
+    } else {
+      int status;
+      wait(&status);
+      // A 0 status means the child didn't die with an error. The race was won.
+      if (status == 0)
+        exit(1);
+    }
+  }
+  return 0;
+}