Merge "Revert "Cause Fatal error when invalid pthread_id is detected.""
diff --git a/libc/Android.mk b/libc/Android.mk
index 6f430cc..ebc59de 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -533,6 +533,9 @@
     bionic/pthread_setschedparam.cpp \
     bionic/pthread_sigmask.cpp \
 
+libc_thread_atexit_impl_src_files := \
+    bionic/__cxa_thread_atexit_impl.cpp \
+
 libc_arch_static_src_files := \
     bionic/dl_iterate_phdr_static.cpp \
 
@@ -1002,6 +1005,24 @@
 $(eval $(call patch-up-arch-specific-flags,LOCAL_SRC_FILES,libc_bionic_src_files))
 include $(BUILD_STATIC_LIBRARY)
 
+include $(CLEAR_VARS)
+LOCAL_SRC_FILES := $(libc_thread_atexit_impl_src_files)
+LOCAL_CFLAGS := $(libc_common_cflags) -fno-data-sections -Wframe-larger-than=2048
+
+LOCAL_CONLYFLAGS := $(libc_common_conlyflags)
+LOCAL_CPPFLAGS := $(libc_common_cppflags) -Wold-style-cast
+LOCAL_C_INCLUDES := $(libc_common_c_includes)
+LOCAL_MODULE := libc_thread_atexit_impl
+# TODO: Clang tries to use __tls_get_addr, which is not supported yet;
+# remove this after it is implemented.
+LOCAL_CLANG := false
+LOCAL_ADDITIONAL_DEPENDENCIES := $(libc_common_additional_dependencies)
+LOCAL_CXX_STL := none
+LOCAL_SYSTEM_SHARED_LIBRARIES :=
+LOCAL_ADDRESS_SANITIZER := false
+LOCAL_NATIVE_COVERAGE := $(bionic_coverage)
+
+include $(BUILD_STATIC_LIBRARY)
 
 # ========================================================
 # libc_pthread.a - pthreads parts that previously lived in
@@ -1206,6 +1227,7 @@
     libc_pthread \
     libc_stack_protector \
     libc_syscalls \
+    libc_thread_atexit_impl \
     libc_tzcode \
 
 LOCAL_WHOLE_STATIC_LIBRARIES_arm := libc_aeabi
diff --git a/libc/bionic/__cxa_thread_atexit_impl.cpp b/libc/bionic/__cxa_thread_atexit_impl.cpp
new file mode 100644
index 0000000..9ae6dfd
--- /dev/null
+++ b/libc/bionic/__cxa_thread_atexit_impl.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sys/cdefs.h>
+
+struct thread_local_dtor {
+  void (*func) (void *);
+  void *arg;
+  void *dso_handle; // unused...
+  thread_local_dtor* next;
+};
+
+__thread thread_local_dtor* thread_local_dtors = nullptr;
+
+extern "C" int __cxa_thread_atexit_impl(void (*func) (void *), void *arg, void *dso_handle) {
+  thread_local_dtor* dtor = new thread_local_dtor();
+
+  dtor->func = func;
+  dtor->arg = arg;
+  dtor->dso_handle = dso_handle;
+  dtor->next = thread_local_dtors;
+
+  thread_local_dtors = dtor;
+
+  return 0;
+}
+
+extern "C" __LIBC_HIDDEN__ void __cxa_thread_finalize() {
+  while (thread_local_dtors != nullptr) {
+    thread_local_dtor* current = thread_local_dtors;
+    thread_local_dtors = current->next;
+
+    current->func(current->arg);
+    delete current;
+  }
+}
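
For context, a minimal sketch (not part of this patch) of how the new hook is used: handlers registered with __cxa_thread_atexit_impl run in reverse registration order when the thread exits, which is exactly the LIFO walk __cxa_thread_finalize() performs above. The report/thread_start names below are illustrative.

    #include <pthread.h>
    #include <stdio.h>

    extern "C" int __cxa_thread_atexit_impl(void (*func)(void*), void* arg, void* dso_handle);

    static void report(void* arg) {
      printf("dtor: %s\n", static_cast<const char*>(arg));
    }

    static char first[]  = "registered first";
    static char second[] = "registered second";

    static void* thread_start(void*) {
      __cxa_thread_atexit_impl(report, first, nullptr);
      __cxa_thread_atexit_impl(report, second, nullptr);
      return nullptr;  // thread exit prints "registered second", then "registered first"
    }

    int main() {
      pthread_t t;
      pthread_create(&t, nullptr, thread_start, nullptr);
      pthread_join(t, nullptr);
      return 0;
    }
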
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 3d73d52..66632c4 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -267,6 +267,7 @@
     // Mark the thread detached and replace its start_routine with a no-op.
     // Letting the thread run is the easiest way to clean up its resources.
     atomic_store(&thread->join_state, THREAD_DETACHED);
+    __pthread_internal_add(thread);
     thread->start_routine = __do_nothing;
     pthread_mutex_unlock(&thread->startup_handshake_mutex);
     return init_errno;
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index c2232a9..1de85f5 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -37,6 +37,7 @@
 extern "C" __noreturn void _exit_with_stack_teardown(void*, size_t);
 extern "C" __noreturn void __exit(int);
 extern "C" int __set_tid_address(int*);
+extern "C" void __cxa_thread_finalize();
 
 /* CAVEAT: our implementation of pthread_cleanup_push/pop doesn't support C++ exceptions
  *         and thread cancelation
@@ -59,10 +60,13 @@
 }
 
 void pthread_exit(void* return_value) {
+  // Call dtors for thread_local objects first.
+  __cxa_thread_finalize();
+
   pthread_internal_t* thread = __get_thread();
   thread->return_value = return_value;
 
-  // Call the cleanup handlers first.
+  // Call the cleanup handlers.
   while (thread->cleanup_stack) {
     __pthread_cleanup_t* c = thread->cleanup_stack;
     thread->cleanup_stack = c->__cleanup_prev;
diff --git a/libc/bionic/pthread_mutex.cpp b/libc/bionic/pthread_mutex.cpp
index 24066ae..d2ff1ae 100644
--- a/libc/bionic/pthread_mutex.cpp
+++ b/libc/bionic/pthread_mutex.cpp
@@ -31,6 +31,7 @@
 #include <errno.h>
 #include <limits.h>
 #include <stdatomic.h>
+#include <string.h>
 #include <sys/cdefs.h>
 #include <sys/mman.h>
 #include <unistd.h>
@@ -80,7 +81,7 @@
 #define  MUTEX_STATE_FROM_BITS(v)   FIELD_FROM_BITS(v, MUTEX_STATE_SHIFT, MUTEX_STATE_LEN)
 #define  MUTEX_STATE_TO_BITS(v)     FIELD_TO_BITS(v, MUTEX_STATE_SHIFT, MUTEX_STATE_LEN)
 
-#define  MUTEX_STATE_UNLOCKED            0   /* must be 0 to match __PTHREAD_MUTEX_INIT_VALUE */
+#define  MUTEX_STATE_UNLOCKED            0   /* must be 0 to match PTHREAD_MUTEX_INITIALIZER */
 #define  MUTEX_STATE_LOCKED_UNCONTENDED  1   /* must be 1 due to atomic dec in unlock operation */
 #define  MUTEX_STATE_LOCKED_CONTENDED    2   /* must be 1 + LOCKED_UNCONTENDED due to atomic dec */
 
@@ -122,30 +123,17 @@
 #define  MUTEX_SHARED_MASK     FIELD_MASK(MUTEX_SHARED_SHIFT,1)
 
 /* Mutex type:
- *
  * We support normal, recursive and errorcheck mutexes.
- *
- * The constants defined here *cannot* be changed because they must match
- * the C library ABI which defines the following initialization values in
- * <pthread.h>:
- *
- *   __PTHREAD_MUTEX_INIT_VALUE
- *   __PTHREAD_RECURSIVE_MUTEX_VALUE
- *   __PTHREAD_ERRORCHECK_MUTEX_INIT_VALUE
  */
 #define  MUTEX_TYPE_SHIFT      14
 #define  MUTEX_TYPE_LEN        2
 #define  MUTEX_TYPE_MASK       FIELD_MASK(MUTEX_TYPE_SHIFT,MUTEX_TYPE_LEN)
 
-#define  MUTEX_TYPE_NORMAL          0  /* Must be 0 to match __PTHREAD_MUTEX_INIT_VALUE */
-#define  MUTEX_TYPE_RECURSIVE       1
-#define  MUTEX_TYPE_ERRORCHECK      2
-
 #define  MUTEX_TYPE_TO_BITS(t)       FIELD_TO_BITS(t, MUTEX_TYPE_SHIFT, MUTEX_TYPE_LEN)
 
-#define  MUTEX_TYPE_BITS_NORMAL      MUTEX_TYPE_TO_BITS(MUTEX_TYPE_NORMAL)
-#define  MUTEX_TYPE_BITS_RECURSIVE   MUTEX_TYPE_TO_BITS(MUTEX_TYPE_RECURSIVE)
-#define  MUTEX_TYPE_BITS_ERRORCHECK  MUTEX_TYPE_TO_BITS(MUTEX_TYPE_ERRORCHECK)
+#define  MUTEX_TYPE_BITS_NORMAL      MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_NORMAL)
+#define  MUTEX_TYPE_BITS_RECURSIVE   MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_RECURSIVE)
+#define  MUTEX_TYPE_BITS_ERRORCHECK  MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_ERRORCHECK)
 
 /* Mutex owner field:
  *
@@ -237,55 +225,66 @@
     return 0;
 }
 
-static inline atomic_int* get_mutex_value_pointer(pthread_mutex_t* mutex) {
-    static_assert(sizeof(atomic_int) == sizeof(mutex->value),
-                  "mutex->value should actually be atomic_int in implementation.");
+struct pthread_mutex_internal_t {
+  atomic_int state;
+#if defined(__LP64__)
+  char __reserved[36];
+#endif
+};
 
-    // We prefer casting to atomic_int instead of declaring mutex->value to be atomic_int directly.
-    // Because using the second method pollutes pthread.h, and causes an error when compiling libcxx.
-    return reinterpret_cast<atomic_int*>(&mutex->value);
+static_assert(sizeof(pthread_mutex_t) == sizeof(pthread_mutex_internal_t),
+              "pthread_mutex_t should actually be pthread_mutex_internal_t in implementation.");
+
+// For binary compatibility with old versions of pthread_mutex_t, we can't require stricter
+// alignment than 4 bytes.
+static_assert(alignof(pthread_mutex_t) == 4,
+              "pthread_mutex_t should fulfill the alignment of pthread_mutex_internal_t.");
+
+static inline pthread_mutex_internal_t* __get_internal_mutex(pthread_mutex_t* mutex_interface) {
+  return reinterpret_cast<pthread_mutex_internal_t*>(mutex_interface);
 }
 
-int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* attr) {
-    atomic_int* mutex_value_ptr = get_mutex_value_pointer(mutex);
+int pthread_mutex_init(pthread_mutex_t* mutex_interface, const pthread_mutexattr_t* attr) {
+    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
+
+    memset(mutex, 0, sizeof(pthread_mutex_internal_t));
 
     if (__predict_true(attr == NULL)) {
-        atomic_init(mutex_value_ptr, MUTEX_TYPE_BITS_NORMAL);
+        atomic_init(&mutex->state, MUTEX_TYPE_BITS_NORMAL);
         return 0;
     }
 
-    int value = 0;
+    int state = 0;
     if ((*attr & MUTEXATTR_SHARED_MASK) != 0) {
-        value |= MUTEX_SHARED_MASK;
+        state |= MUTEX_SHARED_MASK;
     }
 
     switch (*attr & MUTEXATTR_TYPE_MASK) {
     case PTHREAD_MUTEX_NORMAL:
-        value |= MUTEX_TYPE_BITS_NORMAL;
+        state |= MUTEX_TYPE_BITS_NORMAL;
         break;
     case PTHREAD_MUTEX_RECURSIVE:
-        value |= MUTEX_TYPE_BITS_RECURSIVE;
+        state |= MUTEX_TYPE_BITS_RECURSIVE;
         break;
     case PTHREAD_MUTEX_ERRORCHECK:
-        value |= MUTEX_TYPE_BITS_ERRORCHECK;
+        state |= MUTEX_TYPE_BITS_ERRORCHECK;
         break;
     default:
         return EINVAL;
     }
 
-    atomic_init(mutex_value_ptr, value);
+    atomic_init(&mutex->state, state);
     return 0;
 }
 
-static inline int __pthread_normal_mutex_trylock(atomic_int* mutex_value_ptr, int shared) {
+static inline __always_inline int __pthread_normal_mutex_trylock(pthread_mutex_internal_t* mutex,
+                                                                 int shared) {
     const int unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
     const int locked_uncontended = shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
 
-    int mvalue = unlocked;
-    if (__predict_true(atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue,
-                                                locked_uncontended,
-                                                memory_order_acquire,
-                                                memory_order_relaxed))) {
+    int old_state = unlocked;
+    if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
+                         locked_uncontended, memory_order_acquire, memory_order_relaxed))) {
         return 0;
     }
     return EBUSY;
@@ -303,9 +302,11 @@
  * "type" value is zero, so the only bits that will be set are the ones in
  * the lock state field.
  */
-static inline int __pthread_normal_mutex_lock(atomic_int* mutex_value_ptr, int shared,
-                                              const timespec* abs_timeout_or_null, clockid_t clock) {
-    if (__predict_true(__pthread_normal_mutex_trylock(mutex_value_ptr, shared) == 0)) {
+static inline __always_inline int __pthread_normal_mutex_lock(pthread_mutex_internal_t* mutex,
+                                                              int shared,
+                                                              const timespec* abs_timeout_or_null,
+                                                              clockid_t clock) {
+    if (__predict_true(__pthread_normal_mutex_trylock(mutex, shared) == 0)) {
         return 0;
     }
 
@@ -316,13 +317,13 @@
 
     // We want to go to sleep until the mutex is available, which requires
     // promoting it to locked_contended. We need to swap in the new state
-    // value and then wait until somebody wakes us up.
+    // and then wait until somebody wakes us up.
     // An atomic_exchange is used to compete with other threads for the lock.
     // If it returns unlocked, we have acquired the lock, otherwise another
     // thread still holds the lock and we should wait again.
     // If lock is acquired, an acquire fence is needed to make all memory accesses
     // made by other threads visible to the current CPU.
-    while (atomic_exchange_explicit(mutex_value_ptr, locked_contended,
+    while (atomic_exchange_explicit(&mutex->state, locked_contended,
                                     memory_order_acquire) != unlocked) {
         timespec ts;
         timespec* rel_timeout = NULL;
@@ -332,7 +333,7 @@
                 return ETIMEDOUT;
             }
         }
-        if (__futex_wait_ex(mutex_value_ptr, shared, locked_contended, rel_timeout) == -ETIMEDOUT) {
+        if (__futex_wait_ex(&mutex->state, shared, locked_contended, rel_timeout) == -ETIMEDOUT) {
             return ETIMEDOUT;
         }
     }
@@ -343,7 +344,8 @@
  * Release a mutex of type NORMAL.  The caller is responsible for determining
  * that we are in fact the owner of this lock.
  */
-static inline void __pthread_normal_mutex_unlock(atomic_int* mutex_value_ptr, int shared) {
+static inline __always_inline void __pthread_normal_mutex_unlock(pthread_mutex_internal_t* mutex,
+                                                                 int shared) {
     const int unlocked         = shared | MUTEX_STATE_BITS_UNLOCKED;
     const int locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
 
@@ -352,7 +354,7 @@
     // one of them.
     // A release fence is required to make previous stores visible to next
     // lock owner threads.
-    if (atomic_exchange_explicit(mutex_value_ptr, unlocked,
+    if (atomic_exchange_explicit(&mutex->state, unlocked,
                                  memory_order_release) == locked_contended) {
         // Wake up one waiting thread. We don't know which thread will be
         // woken or when it'll start executing -- futexes make no guarantees
@@ -372,7 +374,7 @@
         // we call wake, the thread we eventually wake will find an unlocked mutex
         // and will execute. Either way we have correct behavior and nobody is
         // orphaned on the wait queue.
-        __futex_wake_ex(mutex_value_ptr, shared, 1);
+        __futex_wake_ex(&mutex->state, shared, 1);
     }
 }
 
@@ -382,11 +384,12 @@
  * Otherwise, it atomically increments the counter and returns 0.
  *
  */
-static inline int __recursive_increment(atomic_int* mutex_value_ptr, int mvalue) {
+static inline __always_inline int __recursive_increment(pthread_mutex_internal_t* mutex,
+                                                        int old_state) {
     // Detect recursive lock overflow and return EAGAIN.
     // This is safe because only the owner thread can modify the
     // counter bits in the mutex value.
-    if (MUTEX_COUNTER_BITS_WILL_OVERFLOW(mvalue)) {
+    if (MUTEX_COUNTER_BITS_WILL_OVERFLOW(old_state)) {
         return EAGAIN;
     }
 
@@ -395,32 +398,30 @@
     // loop to update the counter. The counter will not overflow in the loop,
     // as only the owner thread can change it.
     // The mutex is still locked, so we don't need a release fence.
-    atomic_fetch_add_explicit(mutex_value_ptr, MUTEX_COUNTER_BITS_ONE, memory_order_relaxed);
+    atomic_fetch_add_explicit(&mutex->state, MUTEX_COUNTER_BITS_ONE, memory_order_relaxed);
     return 0;
 }
 
-static int __pthread_mutex_lock_with_timeout(pthread_mutex_t* mutex,
+static int __pthread_mutex_lock_with_timeout(pthread_mutex_internal_t* mutex,
                                            const timespec* abs_timeout_or_null, clockid_t clock) {
-    atomic_int* mutex_value_ptr = get_mutex_value_pointer(mutex);
+    int old_state, mtype, tid, shared;
 
-    int mvalue, mtype, tid, shared;
-
-    mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
-    mtype = (mvalue & MUTEX_TYPE_MASK);
-    shared = (mvalue & MUTEX_SHARED_MASK);
+    old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    mtype = (old_state & MUTEX_TYPE_MASK);
+    shared = (old_state & MUTEX_SHARED_MASK);
 
     // Handle common case first.
     if ( __predict_true(mtype == MUTEX_TYPE_BITS_NORMAL) ) {
-        return __pthread_normal_mutex_lock(mutex_value_ptr, shared, abs_timeout_or_null, clock);
+        return __pthread_normal_mutex_lock(mutex, shared, abs_timeout_or_null, clock);
     }
 
     // Do we already own this recursive or error-check mutex?
     tid = __get_thread()->tid;
-    if (tid == MUTEX_OWNER_FROM_BITS(mvalue)) {
+    if (tid == MUTEX_OWNER_FROM_BITS(old_state)) {
         if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
             return EDEADLK;
         }
-        return __recursive_increment(mutex_value_ptr, mvalue);
+        return __recursive_increment(mutex, old_state);
     }
 
     const int unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
@@ -429,12 +430,12 @@
 
     // First, if the mutex is unlocked, try to quickly acquire it.
     // In the optimistic case where this works, set the state to locked_uncontended.
-    if (mvalue == unlocked) {
-        int newval = MUTEX_OWNER_TO_BITS(tid) | locked_uncontended;
+    if (old_state == unlocked) {
+        int new_state = MUTEX_OWNER_TO_BITS(tid) | locked_uncontended;
         // If exchanged successfully, an acquire fence is required to make
         // all memory accesses made by other threads visible to the current CPU.
-        if (__predict_true(atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue,
-                           newval, memory_order_acquire, memory_order_relaxed))) {
+        if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
+                             new_state, memory_order_acquire, memory_order_relaxed))) {
             return 0;
         }
     }
@@ -442,33 +443,33 @@
     ScopedTrace trace("Contending for pthread mutex");
 
     while (true) {
-        if (mvalue == unlocked) {
+        if (old_state == unlocked) {
             // NOTE: We set the state to locked_contended since we _know_ there
             // is contention when we are in this loop. This ensures all waiters
             // will be unlocked.
 
-            int newval = MUTEX_OWNER_TO_BITS(tid) | locked_contended;
+            int new_state = MUTEX_OWNER_TO_BITS(tid) | locked_contended;
             // If exchanged successfully, an acquire fence is required to make
             // all memory accesses made by other threads visible to the current CPU.
-            if (__predict_true(atomic_compare_exchange_weak_explicit(mutex_value_ptr,
-                                                                     &mvalue, newval,
+            if (__predict_true(atomic_compare_exchange_weak_explicit(&mutex->state,
+                                                                     &old_state, new_state,
                                                                      memory_order_acquire,
                                                                      memory_order_relaxed))) {
                 return 0;
             }
             continue;
-        } else if (MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(mvalue)) {
+        } else if (MUTEX_STATE_BITS_IS_LOCKED_UNCONTENDED(old_state)) {
             // We should set it to locked_contended before going to sleep. This makes
             // sure waiters will be woken up eventually.
 
-            int newval = MUTEX_STATE_BITS_FLIP_CONTENTION(mvalue);
-            if (__predict_false(!atomic_compare_exchange_weak_explicit(mutex_value_ptr,
-                                                                       &mvalue, newval,
+            int new_state = MUTEX_STATE_BITS_FLIP_CONTENTION(old_state);
+            if (__predict_false(!atomic_compare_exchange_weak_explicit(&mutex->state,
+                                                                       &old_state, new_state,
                                                                        memory_order_relaxed,
                                                                        memory_order_relaxed))) {
                 continue;
             }
-            mvalue = newval;
+            old_state = new_state;
         }
 
         // We are in locked_contended state, sleep until someone wakes us up.
@@ -480,54 +481,54 @@
                 return ETIMEDOUT;
             }
         }
-        if (__futex_wait_ex(mutex_value_ptr, shared, mvalue, rel_timeout) == -ETIMEDOUT) {
+        if (__futex_wait_ex(&mutex->state, shared, old_state, rel_timeout) == -ETIMEDOUT) {
             return ETIMEDOUT;
         }
-        mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
+        old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
     }
 }
 
-int pthread_mutex_lock(pthread_mutex_t* mutex) {
-    atomic_int* mutex_value_ptr = get_mutex_value_pointer(mutex);
+int pthread_mutex_lock(pthread_mutex_t* mutex_interface) {
+    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
 
-    int mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
-    int mtype = (mvalue & MUTEX_TYPE_MASK);
-    int shared = (mvalue & MUTEX_SHARED_MASK);
+    int old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    int mtype = (old_state & MUTEX_TYPE_MASK);
+    int shared = (old_state & MUTEX_SHARED_MASK);
     // Avoid slowing down fast path of normal mutex lock operation.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
-      if (__predict_true(__pthread_normal_mutex_trylock(mutex_value_ptr, shared) == 0)) {
+      if (__predict_true(__pthread_normal_mutex_trylock(mutex, shared) == 0)) {
         return 0;
       }
     }
     return __pthread_mutex_lock_with_timeout(mutex, NULL, 0);
 }
 
-int pthread_mutex_unlock(pthread_mutex_t* mutex) {
-    atomic_int* mutex_value_ptr = get_mutex_value_pointer(mutex);
+int pthread_mutex_unlock(pthread_mutex_t* mutex_interface) {
+    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
 
-    int mvalue, mtype, tid, shared;
+    int old_state, mtype, tid, shared;
 
-    mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
-    mtype  = (mvalue & MUTEX_TYPE_MASK);
-    shared = (mvalue & MUTEX_SHARED_MASK);
+    old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    mtype  = (old_state & MUTEX_TYPE_MASK);
+    shared = (old_state & MUTEX_SHARED_MASK);
 
     // Handle common case first.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
-        __pthread_normal_mutex_unlock(mutex_value_ptr, shared);
+        __pthread_normal_mutex_unlock(mutex, shared);
         return 0;
     }
 
     // Do we already own this recursive or error-check mutex?
     tid = __get_thread()->tid;
-    if ( tid != MUTEX_OWNER_FROM_BITS(mvalue) )
+    if ( tid != MUTEX_OWNER_FROM_BITS(old_state) )
         return EPERM;
 
     // If the counter is > 0, we can simply decrement it atomically.
     // Since other threads can mutate the lower state bits (and only the
     // lower state bits), use a compare_exchange loop to do it.
-    if (!MUTEX_COUNTER_BITS_IS_ZERO(mvalue)) {
+    if (!MUTEX_COUNTER_BITS_IS_ZERO(old_state)) {
         // We still own the mutex, so a release fence is not needed.
-        atomic_fetch_sub_explicit(mutex_value_ptr, MUTEX_COUNTER_BITS_ONE, memory_order_relaxed);
+        atomic_fetch_sub_explicit(&mutex->state, MUTEX_COUNTER_BITS_ONE, memory_order_relaxed);
         return 0;
     }
 
@@ -538,36 +539,36 @@
     // A release fence is required to make previous stores visible to next
     // lock owner threads.
     const int unlocked = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
-    mvalue = atomic_exchange_explicit(mutex_value_ptr, unlocked, memory_order_release);
-    if (MUTEX_STATE_BITS_IS_LOCKED_CONTENDED(mvalue)) {
-        __futex_wake_ex(mutex_value_ptr, shared, 1);
+    old_state = atomic_exchange_explicit(&mutex->state, unlocked, memory_order_release);
+    if (MUTEX_STATE_BITS_IS_LOCKED_CONTENDED(old_state)) {
+        __futex_wake_ex(&mutex->state, shared, 1);
     }
 
     return 0;
 }
 
-int pthread_mutex_trylock(pthread_mutex_t* mutex) {
-    atomic_int* mutex_value_ptr = get_mutex_value_pointer(mutex);
+int pthread_mutex_trylock(pthread_mutex_t* mutex_interface) {
+    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
 
-    int mvalue = atomic_load_explicit(mutex_value_ptr, memory_order_relaxed);
-    int mtype  = (mvalue & MUTEX_TYPE_MASK);
-    int shared = (mvalue & MUTEX_SHARED_MASK);
+    int old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    int mtype  = (old_state & MUTEX_TYPE_MASK);
+    int shared = (old_state & MUTEX_SHARED_MASK);
 
     const int unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
     const int locked_uncontended = mtype | shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
 
     // Handle common case first.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
-        return __pthread_normal_mutex_trylock(mutex_value_ptr, shared);
+        return __pthread_normal_mutex_trylock(mutex, shared);
     }
 
     // Do we already own this recursive or error-check mutex?
     pid_t tid = __get_thread()->tid;
-    if (tid == MUTEX_OWNER_FROM_BITS(mvalue)) {
+    if (tid == MUTEX_OWNER_FROM_BITS(old_state)) {
         if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
             return EBUSY;
         }
-        return __recursive_increment(mutex_value_ptr, mvalue);
+        return __recursive_increment(mutex, old_state);
     }
 
     // Same as pthread_mutex_lock, except that we don't want to wait, and
@@ -575,9 +576,9 @@
     // lock if it is released / not owned by anyone. No need for a complex loop.
     // If exchanged successfully, an acquire fence is required to make
     // all memory accesses made by other threads visible to the current CPU.
-    mvalue = unlocked;
-    int newval = MUTEX_OWNER_TO_BITS(tid) | locked_uncontended;
-    if (__predict_true(atomic_compare_exchange_strong_explicit(mutex_value_ptr, &mvalue, newval,
+    old_state = unlocked;
+    int new_state = MUTEX_OWNER_TO_BITS(tid) | locked_uncontended;
+    if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state, new_state,
                                                                memory_order_acquire,
                                                                memory_order_relaxed))) {
         return 0;
@@ -586,7 +587,7 @@
 }
 
 #if !defined(__LP64__)
-extern "C" int pthread_mutex_lock_timeout_np(pthread_mutex_t* mutex, unsigned ms) {
+extern "C" int pthread_mutex_lock_timeout_np(pthread_mutex_t* mutex_interface, unsigned ms) {
     timespec abs_timeout;
     clock_gettime(CLOCK_MONOTONIC, &abs_timeout);
     abs_timeout.tv_sec  += ms / 1000;
@@ -596,7 +597,8 @@
         abs_timeout.tv_nsec -= NS_PER_S;
     }
 
-    int error = __pthread_mutex_lock_with_timeout(mutex, &abs_timeout, CLOCK_MONOTONIC);
+    int error = __pthread_mutex_lock_with_timeout(__get_internal_mutex(mutex_interface),
+                                                  &abs_timeout, CLOCK_MONOTONIC);
     if (error == ETIMEDOUT) {
         error = EBUSY;
     }
@@ -604,18 +606,19 @@
 }
 #endif
 
-int pthread_mutex_timedlock(pthread_mutex_t* mutex, const timespec* abs_timeout) {
-    return __pthread_mutex_lock_with_timeout(mutex, abs_timeout, CLOCK_REALTIME);
+int pthread_mutex_timedlock(pthread_mutex_t* mutex_interface, const timespec* abs_timeout) {
+    return __pthread_mutex_lock_with_timeout(__get_internal_mutex(mutex_interface),
+                                             abs_timeout, CLOCK_REALTIME);
 }
 
-int pthread_mutex_destroy(pthread_mutex_t* mutex) {
+int pthread_mutex_destroy(pthread_mutex_t* mutex_interface) {
     // Use trylock to ensure that the mutex is valid and not already locked.
-    int error = pthread_mutex_trylock(mutex);
+    int error = pthread_mutex_trylock(mutex_interface);
     if (error != 0) {
         return error;
     }
 
-    atomic_int* mutex_value_ptr = get_mutex_value_pointer(mutex);
-    atomic_store_explicit(mutex_value_ptr, 0xdead10cc, memory_order_relaxed);
+    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
+    atomic_store_explicit(&mutex->state, 0xdead10cc, memory_order_relaxed);
     return 0;
 }
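
The refactoring above keeps the public pthread_mutex_t an opaque array of words while the implementation reinterprets it as a struct whose first member is the atomic state word. A minimal sketch of that pattern, with illustrative names (public_mutex_t, internal_mutex_t) and assuming a platform where std::atomic<int> is four bytes with four-byte alignment:

    #include <atomic>
    #include <cstdint>

    typedef struct { int32_t __private[1]; } public_mutex_t;  // shaped like the 32-bit pthread_mutex_t

    struct internal_mutex_t {
      std::atomic<int> state;
    };

    // Compatibility checks in the spirit of the patch's static_asserts: identical size,
    // and the internal view must not require stricter alignment than the public type.
    static_assert(sizeof(public_mutex_t) == sizeof(internal_mutex_t),
                  "public and internal views must have the same size");
    static_assert(alignof(internal_mutex_t) <= alignof(public_mutex_t),
                  "internal view must not need stricter alignment");

    static inline internal_mutex_t* get_internal(public_mutex_t* m) {
      return reinterpret_cast<internal_mutex_t*>(m);
    }

The benefit is the one the removed comment alluded to: the public header stays free of atomic types, while the implementation gets a properly typed atomic field instead of a reinterpret_cast at every use site.
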
diff --git a/libc/include/pthread.h b/libc/include/pthread.h
index 234a43d..83a56d6 100644
--- a/libc/include/pthread.h
+++ b/libc/include/pthread.h
@@ -36,30 +36,15 @@
 #include <sys/types.h>
 #include <time.h>
 
-#if defined(__LP64__)
-  #define __RESERVED_INITIALIZER , {0}
-#else
-  #define __RESERVED_INITIALIZER
-#endif
-
 typedef struct {
-  int value;
-#ifdef __LP64__
-  char __reserved[36];
+#if defined(__LP64__)
+  int32_t __private[10];
+#else
+  int32_t __private[1];
 #endif
 } pthread_mutex_t;
 
-#define  __PTHREAD_MUTEX_INIT_VALUE            0
-#define  __PTHREAD_RECURSIVE_MUTEX_INIT_VALUE  0x4000
-#define  __PTHREAD_ERRORCHECK_MUTEX_INIT_VALUE 0x8000
-
-#define PTHREAD_MUTEX_INITIALIZER {__PTHREAD_MUTEX_INIT_VALUE __RESERVED_INITIALIZER}
-#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP {__PTHREAD_ERRORCHECK_MUTEX_INIT_VALUE __RESERVED_INITIALIZER}
-#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {__PTHREAD_RECURSIVE_MUTEX_INIT_VALUE __RESERVED_INITIALIZER}
-
-/* TODO: remove this namespace pollution. */
-#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
-#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP
+typedef long pthread_mutexattr_t;
 
 enum {
     PTHREAD_MUTEX_NORMAL = 0,
@@ -72,28 +57,31 @@
     PTHREAD_MUTEX_DEFAULT = PTHREAD_MUTEX_NORMAL
 };
 
+#define PTHREAD_MUTEX_INITIALIZER { { ((PTHREAD_MUTEX_NORMAL & 3) << 14) } }
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP { { ((PTHREAD_MUTEX_RECURSIVE & 3) << 14) } }
+#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP { { ((PTHREAD_MUTEX_ERRORCHECK & 3) << 14) } }
+
 typedef struct {
 #if defined(__LP64__)
-  char __private[48];
+  int32_t __private[12];
 #else
-  char __private[4];
+  int32_t __private[1];
 #endif
-} pthread_cond_t __attribute__((aligned(4)));
+} pthread_cond_t;
+
+typedef long pthread_condattr_t;
 
 #define PTHREAD_COND_INITIALIZER  { { 0 } }
 
-typedef long pthread_mutexattr_t;
-typedef long pthread_condattr_t;
-
-typedef long pthread_rwlockattr_t;
-
 typedef struct {
 #if defined(__LP64__)
-  char __private[56];
+  int32_t __private[14];
 #else
-  char __private[40];
+  int32_t __private[10];
 #endif
-} pthread_rwlock_t __attribute__((aligned(4)));
+} pthread_rwlock_t;
+
+typedef long pthread_rwlockattr_t;
 
 #define PTHREAD_RWLOCK_INITIALIZER  { { 0 } }
 
diff --git a/libc/stdio/fileext.h b/libc/stdio/fileext.h
index 209815a..6cacc0f 100644
--- a/libc/stdio/fileext.h
+++ b/libc/stdio/fileext.h
@@ -61,7 +61,11 @@
 	_UB(fp)._base = NULL; \
 	_UB(fp)._size = 0; \
 	WCIO_INIT(fp); \
-	_FLOCK(fp).value = __PTHREAD_RECURSIVE_MUTEX_INIT_VALUE; \
+	pthread_mutexattr_t attr; \
+	pthread_mutexattr_init(&attr); \
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); \
+	pthread_mutex_init(&_FLOCK(fp), &attr); \
+	pthread_mutexattr_destroy(&attr); \
 	_EXT(fp)->_stdio_handles_locking = true; \
 } while (0)
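
For readers unfamiliar with the pattern the macro now uses, here is the same initialization in plain form, using only standard POSIX calls: the FILE lock becomes a recursive mutex built through a mutexattr rather than a private static init value.

    #include <pthread.h>

    static void init_recursive_mutex(pthread_mutex_t* m) {
      pthread_mutexattr_t attr;
      pthread_mutexattr_init(&attr);
      pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
      pthread_mutex_init(m, &attr);      // the attribute is copied into the mutex,
      pthread_mutexattr_destroy(&attr);  // so it can be destroyed right away
    }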
 
diff --git a/tests/Android.mk b/tests/Android.mk
index 0a83e84..8804b71 100644
--- a/tests/Android.mk
+++ b/tests/Android.mk
@@ -258,10 +258,12 @@
     libtinyxml2 \
     liblog \
 
+# TODO: Include __cxa_thread_atexit_test.cpp in the glibc tests once glibc is upgraded (to 2.18+).
 bionic-unit-tests_src_files := \
     atexit_test.cpp \
     dl_test.cpp \
     dlext_test.cpp \
+    __cxa_thread_atexit_test.cpp \
     dlfcn_test.cpp \
 
 bionic-unit-tests_cflags := $(test_cflags)
diff --git a/tests/__cxa_thread_atexit_test.cpp b/tests/__cxa_thread_atexit_test.cpp
new file mode 100644
index 0000000..0177314
--- /dev/null
+++ b/tests/__cxa_thread_atexit_test.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <stdint.h>
+
+#include <string>
+
+extern "C" int __cxa_thread_atexit_impl(void (*fn)(void*), void* arg, void* dso_handle);
+
+static void thread_atexit_fn1(void* arg) {
+  std::string* call_sequence = static_cast<std::string*>(arg);
+  *call_sequence += "one, ";
+}
+
+static void thread_atexit_fn2(void* arg) {
+  std::string* call_sequence = static_cast<std::string*>(arg);
+  *call_sequence += "two, ";
+}
+
+static void thread_atexit_from_atexit(void* arg) {
+  std::string* call_sequence = static_cast<std::string*>(arg);
+  *call_sequence += "oops, ";
+}
+
+static void thread_atexit_fn3(void* arg) {
+  __cxa_thread_atexit_impl(thread_atexit_from_atexit, arg, nullptr);
+  std::string* call_sequence = static_cast<std::string*>(arg);
+  *call_sequence += "three, ";
+}
+
+static void thread_atexit_fn4(void* arg) {
+  std::string* call_sequence = static_cast<std::string*>(arg);
+  *call_sequence += "four, ";
+}
+
+static void thread_atexit_fn5(void* arg) {
+  std::string* call_sequence = static_cast<std::string*>(arg);
+  *call_sequence += "five.";
+}
+
+static void* thread_main(void* arg) {
+  __cxa_thread_atexit_impl(thread_atexit_fn5, arg, nullptr);
+  __cxa_thread_atexit_impl(thread_atexit_fn4, arg, nullptr);
+  __cxa_thread_atexit_impl(thread_atexit_fn3, arg, nullptr);
+  __cxa_thread_atexit_impl(thread_atexit_fn2, arg, nullptr);
+  __cxa_thread_atexit_impl(thread_atexit_fn1, arg, nullptr);
+  return nullptr;
+}
+
+TEST(__cxa_thread_atexit_impl, smoke) {
+  std::string atexit_call_sequence;
+
+  pthread_t t;
+  ASSERT_EQ(0, pthread_create(&t, nullptr, thread_main, &atexit_call_sequence));
+  ASSERT_EQ(0, pthread_join(t, nullptr));
+  ASSERT_EQ("one, two, three, oops, four, five.", atexit_call_sequence);
+}
+
+
diff --git a/tests/pthread_test.cpp b/tests/pthread_test.cpp
index 4eb352d..13d743f 100644
--- a/tests/pthread_test.cpp
+++ b/tests/pthread_test.cpp
@@ -16,10 +16,6 @@
 
 #include <gtest/gtest.h>
 
-#include "private/ScopeGuard.h"
-#include "BionicDeathTest.h"
-#include "ScopedSignalHandler.h"
-
 #include <errno.h>
 #include <inttypes.h>
 #include <limits.h>
@@ -35,6 +31,11 @@
 #include <atomic>
 #include <vector>
 
+#include "private/bionic_macros.h"
+#include "private/ScopeGuard.h"
+#include "BionicDeathTest.h"
+#include "ScopedSignalHandler.h"
+
 TEST(pthread, pthread_key_create) {
   pthread_key_t key;
   ASSERT_EQ(0, pthread_key_create(&key, NULL));
@@ -1189,54 +1190,84 @@
   ASSERT_EQ(0, pthread_mutexattr_destroy(&attr));
 }
 
-static void CreateMutex(pthread_mutex_t& mutex, int mutex_type) {
-  pthread_mutexattr_t attr;
-  ASSERT_EQ(0, pthread_mutexattr_init(&attr));
-  ASSERT_EQ(0, pthread_mutexattr_settype(&attr, mutex_type));
-  ASSERT_EQ(0, pthread_mutex_init(&mutex, &attr));
-  ASSERT_EQ(0, pthread_mutexattr_destroy(&attr));
-}
+struct PthreadMutex {
+  pthread_mutex_t lock;
+
+  PthreadMutex(int mutex_type) {
+    init(mutex_type);
+  }
+
+  ~PthreadMutex() {
+    destroy();
+  }
+
+ private:
+  void init(int mutex_type) {
+    pthread_mutexattr_t attr;
+    ASSERT_EQ(0, pthread_mutexattr_init(&attr));
+    ASSERT_EQ(0, pthread_mutexattr_settype(&attr, mutex_type));
+    ASSERT_EQ(0, pthread_mutex_init(&lock, &attr));
+    ASSERT_EQ(0, pthread_mutexattr_destroy(&attr));
+  }
+
+  void destroy() {
+    ASSERT_EQ(0, pthread_mutex_destroy(&lock));
+  }
+
+  DISALLOW_COPY_AND_ASSIGN(PthreadMutex);
+};
 
 TEST(pthread, pthread_mutex_lock_NORMAL) {
-  pthread_mutex_t lock;
-  CreateMutex(lock, PTHREAD_MUTEX_NORMAL);
+  PthreadMutex m(PTHREAD_MUTEX_NORMAL);
 
-  ASSERT_EQ(0, pthread_mutex_lock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_destroy(&lock));
+  ASSERT_EQ(0, pthread_mutex_lock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_unlock(&m.lock));
 }
 
 TEST(pthread, pthread_mutex_lock_ERRORCHECK) {
-  pthread_mutex_t lock;
-  CreateMutex(lock, PTHREAD_MUTEX_ERRORCHECK);
+  PthreadMutex m(PTHREAD_MUTEX_ERRORCHECK);
 
-  ASSERT_EQ(0, pthread_mutex_lock(&lock));
-  ASSERT_EQ(EDEADLK, pthread_mutex_lock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_trylock(&lock));
-  ASSERT_EQ(EBUSY, pthread_mutex_trylock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(EPERM, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_destroy(&lock));
+  ASSERT_EQ(0, pthread_mutex_lock(&m.lock));
+  ASSERT_EQ(EDEADLK, pthread_mutex_lock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_unlock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_trylock(&m.lock));
+  ASSERT_EQ(EBUSY, pthread_mutex_trylock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_unlock(&m.lock));
+  ASSERT_EQ(EPERM, pthread_mutex_unlock(&m.lock));
 }
 
 TEST(pthread, pthread_mutex_lock_RECURSIVE) {
-  pthread_mutex_t lock;
-  CreateMutex(lock, PTHREAD_MUTEX_RECURSIVE);
+  PthreadMutex m(PTHREAD_MUTEX_RECURSIVE);
 
-  ASSERT_EQ(0, pthread_mutex_lock(&lock));
-  ASSERT_EQ(0, pthread_mutex_lock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_trylock(&lock));
-  ASSERT_EQ(0, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(EPERM, pthread_mutex_unlock(&lock));
-  ASSERT_EQ(0, pthread_mutex_destroy(&lock));
+  ASSERT_EQ(0, pthread_mutex_lock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_lock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_unlock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_unlock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_trylock(&m.lock));
+  ASSERT_EQ(0, pthread_mutex_unlock(&m.lock));
+  ASSERT_EQ(EPERM, pthread_mutex_unlock(&m.lock));
+}
+
+TEST(pthread, pthread_mutex_init_same_as_static_initializers) {
+  pthread_mutex_t lock_normal = PTHREAD_MUTEX_INITIALIZER;
+  PthreadMutex m1(PTHREAD_MUTEX_NORMAL);
+  ASSERT_EQ(0, memcmp(&lock_normal, &m1.lock, sizeof(pthread_mutex_t)));
+  pthread_mutex_destroy(&lock_normal);
+
+  pthread_mutex_t lock_errorcheck = PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP;
+  PthreadMutex m2(PTHREAD_MUTEX_ERRORCHECK);
+  ASSERT_EQ(0, memcmp(&lock_errorcheck, &m2.lock, sizeof(pthread_mutex_t)));
+  pthread_mutex_destroy(&lock_errorcheck);
+
+  pthread_mutex_t lock_recursive = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+  PthreadMutex m3(PTHREAD_MUTEX_RECURSIVE);
+  ASSERT_EQ(0, memcmp(&lock_recursive, &m3.lock, sizeof(pthread_mutex_t)));
+  ASSERT_EQ(0, pthread_mutex_destroy(&lock_recursive));
 }
 
 class MutexWakeupHelper {
  private:
-  pthread_mutex_t mutex;
+  PthreadMutex m;
   enum Progress {
     LOCK_INITIALIZED,
     LOCK_WAITING,
@@ -1249,17 +1280,19 @@
     ASSERT_EQ(LOCK_INITIALIZED, helper->progress);
     helper->progress = LOCK_WAITING;
 
-    ASSERT_EQ(0, pthread_mutex_lock(&helper->mutex));
+    ASSERT_EQ(0, pthread_mutex_lock(&helper->m.lock));
     ASSERT_EQ(LOCK_RELEASED, helper->progress);
-    ASSERT_EQ(0, pthread_mutex_unlock(&helper->mutex));
+    ASSERT_EQ(0, pthread_mutex_unlock(&helper->m.lock));
 
     helper->progress = LOCK_ACCESSED;
   }
 
  public:
-  void test(int mutex_type) {
-    CreateMutex(mutex, mutex_type);
-    ASSERT_EQ(0, pthread_mutex_lock(&mutex));
+  MutexWakeupHelper(int mutex_type) : m(mutex_type) {
+  }
+
+  void test() {
+    ASSERT_EQ(0, pthread_mutex_lock(&m.lock));
     progress = LOCK_INITIALIZED;
 
     pthread_t thread;
@@ -1271,27 +1304,26 @@
     }
     usleep(5000);
     progress = LOCK_RELEASED;
-    ASSERT_EQ(0, pthread_mutex_unlock(&mutex));
+    ASSERT_EQ(0, pthread_mutex_unlock(&m.lock));
 
     ASSERT_EQ(0, pthread_join(thread, NULL));
     ASSERT_EQ(LOCK_ACCESSED, progress);
-    ASSERT_EQ(0, pthread_mutex_destroy(&mutex));
   }
 };
 
 TEST(pthread, pthread_mutex_NORMAL_wakeup) {
-  MutexWakeupHelper helper;
-  helper.test(PTHREAD_MUTEX_NORMAL);
+  MutexWakeupHelper helper(PTHREAD_MUTEX_NORMAL);
+  helper.test();
 }
 
 TEST(pthread, pthread_mutex_ERRORCHECK_wakeup) {
-  MutexWakeupHelper helper;
-  helper.test(PTHREAD_MUTEX_ERRORCHECK);
+  MutexWakeupHelper helper(PTHREAD_MUTEX_ERRORCHECK);
+  helper.test();
 }
 
 TEST(pthread, pthread_mutex_RECURSIVE_wakeup) {
-  MutexWakeupHelper helper;
-  helper.test(PTHREAD_MUTEX_RECURSIVE);
+  MutexWakeupHelper helper(PTHREAD_MUTEX_RECURSIVE);
+  helper.test();
 }
 
 TEST(pthread, pthread_mutex_owner_tid_limit) {
diff --git a/tools/relocation_packer/README.TXT b/tools/relocation_packer/README.TXT
deleted file mode 100644
index 071ab5d..0000000
--- a/tools/relocation_packer/README.TXT
+++ /dev/null
@@ -1,135 +0,0 @@
-Introduction:
--------------
-
-Relative relocations are the bulk of dynamic relocations (the .rel.dyn
-or .rela.dyn sections) in libchrome.<version>.so.  The ELF standard
-representation of them is wasteful.
-
-Packing uses a combination of run length encoding, delta encoding, and LEB128
-encoding to store them more efficiently.  Packed relocations are placed in
-a new .android.rel.dyn or .android.rela.dyn section.  Packing reduces
-the footprint of libchrome.<version>.so in the filesystem, in APK downloads,
-and in memory when loaded on the device.
-
-A packed libchrome.<version>.so is designed so that it can be loaded directly
-on Android, but requires the explicit support of a crazy linker that has been
-extended to understand packed relocations.  Packed relocations are currently
-only supported on ARM.
-
-A packed libchrome.<version>.so cannot currently be used with the standard
-Android runtime linker.
-
-See src/*.h for design and implementation notes.
-
-
-Notes:
-------
-
-Packing does not adjust debug data.  An unstripped libchrome.<version>.so
-can be packed and will run, but may no longer be useful for debugging.
-
-Unpacking on the device requires the explicit support of an extended crazy
-linker.  Adds the following new .dynamic tags, used by the crazy linker to
-find the packed .android.rel.dyn or .android.rela.dyn section data:
-
-  DT_ANDROID_REL_OFFSET = DT_LOOS    (Operating System specific: 0x6000000d)
-    - The offset of packed relocation data in libchrome.<version>.so
-  DT_ANDROID_REL_SIZE = DT_LOOS + 1  (Operating System Specific: 0x6000000e)
-    - The size of packed relocation data in bytes
-
-32 bit ARM libraries use relocations without addends.  64 bit ARM libraries
-use relocations with addends.  The packing strategy necessarily differs for
-the two relocation types.
-
-Where libchrome.<version>.so contains relocations without addends, the format
-of .android.rel.dyn data is:
-
-  "APR1" identifier
-  N: the number of count-delta pairs in the encoding
-  A: the initial offset
-  N * C,D: N count-delta pairs
-
-Where libchrome.<version>.so contains relocations with addends, the format
-of .android.rela.dyn data is:
-
-  "APA1" identifier
-  N: the number of addr-addend delta pairs in the encoding
-  N * A,V: N addr-addend delta pairs
-
-All numbers in the encoding stream are stored as LEB128 values.  For details
-see http://en.wikipedia.org/wiki/LEB128.
-
-The streaming unpacking algorithm for 32 bit ARM is:
-
-  skip over "APR1"
-  pairs, addr = next leb128 value, next leb128 value
-  emit R_ARM_RELATIVE relocation with r_offset = addr
-  while pairs:
-    count, delta = next leb128 value, next leb128 value
-    while count:
-      addr += delta
-      emit R_ARM_RELATIVE relocation with r_offset = addr
-      count--
-    pairs--
-
-The streaming unpacking algorithm for 64 bit ARM is:
-
-  skip over "APA1"
-  pairs = next signed leb128 value
-  addr, addend = 0, 0
-  while pairs:
-    addr += next signed leb128 value
-    addend += next signed leb128 value
-    emit R_AARCH64_RELATIVE relocation with r_offset = addr, r_addend = addend
-    pairs--
-
-
-Usage instructions:
--------------------
-
-To pack relocations, add an empty .android.rel.dyn or .android.rela.dyn and
-then run the tool:
-
-    echo -n 'NULL' >/tmp/small
-    if file libchrome.<version>.so | grep -q 'ELF 32'; then
-      arm-linux-androideabi-objcopy
-          --add-section .android.rel.dyn=/tmp/small
-          libchrome.<version>.so libchrome.<version>.so.packed
-    else
-      aarch64-linux-android-objcopy
-          --add-section .android.rela.dyn=/tmp/small
-          libchrome.<version>.so libchrome.<version>.so.packed
-    fi
-    rm /tmp/small
-    relocation_packer libchrome.<version>.so.packed
-
-To unpack and restore the shared library to its original state:
-
-    cp libchrome.<version>.so.packed unpackable
-    relocation_packer -u unpackable
-    if file libchrome.<version>.so | grep -q 'ELF 32'; then
-      arm-linux-androideabi-objcopy \
-          --remove-section=.android.rel.dyn unpackable libchrome.<version>.so
-    else
-      aarch64-linux-android-objcopy \
-          --remove-section=.android.rela.dyn unpackable libchrome.<version>.so
-    endif
-    rm unpackable
-
-
-Bugs & TODOs:
--------------
-
-Requires two free slots in the .dynamic section.  Uses these to add data that
-tells the crazy linker where to find the packed relocation data.  Fails
-if insufficient free slots exist (use gold --spare-dynamic-slots to increase
-the allocation).
-
-Requires libelf 0.158 or later.  Earlier libelf releases may be buggy in
-ways that prevent the packer from working correctly.
-
-
-Testing:
---------
-
-Unittests run under gtest, on the host system.
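
For reference, a small sketch (illustrative, not taken from the packer sources) of the unsigned LEB128 decode step that the streaming unpack algorithms described above rely on: each byte carries seven payload bits, least-significant group first, and a set high bit means more bytes follow (for example, 624485 encodes as the bytes e5 8e 26).

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    static uint64_t decode_uleb128(const std::vector<uint8_t>& buf, size_t* pos) {
      uint64_t value = 0;
      int shift = 0;
      while (*pos < buf.size()) {
        uint8_t byte = buf[(*pos)++];
        value |= static_cast<uint64_t>(byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) {
          break;  // high bit clear: this was the last byte of the value
        }
        shift += 7;
      }
      return value;
    }
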
diff --git a/tools/relocation_packer/src/elf_file.h b/tools/relocation_packer/src/elf_file.h
index 73c3192..a749d50 100644
--- a/tools/relocation_packer/src/elf_file.h
+++ b/tools/relocation_packer/src/elf_file.h
@@ -4,53 +4,16 @@
 
 // ELF shared object file updates handler.
 //
-// Provides functions to remove relative relocations from the .rel.dyn
-// or .rela.dyn sections and pack into .android.rel.dyn or .android.rela.dyn,
-// and unpack to return the file to its pre-packed state.
-//
-// Files to be packed or unpacked must include an existing .android.rel.dyn
-// or android.rela.dyn section.  A standard libchrome.<version>.so will not
-// contain this section, so the following can be used to add one:
-//
-//   echo -n 'NULL' >/tmp/small
-//   if file libchrome.<version>.so | grep -q 'ELF 32'; then
-//     arm-linux-androideabi-objcopy
-//         --add-section .android.rel.dyn=/tmp/small
-//         libchrome.<version>.so libchrome.<version>.so.packed
-//   else
-//     aarch64-linux-android-objcopy
-//         --add-section .android.rela.dyn=/tmp/small
-//         libchrome.<version>.so libchrome.<version>.so.packed
-//   fi
-//   rm /tmp/small
-//
-// To use, open the file and pass the file descriptor to the constructor,
-// then pack or unpack as desired.  Packing or unpacking will flush the file
-// descriptor on success.  Example:
-//
-//   int fd = open(..., O_RDWR);
-//   ElfFile elf_file(fd);
-//   bool status;
-//   if (is_packing)
-//     status = elf_file.PackRelocations();
-//   else
-//     status = elf_file.UnpackRelocations();
-//   close(fd);
+// Provides functions to pack relocations in the .rel.dyn or .rela.dyn
+// sections, and unpack to return the file to its pre-packed state.
 //
 // SetPadding() causes PackRelocations() to pad .rel.dyn or .rela.dyn with
 // NONE-type entries rather than cutting a hole out of the shared object
 // file.  This keeps all load addresses and offsets constant, and enables
 // easier debugging and testing.
 //
-// A packed shared object file has all of its relative relocations
-// removed from .rel.dyn or .rela.dyn, and replaced as packed data in
-// .android.rel.dyn or .android.rela.dyn respectively.  The resulting file
-// is shorter than its non-packed original.
-//
-// Unpacking a packed file restores the file to its non-packed state, by
-// expanding the packed data in .android.rel.dyn or .android.rela.dyn,
-// combining the relative relocations with the data already in .rel.dyn
-// or .rela.dyn, and then writing back the now expanded section.
+// A packed shared object file is shorter than its non-packed original.
+// Unpacking a packed file restores the file to its non-packed state.
 
 #ifndef TOOLS_RELOCATION_PACKER_SRC_ELF_FILE_H_
 #define TOOLS_RELOCATION_PACKER_SRC_ELF_FILE_H_
diff --git a/tools/relocation_packer/src/leb128.h b/tools/relocation_packer/src/leb128.h
index 2c5b5d0..67fc4b8 100644
--- a/tools/relocation_packer/src/leb128.h
+++ b/tools/relocation_packer/src/leb128.h
@@ -4,9 +4,8 @@
 
 // LEB128 encoder and decoder for packed relative relocations.
 //
-// Run-length encoded relative relocations consist of a large number
-// of pairs of relatively small positive integer values.  Encoding these as
-// LEB128 saves space.
+// Packed relocations consist of a large number of relatively small
+// integer values.  Encoding these as LEB128 saves space.
 //
 // For more on LEB128 see http://en.wikipedia.org/wiki/LEB128.
 
diff --git a/tools/relocation_packer/src/main.cc b/tools/relocation_packer/src/main.cc
index 3f784e4..8e9de6d 100644
--- a/tools/relocation_packer/src/main.cc
+++ b/tools/relocation_packer/src/main.cc
@@ -4,9 +4,6 @@
 
 // Tool to pack and unpack relative relocations in a shared library.
 //
-// Packing removes relative relocations from .rel.dyn and writes them
-// in a more compact form to .android.rel.dyn.  Unpacking does the reverse.
-//
 // Invoke with -v to trace actions taken when packing or unpacking.
 // Invoke with -p to pad removed relocations with R_*_NONE.  Suppresses
 // shrinking of .rel.dyn.
diff --git a/tools/relocation_packer/src/packer.h b/tools/relocation_packer/src/packer.h
index 8a57e62..63f50e2 100644
--- a/tools/relocation_packer/src/packer.h
+++ b/tools/relocation_packer/src/packer.h
@@ -3,43 +3,6 @@
 // found in the LICENSE file.
 
 // Pack relative relocations into a more compact form.
-//
-//
-// For relative relocations without addends (32 bit platforms)
-// -----------------------------------------------------------
-//
-// Applies two packing strategies.  The first is run-length encoding, which
-// turns a large set of relative relocations into a much smaller set
-// of delta-count pairs, prefixed with a two-word header comprising the
-// count of pairs and the initial relocation offset.  The second is LEB128
-// encoding, which compresses the result of run-length encoding.
-//
-// Once packed, data is prefixed by an identifier that allows for any later
-// versioning of packing strategies.
-//
-// A complete packed stream of relocations without addends might look
-// something like:
-//
-//   "APR1"   pairs  init_offset count1 delta1 count2 delta2 ...
-//   41505231 f2b003 b08ac716    e001   04     01     10     ...
-//
-//
-// For relative relocations with addends (64 bit platforms)
-// --------------------------------------------------------
-//
-// Applies two packing strategies.  The first is delta encoding, which
-// turns a large set of relative relocations into a smaller set
-// of offset and addend delta pairs, prefixed with a header indicating the
-// count of pairs.  The second is signed LEB128 encoding, which compacts
-// the result of delta encoding.
-//
-// Once packed, data is prefixed by an identifier that allows for any later
-// versioning of packing strategies.
-//
-// A complete packed stream might look something like:
-//
-//   "APA1"   pairs  offset_d1 addend_d1 offset_d2 addend_d2 ...
-//   41505232 f2b018 04        28        08        9f01      ...
 
 #ifndef TOOLS_RELOCATION_PACKER_SRC_PACKER_H_
 #define TOOLS_RELOCATION_PACKER_SRC_PACKER_H_
diff --git a/tools/relocation_packer/src/run_length_encoder.h b/tools/relocation_packer/src/run_length_encoder.h
deleted file mode 100644
index f3a80e6..0000000
--- a/tools/relocation_packer/src/run_length_encoder.h
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Run-length encode and decode relative relocations.
-//
-// Relative relocations are the bulk of dynamic relocations (the
-// .rel.dyn or .rela.dyn sections) in libchrome.<version>.so, and the ELF
-// standard representation of them is wasteful.  .rel.dyn contains
-// relocations without addends, .rela.dyn relocations with addends.
-//
-// A relocation with no addend is 8 bytes on 32 bit platforms and 16 bytes
-// on 64 bit plaforms, split into offset and info fields.  Offsets strictly
-// increase, and each is commonly a few bytes different from its predecessor.
-// There are long runs where the difference does not change.  The info field
-// is constant.  Example, from 'readelf -x4 libchrome.<version>.so' 32 bit:
-//
-//   offset   info     offset   info
-//   808fef01 17000000 848fef01 17000000 ................
-//   888fef01 17000000 8c8fef01 17000000 ................
-//   908fef01 17000000 948fef01 17000000 ................
-//
-// Run length encoding packs this data more efficiently, by representing it
-// as a delta and a count of entries each differing from its predecessor
-// by this delta.  The above can be represented as a start address followed
-// by an encoded count of 6 and offset difference of 4:
-//
-//   start    count    diff
-//   01ef8f80 00000006 00000004
-//
-// Because relative relocation offsets strictly increase, the complete
-// set of relative relocations in libchrome.<version>.so can be
-// represented by a single start address followed by one or more difference
-// and count encoded word pairs:
-//
-//   start    run1 count run1 diff  run2 count run2 diff
-//   01ef8f80 00000006   00000004   00000010   00000008 ...
-//
-// Decoding regenerates relative relocations beginning at address
-// 'start' and for each encoded run, incrementing the address by 'difference'
-// for 'count' iterations and emitting a new relative relocation.
-//
-// Once encoded, data is prefixed by a single word count of packed delta and
-// count pairs.  A final run-length encoded relative relocations vector
-// might therefore look something like:
-//
-//   pairs    start    run 1             run 2             ... run 15
-//   0000000f 01ef8f80 00000006 00000004 00000010 00000008 ...
-// Interpreted as:
-//   pairs=15 start=.. count=6,delta=4   count=16,delta=8
-
-#ifndef TOOLS_RELOCATION_PACKER_SRC_RUN_LENGTH_ENCODER_H_
-#define TOOLS_RELOCATION_PACKER_SRC_RUN_LENGTH_ENCODER_H_
-
-#include <vector>
-
-#include "elf.h"
-#include "elf_traits.h"
-
-namespace relocation_packer {
-
-// A RelocationRunLengthCodec packs vectors of relative relocations
-// into more compact forms, and unpacks them to reproduce the pre-packed data.
-class RelocationRunLengthCodec {
- public:
-  // Encode relative relocations into a more compact form.
-  // |relocations| is a vector of relative relocation structs.
-  // |packed| is the vector of packed words into which relocations are packed.
-  static void Encode(const std::vector<ELF::Rel>& relocations,
-                     std::vector<ELF::Xword>* packed);
-
-  // Decode relative relocations from their more compact form.
-  // |packed| is the vector of packed relocations.
-  // |relocations| is a vector of unpacked relative relocation structs.
-  static void Decode(const std::vector<ELF::Xword>& packed,
-                     std::vector<ELF::Rel>* relocations);
-};
-
-}  // namespace relocation_packer
-
-#endif  // TOOLS_RELOCATION_PACKER_SRC_RUN_LENGTH_ENCODER_H_
diff --git a/tools/relocation_packer/src/sleb128.h b/tools/relocation_packer/src/sleb128.h
index fa0a246..3a63f66 100644
--- a/tools/relocation_packer/src/sleb128.h
+++ b/tools/relocation_packer/src/sleb128.h
@@ -4,9 +4,8 @@
 
 // SLEB128 encoder and decoder for packed relative relocations.
 //
-// Delta encoded relative relocations consist of a large number
-// of pairs signed integer values, many with small values.  Encoding these
-// as signed LEB128 saves space.
+// Packed relocations consist of a large number of relatively small
+// integer values.  Encoding these as LEB128 saves space.
 //
 // For more on LEB128 see http://en.wikipedia.org/wiki/LEB128.
 
diff --git a/tools/relocation_packer/test_data/generate_elf_file_unittest_relocs.py b/tools/relocation_packer/test_data/generate_elf_file_unittest_relocs.py
deleted file mode 100755
index e71b5cb..0000000
--- a/tools/relocation_packer/test_data/generate_elf_file_unittest_relocs.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2014 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-"""Build relocation packer unit test data.
-
-Uses a built relocation packer to generate 'golden' reference test data
-files for elf_file_unittests.cc.
-"""
-
-import optparse
-import os
-import shutil
-import subprocess
-import sys
-import tempfile
-
-def PackArmLibraryRelocations(android_pack_relocations,
-                              android_objcopy,
-                              added_section,
-                              input_path,
-                              output_path):
-  # Copy and add a 'NULL' .android.rel.dyn section for the packing tool.
-  with tempfile.NamedTemporaryFile() as stream:
-    stream.write('NULL')
-    stream.flush()
-    objcopy_command = [android_objcopy,
-                       '--add-section', '%s=%s' % (added_section, stream.name),
-                       input_path, output_path]
-    subprocess.check_call(objcopy_command)
-
-  # Pack relocations.
-  pack_command = [android_pack_relocations, output_path]
-  subprocess.check_call(pack_command)
-
-
-def UnpackArmLibraryRelocations(android_pack_relocations,
-                                input_path,
-                                output_path):
-  shutil.copy(input_path, output_path)
-
-  # Unpack relocations.  We leave the .android.rel.dyn or .android.rela.dyn
-  # in place.
-  unpack_command = [android_pack_relocations, '-u', output_path]
-  subprocess.check_call(unpack_command)
-
-
-def main():
-  parser = optparse.OptionParser()
-
-  parser.add_option('--android-pack-relocations',
-      help='Path to the ARM relocations packer binary')
-  parser.add_option('--android-objcopy',
-      help='Path to the toolchain\'s objcopy binary')
-  parser.add_option('--added-section',
-      choices=['.android.rel.dyn', '.android.rela.dyn'],
-      help='Section to add, one of ".android.rel.dyn" or ".android.rela.dyn"')
-  parser.add_option('--test-file',
-      help='Path to the input test file, an unpacked ARM .so')
-  parser.add_option('--unpacked-output',
-      help='Path to the output file for reference unpacked data')
-  parser.add_option('--packed-output',
-      help='Path to the output file for reference packed data')
-
-  options, _ = parser.parse_args()
-
-  for output in [options.unpacked_output, options.packed_output]:
-    directory = os.path.dirname(output)
-    if not os.path.exists(directory):
-      os.makedirs(directory)
-
-  PackArmLibraryRelocations(options.android_pack_relocations,
-                            options.android_objcopy,
-                            options.added_section,
-                            options.test_file,
-                            options.packed_output)
-
-  UnpackArmLibraryRelocations(options.android_pack_relocations,
-                              options.packed_output,
-                              options.unpacked_output)
-
-  return 0
-
-
-if __name__ == '__main__':
-  sys.exit(main())
diff --git a/tools/relocation_packer/test_data/generate_elf_file_unittest_relocs.sh b/tools/relocation_packer/test_data/generate_elf_file_unittest_relocs.sh
deleted file mode 100755
index f90a2f6..0000000
--- a/tools/relocation_packer/test_data/generate_elf_file_unittest_relocs.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2014 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-# Generates elf_file_unittest_relocs_arm{32,64}{,_packed}.so test data files
-# from elf_file_unittest_relocs.cc.  Run once to create these test data
-# files; the files are checked into the source tree.
-#
-# To use:
-#   ./generate_elf_file_unittest_relocs.sh
-#   git add elf_file_unittest_relocs_arm{32,64}{,_packed}.so
-
-function main() {
-  local '-r' test_data_directory="$(pwd)"
-  cd '../../..'
-
-  source tools/cr/cr-bash-helpers.sh
-  local arch
-  for arch in 'arm32' 'arm64'; do
-    cr 'init' '--platform=android' '--type=Debug' '--architecture='"${arch}"
-    cr 'build' 'relocation_packer_unittests_test_data'
-  done
-
-  local '-r' packer='out_android/Debug/obj/tools/relocation_packer'
-  local '-r' gen="${packer}/relocation_packer_unittests_test_data.gen"
-
-  cp "${gen}/elf_file_unittest_relocs_arm"{32,64}{,_packed}'.so' \
-     "${test_data_directory}"
-
-  return 0
-}
-
-main