Merge "Add ANDROID_DLEXT_FORCE_LOAD flag"
diff --git a/libc/Android.mk b/libc/Android.mk
index 0de0fb2..e632ee7 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -1384,7 +1384,8 @@
 # We'd really like to do this for all architectures, but since this wasn't done
 # before, these symbols must continue to be exported on LP32 for binary
 # compatibility.
-LOCAL_LDFLAGS_64 := -Wl,--exclude-libs,libgcc.a
+# TODO: disabled for http://b/20065774.
+#LOCAL_LDFLAGS_64 := -Wl,--exclude-libs,libgcc.a
 
 # TODO: This is to work around b/19059885. Remove after root cause is fixed
 LOCAL_LDFLAGS_arm := -Wl,--hash-style=sysv
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 470a038..6a2f313 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -40,6 +40,8 @@
     arch-arm64/bionic/syscall.S \
     arch-arm64/bionic/vfork.S \
 
+# Workaround for http://b/20065774.
+libc_bionic_src_files_arm64 += arch-arm64/bionic/libgcc_compat.c
 
 libc_crt_target_cflags_arm64 := \
     -I$(LOCAL_PATH)/arch-arm64/include
diff --git a/libc/arch-arm64/bionic/libgcc_compat.c b/libc/arch-arm64/bionic/libgcc_compat.c
new file mode 100644
index 0000000..35158ce
--- /dev/null
+++ b/libc/arch-arm64/bionic/libgcc_compat.c
@@ -0,0 +1,11 @@
+/* STOPSHIP: remove this once the flounder blobs have been rebuilt (http://b/20065774). */
+
+extern void __clear_cache(char*, char*);
+extern char _Unwind_Backtrace;
+extern char _Unwind_GetIP;
+
+void* __bionic_libgcc_compat_symbols[] = {
+    &__clear_cache,
+    &_Unwind_Backtrace,
+    &_Unwind_GetIP,
+};
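
Editorial note: the array above keeps the listed libgcc symbols referenced from libc itself, so (together with the version-script entries later in this change) they remain exported from libc.so for old prebuilt flounder blobs. __clear_cache is the compiler runtime's instruction-cache flush helper; a hypothetical caller, the kind of code such blobs contain, is sketched below for illustration only:

    #include <cstddef>

    extern "C" void __clear_cache(char* begin, char* end);

    // Hypothetical JIT-style user: after writing machine code into 'buf',
    // the instruction cache must be flushed before executing it.
    inline void finish_codegen(char* buf, std::size_t len) {
      __clear_cache(buf, buf + len);
    }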
diff --git a/libc/arch-mips/string/memset.S b/libc/arch-mips/string/memset.S
index 3e630ca..09b756b 100644
--- a/libc/arch-mips/string/memset.S
+++ b/libc/arch-mips/string/memset.S
@@ -67,86 +67,6 @@
 #define DBG
 #endif
 
-/*
- * void _memset16(uint16_t* dst, uint16_t value, size_t size);
- */
-
-LEAF(_memset16,0)
-	.set noreorder
-DBG	/* Check parameters */
-DBG	andi	t0,a0,1			# a0 must be halfword aligned
-DBG	tne	t0,zero
-DBG	andi	t2,a2,1			# a2 must be even
-DBG	tne	t2,zero
-
-#ifdef FIXARGS
-	# ensure count is even
-#if (__mips==32) && (__mips_isa_rev>=2)
-	ins	a2,zero,0,1
-#else
-	ori	a2,1
-	xori	a2,1
-#endif
-#endif
-
-#if (__mips==32) && (__mips_isa_rev>=2)
-	ins	a1,a1,16,16
-#else
-	andi	a1,0xffff
-	sll	t3,a1,16
-	or	a1,t3
-#endif
-
-	beqz	a2,.Ldone
-	 andi	t1,a0,2
-	beqz	t1,.Lalignok
-	 addu	t0,a0,a2		# t0 is the "past the end" address
-	sh	a1,0(a0)		# store one halfword to get aligned
-	addu	a0,2
-	subu	a2,2
-.Lalignok:
-	slti	t1,a2,4			# .Laligned for 4 or more bytes
-	beqz	t1,.Laligned
-	 sne	t1,a2,2			# one more halfword?
-	bnez	t1,.Ldone
-	 nop
-	sh	a1,0(a0)
-.Ldone:
-	j	ra
-	 nop
-	.set reorder
-END(_memset16)
-
-/*
- * void _memset32(uint32_t* dst, uint32_t value, size_t size);
- */
-
-LEAF(_memset32,0)
-	.set noreorder
-DBG	/* Check parameters */
-DBG	andi	t0,a0,3			# a0 must be word aligned
-DBG	tne	t0,zero
-DBG	andi	t2,a2,3			# a2 must be a multiple of 4 bytes
-DBG	tne	t2,zero
-
-#ifdef FIXARGS
-	# ensure count is a multiple of 4
-#if (__mips==32) && (__mips_isa_rev>=2)
-	ins	$a2,$0,0,2
-#else
-	ori	a2,3
-	xori	a2,3
-#endif
-#endif
-
-	bnez	a2,.Laligned		# any work to do?
-	 addu	t0,a0,a2		# t0 is the "past the end" address
-
-	j	ra
-	 nop
-	.set reorder
-END(_memset32)
-
 LEAF(memset,0)
 
 	.set	noreorder
diff --git a/libc/arch-mips64/string/memset.S b/libc/arch-mips64/string/memset.S
index 3e630ca..09b756b 100644
--- a/libc/arch-mips64/string/memset.S
+++ b/libc/arch-mips64/string/memset.S
@@ -67,86 +67,6 @@
 #define DBG
 #endif
 
-/*
- * void _memset16(uint16_t* dst, uint16_t value, size_t size);
- */
-
-LEAF(_memset16,0)
-	.set noreorder
-DBG	/* Check parameters */
-DBG	andi	t0,a0,1			# a0 must be halfword aligned
-DBG	tne	t0,zero
-DBG	andi	t2,a2,1			# a2 must be even
-DBG	tne	t2,zero
-
-#ifdef FIXARGS
-	# ensure count is even
-#if (__mips==32) && (__mips_isa_rev>=2)
-	ins	a2,zero,0,1
-#else
-	ori	a2,1
-	xori	a2,1
-#endif
-#endif
-
-#if (__mips==32) && (__mips_isa_rev>=2)
-	ins	a1,a1,16,16
-#else
-	andi	a1,0xffff
-	sll	t3,a1,16
-	or	a1,t3
-#endif
-
-	beqz	a2,.Ldone
-	 andi	t1,a0,2
-	beqz	t1,.Lalignok
-	 addu	t0,a0,a2		# t0 is the "past the end" address
-	sh	a1,0(a0)		# store one halfword to get aligned
-	addu	a0,2
-	subu	a2,2
-.Lalignok:
-	slti	t1,a2,4			# .Laligned for 4 or more bytes
-	beqz	t1,.Laligned
-	 sne	t1,a2,2			# one more halfword?
-	bnez	t1,.Ldone
-	 nop
-	sh	a1,0(a0)
-.Ldone:
-	j	ra
-	 nop
-	.set reorder
-END(_memset16)
-
-/*
- * void _memset32(uint32_t* dst, uint32_t value, size_t size);
- */
-
-LEAF(_memset32,0)
-	.set noreorder
-DBG	/* Check parameters */
-DBG	andi	t0,a0,3			# a0 must be word aligned
-DBG	tne	t0,zero
-DBG	andi	t2,a2,3			# a2 must be a multiple of 4 bytes
-DBG	tne	t2,zero
-
-#ifdef FIXARGS
-	# ensure count is a multiple of 4
-#if (__mips==32) && (__mips_isa_rev>=2)
-	ins	$a2,$0,0,2
-#else
-	ori	a2,3
-	xori	a2,3
-#endif
-#endif
-
-	bnez	a2,.Laligned		# any work to do?
-	 addu	t0,a0,a2		# t0 is the "past the end" address
-
-	j	ra
-	 nop
-	.set reorder
-END(_memset32)
-
 LEAF(memset,0)
 
 	.set	noreorder
diff --git a/libc/bionic/pthread_mutex.cpp b/libc/bionic/pthread_mutex.cpp
index d2ff1ae..5bdc5ed 100644
--- a/libc/bionic/pthread_mutex.cpp
+++ b/libc/bionic/pthread_mutex.cpp
@@ -44,14 +44,85 @@
 #include "private/bionic_time_conversions.h"
 #include "private/bionic_tls.h"
 
-/* a mutex is implemented as a 32-bit integer holding the following fields
+/* a mutex attribute holds the following fields
+ *
+ * bits:     name       description
+ * 0-3       type       type of mutex
+ * 4         shared     process-shared flag
+ */
+#define  MUTEXATTR_TYPE_MASK   0x000f
+#define  MUTEXATTR_SHARED_MASK 0x0010
+
+int pthread_mutexattr_init(pthread_mutexattr_t *attr)
+{
+    *attr = PTHREAD_MUTEX_DEFAULT;
+    return 0;
+}
+
+int pthread_mutexattr_destroy(pthread_mutexattr_t *attr)
+{
+    *attr = -1;
+    return 0;
+}
+
+int pthread_mutexattr_gettype(const pthread_mutexattr_t *attr, int *type_p)
+{
+    int type = (*attr & MUTEXATTR_TYPE_MASK);
+
+    if (type < PTHREAD_MUTEX_NORMAL || type > PTHREAD_MUTEX_ERRORCHECK) {
+        return EINVAL;
+    }
+
+    *type_p = type;
+    return 0;
+}
+
+int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type)
+{
+    if (type < PTHREAD_MUTEX_NORMAL || type > PTHREAD_MUTEX_ERRORCHECK ) {
+        return EINVAL;
+    }
+
+    *attr = (*attr & ~MUTEXATTR_TYPE_MASK) | type;
+    return 0;
+}
+
+/* process-shared mutexes are not supported at the moment */
+
+int pthread_mutexattr_setpshared(pthread_mutexattr_t *attr, int  pshared)
+{
+    switch (pshared) {
+    case PTHREAD_PROCESS_PRIVATE:
+        *attr &= ~MUTEXATTR_SHARED_MASK;
+        return 0;
+
+    case PTHREAD_PROCESS_SHARED:
+        /* our current implementation of pthread actually supports shared
+         * mutexes but won't cleanup if a process dies with the mutex held.
+         * Nevertheless, it's better than nothing. Shared mutexes are used
+         * by surfaceflinger and audioflinger.
+         */
+        *attr |= MUTEXATTR_SHARED_MASK;
+        return 0;
+    }
+    return EINVAL;
+}
+
+int pthread_mutexattr_getpshared(const pthread_mutexattr_t* attr, int* pshared) {
+    *pshared = (*attr & MUTEXATTR_SHARED_MASK) ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
+    return 0;
+}
+
+/* a mutex contains a state value and an owner_tid.
+ * The value is implemented as a 16-bit integer holding the following fields:
  *
  * bits:     name     description
- * 31-16     tid      owner thread's tid (recursive and errorcheck only)
  * 15-14     type     mutex type
  * 13        shared   process-shared flag
  * 12-2      counter  counter of recursive mutexes
  * 1-0       state    lock state (0, 1 or 2)
+ *
+ * The owner_tid field is used only by recursive and errorcheck mutexes to hold the tid of the owning thread.
  */
 
 /* Convenience macro, creates a mask of 'bits' bits that starts from
@@ -68,6 +139,12 @@
 /* And this one does the opposite, i.e. extract a field's value from a bit pattern */
 #define  FIELD_FROM_BITS(val,shift,bits)  (((val) >> (shift)) & ((1 << (bits))-1))
 
+
+/* Convenience macros.
+ *
+ * These are used to form or modify the bit pattern of a given mutex value
+ */
+
 /* Mutex state:
  *
  * 0 for unlocked
@@ -135,102 +212,16 @@
 #define  MUTEX_TYPE_BITS_RECURSIVE   MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_RECURSIVE)
 #define  MUTEX_TYPE_BITS_ERRORCHECK  MUTEX_TYPE_TO_BITS(PTHREAD_MUTEX_ERRORCHECK)
 
-/* Mutex owner field:
- *
- * This is only used for recursive and errorcheck mutexes. It holds the
- * tid of the owning thread. We use 16 bits to represent tid here,
- * so the highest tid is 65535. There is a test to check /proc/sys/kernel/pid_max
- * to make sure it will not exceed our limit.
- */
-#define  MUTEX_OWNER_SHIFT     16
-#define  MUTEX_OWNER_LEN       16
-
-#define  MUTEX_OWNER_FROM_BITS(v)    FIELD_FROM_BITS(v,MUTEX_OWNER_SHIFT,MUTEX_OWNER_LEN)
-#define  MUTEX_OWNER_TO_BITS(v)      FIELD_TO_BITS(v,MUTEX_OWNER_SHIFT,MUTEX_OWNER_LEN)
-
-/* Convenience macros.
- *
- * These are used to form or modify the bit pattern of a given mutex value
- */
-
-
-
-/* a mutex attribute holds the following fields
- *
- * bits:     name       description
- * 0-3       type       type of mutex
- * 4         shared     process-shared flag
- */
-#define  MUTEXATTR_TYPE_MASK   0x000f
-#define  MUTEXATTR_SHARED_MASK 0x0010
-
-
-int pthread_mutexattr_init(pthread_mutexattr_t *attr)
-{
-    *attr = PTHREAD_MUTEX_DEFAULT;
-    return 0;
-}
-
-int pthread_mutexattr_destroy(pthread_mutexattr_t *attr)
-{
-    *attr = -1;
-    return 0;
-}
-
-int pthread_mutexattr_gettype(const pthread_mutexattr_t *attr, int *type_p)
-{
-    int type = (*attr & MUTEXATTR_TYPE_MASK);
-
-    if (type < PTHREAD_MUTEX_NORMAL || type > PTHREAD_MUTEX_ERRORCHECK) {
-        return EINVAL;
-    }
-
-    *type_p = type;
-    return 0;
-}
-
-int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type)
-{
-    if (type < PTHREAD_MUTEX_NORMAL || type > PTHREAD_MUTEX_ERRORCHECK ) {
-        return EINVAL;
-    }
-
-    *attr = (*attr & ~MUTEXATTR_TYPE_MASK) | type;
-    return 0;
-}
-
-/* process-shared mutexes are not supported at the moment */
-
-int pthread_mutexattr_setpshared(pthread_mutexattr_t *attr, int  pshared)
-{
-    switch (pshared) {
-    case PTHREAD_PROCESS_PRIVATE:
-        *attr &= ~MUTEXATTR_SHARED_MASK;
-        return 0;
-
-    case PTHREAD_PROCESS_SHARED:
-        /* our current implementation of pthread actually supports shared
-         * mutexes but won't cleanup if a process dies with the mutex held.
-         * Nevertheless, it's better than nothing. Shared mutexes are used
-         * by surfaceflinger and audioflinger.
-         */
-        *attr |= MUTEXATTR_SHARED_MASK;
-        return 0;
-    }
-    return EINVAL;
-}
-
-int pthread_mutexattr_getpshared(const pthread_mutexattr_t* attr, int* pshared) {
-    *pshared = (*attr & MUTEXATTR_SHARED_MASK) ? PTHREAD_PROCESS_SHARED : PTHREAD_PROCESS_PRIVATE;
-    return 0;
-}
-
 struct pthread_mutex_internal_t {
-  atomic_int state;
+  _Atomic(uint16_t) state;
 #if defined(__LP64__)
-  char __reserved[36];
+  uint16_t __pad;
+  atomic_int owner_tid;
+  char __reserved[32];
+#else
+  _Atomic(uint16_t) owner_tid;
 #endif
-};
+} __attribute__((aligned(4)));
 
 static_assert(sizeof(pthread_mutex_t) == sizeof(pthread_mutex_internal_t),
               "pthread_mutex_t should actually be pthread_mutex_internal_t in implementation.");
@@ -254,35 +245,36 @@
         return 0;
     }
 
-    int state = 0;
+    uint16_t state = 0;
     if ((*attr & MUTEXATTR_SHARED_MASK) != 0) {
         state |= MUTEX_SHARED_MASK;
     }
 
     switch (*attr & MUTEXATTR_TYPE_MASK) {
     case PTHREAD_MUTEX_NORMAL:
-        state |= MUTEX_TYPE_BITS_NORMAL;
-        break;
+      state |= MUTEX_TYPE_BITS_NORMAL;
+      break;
     case PTHREAD_MUTEX_RECURSIVE:
-        state |= MUTEX_TYPE_BITS_RECURSIVE;
-        break;
+      state |= MUTEX_TYPE_BITS_RECURSIVE;
+      break;
     case PTHREAD_MUTEX_ERRORCHECK:
-        state |= MUTEX_TYPE_BITS_ERRORCHECK;
-        break;
+      state |= MUTEX_TYPE_BITS_ERRORCHECK;
+      break;
     default:
         return EINVAL;
     }
 
     atomic_init(&mutex->state, state);
+    atomic_init(&mutex->owner_tid, 0);
     return 0;
 }
 
 static inline __always_inline int __pthread_normal_mutex_trylock(pthread_mutex_internal_t* mutex,
-                                                                 int shared) {
-    const int unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
-    const int locked_uncontended = shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
+                                                                 uint16_t shared) {
+    const uint16_t unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
+    const uint16_t locked_uncontended = shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
 
-    int old_state = unlocked;
+    uint16_t old_state = unlocked;
     if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
                          locked_uncontended, memory_order_acquire, memory_order_relaxed))) {
         return 0;
@@ -303,7 +295,7 @@
  * the lock state field.
  */
 static inline __always_inline int __pthread_normal_mutex_lock(pthread_mutex_internal_t* mutex,
-                                                              int shared,
+                                                              uint16_t shared,
                                                               const timespec* abs_timeout_or_null,
                                                               clockid_t clock) {
     if (__predict_true(__pthread_normal_mutex_trylock(mutex, shared) == 0)) {
@@ -312,8 +304,8 @@
 
     ScopedTrace trace("Contending for pthread mutex");
 
-    const int unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
-    const int locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+    const uint16_t unlocked           = shared | MUTEX_STATE_BITS_UNLOCKED;
+    const uint16_t locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
 
     // We want to go to sleep until the mutex is available, which requires
     // promoting it to locked_contended. We need to swap in the new state
@@ -341,13 +333,13 @@
 }
 
 /*
- * Release a mutex of type NORMAL.  The caller is responsible for determining
+ * Release a normal mutex.  The caller is responsible for determining
  * that we are in fact the owner of this lock.
  */
 static inline __always_inline void __pthread_normal_mutex_unlock(pthread_mutex_internal_t* mutex,
-                                                                 int shared) {
-    const int unlocked         = shared | MUTEX_STATE_BITS_UNLOCKED;
-    const int locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+                                                                 uint16_t shared) {
+    const uint16_t unlocked         = shared | MUTEX_STATE_BITS_UNLOCKED;
+    const uint16_t locked_contended = shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
 
     // We use an atomic_exchange to release the lock. If locked_contended state
     // is returned, some threads is waiting for the lock and we need to wake up
@@ -385,7 +377,7 @@
  *
  */
 static inline __always_inline int __recursive_increment(pthread_mutex_internal_t* mutex,
-                                                        int old_state) {
+                                                        uint16_t old_state) {
     // Detect recursive lock overflow and return EAGAIN.
     // This is safe because only the owner thread can modify the
     // counter bits in the mutex value.
@@ -393,22 +385,18 @@
         return EAGAIN;
     }
 
-    // We own the mutex, but other threads are able to change the lower bits
-    // (e.g. promoting it to "contended"), so we need to use an atomic exchange
-    // loop to update the counter. The counter will not overflow in the loop,
-    // as only the owner thread can change it.
-    // The mutex is still locked, so we don't need a release fence.
+    // Other threads are able to change the lower bits (e.g. promoting it to "contended"),
+    // but the mutex counter will not overflow, so an atomic_fetch_add is enough here.
+    // The mutex is still locked by the current thread, so we don't need a release fence.
     atomic_fetch_add_explicit(&mutex->state, MUTEX_COUNTER_BITS_ONE, memory_order_relaxed);
     return 0;
 }
 
 static int __pthread_mutex_lock_with_timeout(pthread_mutex_internal_t* mutex,
                                            const timespec* abs_timeout_or_null, clockid_t clock) {
-    int old_state, mtype, tid, shared;
-
-    old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
-    mtype = (old_state & MUTEX_TYPE_MASK);
-    shared = (old_state & MUTEX_SHARED_MASK);
+    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    uint16_t mtype = (old_state & MUTEX_TYPE_MASK);
+    uint16_t shared = (old_state & MUTEX_SHARED_MASK);
 
     // Handle common case first.
     if ( __predict_true(mtype == MUTEX_TYPE_BITS_NORMAL) ) {
@@ -416,26 +404,26 @@
     }
 
     // Do we already own this recursive or error-check mutex?
-    tid = __get_thread()->tid;
-    if (tid == MUTEX_OWNER_FROM_BITS(old_state)) {
+    pid_t tid = __get_thread()->tid;
+    if (tid == atomic_load_explicit(&mutex->owner_tid, memory_order_relaxed)) {
         if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
             return EDEADLK;
         }
         return __recursive_increment(mutex, old_state);
     }
 
-    const int unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
-    const int locked_uncontended = mtype | shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
-    const int locked_contended   = mtype | shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
+    const uint16_t unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
+    const uint16_t locked_uncontended = mtype | shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
+    const uint16_t locked_contended   = mtype | shared | MUTEX_STATE_BITS_LOCKED_CONTENDED;
 
     // First, if the mutex is unlocked, try to quickly acquire it.
     // In the optimistic case where this works, set the state to locked_uncontended.
     if (old_state == unlocked) {
-        int new_state = MUTEX_OWNER_TO_BITS(tid) | locked_uncontended;
         // If exchanged successfully, an acquire fence is required to make
         // all memory accesses made by other threads visible to the current CPU.
         if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
-                             new_state, memory_order_acquire, memory_order_relaxed))) {
+                             locked_uncontended, memory_order_acquire, memory_order_relaxed))) {
+            atomic_store_explicit(&mutex->owner_tid, tid, memory_order_relaxed);
             return 0;
         }
     }
@@ -448,13 +436,13 @@
             // is contention when we are in this loop. This ensures all waiters
             // will be unlocked.
 
-            int new_state = MUTEX_OWNER_TO_BITS(tid) | locked_contended;
             // If exchanged successfully, an acquire fence is required to make
             // all memory accesses made by other threads visible to the current CPU.
             if (__predict_true(atomic_compare_exchange_weak_explicit(&mutex->state,
-                                                                     &old_state, new_state,
+                                                                     &old_state, locked_contended,
                                                                      memory_order_acquire,
                                                                      memory_order_relaxed))) {
+                atomic_store_explicit(&mutex->owner_tid, tid, memory_order_relaxed);
                 return 0;
             }
             continue;
@@ -491,9 +479,9 @@
 int pthread_mutex_lock(pthread_mutex_t* mutex_interface) {
     pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
 
-    int old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
-    int mtype = (old_state & MUTEX_TYPE_MASK);
-    int shared = (old_state & MUTEX_SHARED_MASK);
+    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    uint16_t mtype = (old_state & MUTEX_TYPE_MASK);
+    uint16_t shared = (old_state & MUTEX_SHARED_MASK);
     // Avoid slowing down fast path of normal mutex lock operation.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
       if (__predict_true(__pthread_normal_mutex_trylock(mutex, shared) == 0)) {
@@ -506,11 +494,9 @@
 int pthread_mutex_unlock(pthread_mutex_t* mutex_interface) {
     pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
 
-    int old_state, mtype, tid, shared;
-
-    old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
-    mtype  = (old_state & MUTEX_TYPE_MASK);
-    shared = (old_state & MUTEX_SHARED_MASK);
+    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    uint16_t mtype  = (old_state & MUTEX_TYPE_MASK);
+    uint16_t shared = (old_state & MUTEX_SHARED_MASK);
 
     // Handle common case first.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
@@ -519,9 +505,10 @@
     }
 
     // Do we already own this recursive or error-check mutex?
-    tid = __get_thread()->tid;
-    if ( tid != MUTEX_OWNER_FROM_BITS(old_state) )
+    pid_t tid = __get_thread()->tid;
+    if ( tid != atomic_load_explicit(&mutex->owner_tid, memory_order_relaxed) ) {
         return EPERM;
+    }
 
     // If the counter is > 0, we can simply decrement it atomically.
     // Since other threads can mutate the lower state bits (and only the
@@ -538,7 +525,8 @@
     // to awake.
     // A release fence is required to make previous stores visible to next
     // lock owner threads.
-    const int unlocked = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
+    atomic_store_explicit(&mutex->owner_tid, 0, memory_order_relaxed);
+    const uint16_t unlocked = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
     old_state = atomic_exchange_explicit(&mutex->state, unlocked, memory_order_release);
     if (MUTEX_STATE_BITS_IS_LOCKED_CONTENDED(old_state)) {
         __futex_wake_ex(&mutex->state, shared, 1);
@@ -550,12 +538,12 @@
 int pthread_mutex_trylock(pthread_mutex_t* mutex_interface) {
     pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
 
-    int old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
-    int mtype  = (old_state & MUTEX_TYPE_MASK);
-    int shared = (old_state & MUTEX_SHARED_MASK);
+    uint16_t old_state = atomic_load_explicit(&mutex->state, memory_order_relaxed);
+    uint16_t mtype  = (old_state & MUTEX_TYPE_MASK);
+    uint16_t shared = (old_state & MUTEX_SHARED_MASK);
 
-    const int unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
-    const int locked_uncontended = mtype | shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
+    const uint16_t unlocked           = mtype | shared | MUTEX_STATE_BITS_UNLOCKED;
+    const uint16_t locked_uncontended = mtype | shared | MUTEX_STATE_BITS_LOCKED_UNCONTENDED;
 
     // Handle common case first.
     if (__predict_true(mtype == MUTEX_TYPE_BITS_NORMAL)) {
@@ -564,7 +552,7 @@
 
     // Do we already own this recursive or error-check mutex?
     pid_t tid = __get_thread()->tid;
-    if (tid == MUTEX_OWNER_FROM_BITS(old_state)) {
+    if (tid == atomic_load_explicit(&mutex->owner_tid, memory_order_relaxed)) {
         if (mtype == MUTEX_TYPE_BITS_ERRORCHECK) {
             return EBUSY;
         }
@@ -577,10 +565,11 @@
     // If exchanged successfully, an acquire fence is required to make
     // all memory accesses made by other threads visible to the current CPU.
     old_state = unlocked;
-    int new_state = MUTEX_OWNER_TO_BITS(tid) | locked_uncontended;
-    if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state, new_state,
+    if (__predict_true(atomic_compare_exchange_strong_explicit(&mutex->state, &old_state,
+                                                               locked_uncontended,
                                                                memory_order_acquire,
                                                                memory_order_relaxed))) {
+        atomic_store_explicit(&mutex->owner_tid, tid, memory_order_relaxed);
         return 0;
     }
     return EBUSY;
@@ -617,8 +606,5 @@
     if (error != 0) {
         return error;
     }
-
-    pthread_mutex_internal_t* mutex = __get_internal_mutex(mutex_interface);
-    atomic_store_explicit(&mutex->state, 0xdead10cc, memory_order_relaxed);
     return 0;
 }
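
Editorial note: with this change the owner tid moves out of the state word into its own atomic owner_tid field, so the state is now a plain 16-bit value. A minimal standalone sketch of the documented layout follows; the mask values are derived from the bit positions in the comment above and are illustrative, not the exact bionic macros:

    #include <cstdint>

    // bits 15-14: type, bit 13: process-shared, bits 12-2: recursion counter,
    // bits 1-0: lock state (0 unlocked, 1 locked uncontended, 2 locked contended).
    constexpr uint16_t kTypeMask    = 0xc000;
    constexpr uint16_t kSharedMask  = 0x2000;
    constexpr uint16_t kCounterMask = 0x1ffc;
    constexpr uint16_t kStateMask   = 0x0003;

    constexpr uint16_t mutex_type(uint16_t s)      { return (s & kTypeMask) >> 14; }
    constexpr bool     mutex_shared(uint16_t s)    { return (s & kSharedMask) != 0; }
    constexpr uint16_t mutex_counter(uint16_t s)   { return (s & kCounterMask) >> 2; }
    constexpr uint16_t mutex_lockstate(uint16_t s) { return s & kStateMask; }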
diff --git a/libc/version_script.txt b/libc/version_script.txt
index afc5e5c..349a2fc 100644
--- a/libc/version_script.txt
+++ b/libc/version_script.txt
@@ -1,4 +1,9 @@
 LIBC {
+  global:
+    /* Workaround for http://b/20065774. */
+    __clear_cache;
+    _Unwind_Backtrace;
+    _Unwind_GetIP;
   local:
     _ZSt7nothrow;
     _ZdaPv;
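
A quick runtime check of the visibility granted above (editorial sketch; the symbol names come from the new entries, everything else is illustrative):

    #include <dlfcn.h>
    #include <cstdio>

    int main() {
      // On a libc built with these version-script entries the lookups should
      // succeed; a null result means the symbol is not exported after all.
      const char* names[] = {"__clear_cache", "_Unwind_Backtrace", "_Unwind_GetIP"};
      for (const char* name : names) {
        void* sym = dlsym(RTLD_DEFAULT, name);
        std::printf("%-20s %s\n", name, sym != nullptr ? "exported" : "missing");
      }
      return 0;
    }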
diff --git a/linker/dlfcn.cpp b/linker/dlfcn.cpp
index 64df7a5..479e831 100644
--- a/linker/dlfcn.cpp
+++ b/linker/dlfcn.cpp
@@ -101,16 +101,11 @@
 
   soinfo* found = nullptr;
   ElfW(Sym)* sym = nullptr;
-  if (handle == RTLD_DEFAULT) {
-    sym = dlsym_linear_lookup(symbol, &found, nullptr);
-  } else if (handle == RTLD_NEXT) {
-    void* caller_addr = __builtin_return_address(0);
-    soinfo* si = find_containing_library(caller_addr);
+  void* caller_addr = __builtin_return_address(0);
+  soinfo* caller = find_containing_library(caller_addr);
 
-    sym = nullptr;
-    if (si && si->next) {
-      sym = dlsym_linear_lookup(symbol, &found, si->next);
-    }
+  if (handle == RTLD_DEFAULT || handle == RTLD_NEXT) {
+    sym = dlsym_linear_lookup(symbol, &found, caller, handle);
   } else {
     sym = dlsym_handle_lookup(reinterpret_cast<soinfo*>(handle), &found, symbol);
   }
diff --git a/linker/linker.cpp b/linker/linker.cpp
index 8703e4f..a9c2bc1 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -738,15 +738,21 @@
    beginning of the global solist. Otherwise the search starts at the
    specified soinfo (for RTLD_NEXT).
  */
-ElfW(Sym)* dlsym_linear_lookup(const char* name, soinfo** found, soinfo* start) {
+ElfW(Sym)* dlsym_linear_lookup(const char* name, soinfo** found, soinfo* caller, void* handle) {
   SymbolName symbol_name(name);
 
-  if (start == nullptr) {
-    start = solist;
+  soinfo* start = solist;
+
+  if (handle == RTLD_NEXT) {
+    if (caller == nullptr || caller->next == nullptr) {
+      return nullptr;
+    } else {
+      start = caller->next;
+    }
   }
 
   ElfW(Sym)* s = nullptr;
-  for (soinfo* si = start; (s == nullptr) && (si != nullptr); si = si->next) {
+  for (soinfo* si = start; si != nullptr; si = si->next) {
     if ((si->get_rtld_flags() & RTLD_GLOBAL) == 0) {
       continue;
     }
@@ -758,6 +764,30 @@
     }
   }
 
+  // If the symbol was not found above, search the caller's local group as
+  // well, unless the caller was loaded with RTLD_GLOBAL, in which case the
+  // loop above has already covered it.
+  if (s == nullptr && caller != nullptr &&
+      (caller->get_rtld_flags() & RTLD_GLOBAL) == 0) {
+    soinfo* local_group_root = caller->get_local_group_root();
+
+    if (handle == RTLD_DEFAULT) {
+      start = local_group_root;
+    }
+
+    for (soinfo* si = start; si != nullptr; si = si->next) {
+      if (si->get_local_group_root() != local_group_root) {
+        break;
+      }
+
+      s = si->find_symbol_by_name(symbol_name);
+      if (s != nullptr) {
+        *found = si;
+        break;
+      }
+    }
+  }
+
   if (s != nullptr) {
     TRACE_TYPE(LOOKUP, "%s s->st_value = %p, found->base = %p",
                name, reinterpret_cast<void*>(s->st_value), reinterpret_cast<void*>((*found)->base));
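
Editorial sketch of the lookup order the two loops above implement, modeled on a flat list of loaded libraries; every name and type here is invented for illustration and is not linker code:

    #include <string>
    #include <vector>

    struct Lib {
      std::string exported_symbol;  // pretend each library exports a single symbol
      bool global;                  // loaded with RTLD_GLOBAL
      int local_group;              // id of this library's local group
    };

    // dlsym(RTLD_DEFAULT, sym) called from 'caller': first every RTLD_GLOBAL
    // library in load order, then the caller's local group (unless the caller
    // itself is global, in which case the first pass already covered it).
    const Lib* lookup(const std::vector<Lib>& solist, const Lib* caller,
                      const std::string& sym) {
      for (const Lib& lib : solist) {
        if (lib.global && lib.exported_symbol == sym) return &lib;
      }
      if (caller != nullptr && !caller->global) {
        for (const Lib& lib : solist) {
          if (lib.local_group == caller->local_group && lib.exported_symbol == sym) {
            return &lib;
          }
        }
      }
      return nullptr;
    }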
diff --git a/linker/linker.h b/linker/linker.h
index bf3e7bf..ec3d8f0 100644
--- a/linker/linker.h
+++ b/linker/linker.h
@@ -351,7 +351,7 @@
 soinfo* do_dlopen(const char* name, int flags, const android_dlextinfo* extinfo);
 void do_dlclose(soinfo* si);
 
-ElfW(Sym)* dlsym_linear_lookup(const char* name, soinfo** found, soinfo* start);
+ElfW(Sym)* dlsym_linear_lookup(const char* name, soinfo** found, soinfo* caller, void* handle);
 soinfo* find_containing_library(const void* addr);
 
 ElfW(Sym)* dlsym_handle_lookup(soinfo* si, soinfo** found, const char* name);
diff --git a/linker/linker_phdr.cpp b/linker/linker_phdr.cpp
index 2c4ca15..638c9d6 100644
--- a/linker/linker_phdr.cpp
+++ b/linker/linker_phdr.cpp
@@ -429,9 +429,15 @@
     ElfW(Addr) seg_page_start = PAGE_START(phdr->p_vaddr) + load_bias;
     ElfW(Addr) seg_page_end   = PAGE_END(phdr->p_vaddr + phdr->p_memsz) + load_bias;
 
+    int prot = PFLAGS_TO_PROT(phdr->p_flags);
+    if ((extra_prot_flags & PROT_WRITE) != 0) {
+      // Make sure a segment is never simultaneously writable and executable.
+      prot &= ~PROT_EXEC;
+    }
+
     int ret = mprotect(reinterpret_cast<void*>(seg_page_start),
                        seg_page_end - seg_page_start,
-                       PFLAGS_TO_PROT(phdr->p_flags) | extra_prot_flags);
+                       prot | extra_prot_flags);
     if (ret < 0) {
       return -1;
     }
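
Editorial sketch of the policy introduced above: whenever the linker temporarily needs a segment writable (extra_prot_flags includes PROT_WRITE), the executable bit is dropped so a mapping is never writable and executable at the same time. Illustrative helper, not the linker's actual code:

    #include <sys/mman.h>

    // 'seg_prot' stands for the PROT_* translation of the segment's p_flags.
    inline int prot_for_temporary_write(int seg_prot) {
      // Never leave a segment simultaneously writable and executable.
      return (seg_prot & ~PROT_EXEC) | PROT_WRITE;
    }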
diff --git a/tests/dlfcn_test.cpp b/tests/dlfcn_test.cpp
index 1061e84..a63c070 100644
--- a/tests/dlfcn_test.cpp
+++ b/tests/dlfcn_test.cpp
@@ -46,7 +46,7 @@
   ASSERT_EQ(17, g_ctor_function_called);
 }
 
-TEST(dlfcn, dlsym_in_self) {
+TEST(dlfcn, dlsym_in_executable) {
   dlerror(); // Clear any pending errors.
   void* self = dlopen(NULL, RTLD_NOW);
   ASSERT_TRUE(self != NULL);
@@ -64,6 +64,27 @@
   ASSERT_EQ(0, dlclose(self));
 }
 
+TEST(dlfcn, dlsym_from_sofile) {
+  void* handle = dlopen("libtest_dlsym_from_this.so", RTLD_LAZY | RTLD_LOCAL);
+  ASSERT_TRUE(handle != nullptr) << dlerror();
+
+  // Check that we can't find 'test_dlsym_symbol' via dlsym(RTLD_DEFAULT).
+  void* symbol = dlsym(RTLD_DEFAULT, "test_dlsym_symbol");
+  ASSERT_TRUE(symbol == nullptr);
+  ASSERT_SUBSTR("undefined symbol: test_dlsym_symbol", dlerror());
+
+  typedef int* (*fn_t)();
+  fn_t fn = reinterpret_cast<fn_t>(dlsym(handle, "lookup_dlsym_symbol_using_RTLD_DEFAULT"));
+
+  ASSERT_TRUE(fn != nullptr) << dlerror();
+
+  int* ptr = fn();
+  ASSERT_TRUE(ptr != nullptr) << dlerror();
+  ASSERT_EQ(42, *ptr);
+
+  dlclose(handle);
+}
+
 TEST(dlfcn, dlsym_with_dependencies) {
   void* handle = dlopen("libtest_with_dependency.so", RTLD_NOW);
   ASSERT_TRUE(handle != NULL);
diff --git a/tests/gtest_main.cpp b/tests/gtest_main.cpp
index bf2b695..692b7e8 100644
--- a/tests/gtest_main.cpp
+++ b/tests/gtest_main.cpp
@@ -277,8 +277,8 @@
 // PrettyUnitTestResultPrinter. The reason for copy is that PrettyUnitTestResultPrinter
 // is defined and used in gtest.cc, which is hard to reuse.
 static void OnTestIterationStartPrint(const std::vector<TestCase>& testcase_list, size_t iteration,
-                                      size_t iteration_count) {
-  if (iteration_count > 1) {
+                                      int iteration_count) {
+  if (iteration_count != 1) {
     printf("\nRepeating all tests (iteration %zu) . . .\n\n", iteration);
   }
   ColoredPrintf(COLOR_GREEN,  "[==========] ");
@@ -743,7 +743,7 @@
 // makes deadlock to use fork in multi-thread.
 // Returns true if all tests run successfully, otherwise return false.
 static bool RunTestInSeparateProc(int argc, char** argv, std::vector<TestCase>& testcase_list,
-                                  size_t iteration_count, size_t job_count,
+                                  int iteration_count, size_t job_count,
                                   const std::string& xml_output_filename) {
   // Stop default result printer to avoid environment setup/teardown information for each test.
   testing::UnitTest::GetInstance()->listeners().Release(
@@ -762,7 +762,9 @@
 
   bool all_tests_passed = true;
 
-  for (size_t iteration = 1; iteration <= iteration_count; ++iteration) {
+  for (size_t iteration = 1;
+       iteration_count < 0 || iteration <= static_cast<size_t>(iteration_count);
+       ++iteration) {
     OnTestIterationStartPrint(testcase_list, iteration, iteration_count);
     int64_t iteration_start_time_ns = NanoTime();
     time_t epoch_iteration_start_time = time(NULL);
@@ -875,7 +877,7 @@
   int test_warnline_ms;
   std::string gtest_color;
   bool gtest_print_time;
-  size_t gtest_repeat;
+  int gtest_repeat;
   std::string gtest_output;
 };
 
@@ -993,12 +995,9 @@
     } else if (strcmp(args[i], "--gtest_print_time=0") == 0) {
       options.gtest_print_time = false;
     } else if (strncmp(args[i], "--gtest_repeat=", strlen("--gtest_repeat=")) == 0) {
-      int repeat = atoi(args[i] + strlen("--gtest_repeat="));
-      if (repeat < 0) {
-        fprintf(stderr, "invalid gtest_repeat count: %d\n", repeat);
-        return false;
-      }
-      options.gtest_repeat = repeat;
+      // If the value of --gtest_repeat is negative, the tests are
+      // repeated forever.
+      options.gtest_repeat = atoi(args[i] + strlen("--gtest_repeat="));
       // Remove --gtest_repeat=xx from arguments, so child process only run one iteration for a single test.
       args.erase(args.begin() + i);
       --i;
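
Editorial sketch of the repeat semantics accepted above: a negative --gtest_repeat value makes the iteration loop run until interrupted, while positive values behave as before. Names below are illustrative:

    #include <cstddef>
    #include <cstdio>

    void run_iterations(int iteration_count) {
      for (std::size_t iteration = 1;
           iteration_count < 0 || iteration <= static_cast<std::size_t>(iteration_count);
           ++iteration) {
        std::printf("iteration %zu\n", iteration);
        if (iteration_count < 0 && iteration >= 3) break;  // demo guard only
      }
    }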
diff --git a/tests/libs/Android.mk b/tests/libs/Android.mk
index 665ce0c..eb0a52a 100644
--- a/tests/libs/Android.mk
+++ b/tests/libs/Android.mk
@@ -371,6 +371,15 @@
 include $(LOCAL_PATH)/Android.build.testlib.mk
 
 # -----------------------------------------------------------------------------
+# Library used to test dlsym(RTLD_DEFAULT) from within a library loaded with RTLD_LOCAL
+# -----------------------------------------------------------------------------
+libtest_dlsym_from_this_src_files := dlsym_from_this.cpp
+
+module := libtest_dlsym_from_this
+
+include $(LOCAL_PATH)/Android.build.testlib.mk
+
+# -----------------------------------------------------------------------------
 # Library with weak undefined function
 # -----------------------------------------------------------------------------
 libtest_dlopen_weak_undefined_func_src_files := \
diff --git a/tests/libs/dlsym_from_this.cpp b/tests/libs/dlsym_from_this.cpp
new file mode 100644
index 0000000..b5215c9
--- /dev/null
+++ b/tests/libs/dlsym_from_this.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <dlfcn.h>
+#include <stdio.h>
+
+int test_dlsym_symbol = 42;
+
+extern "C" int* lookup_dlsym_symbol_using_RTLD_DEFAULT() {
+  dlerror();
+  int* result = static_cast<int*>(dlsym(RTLD_DEFAULT, "test_dlsym_symbol"));
+  // TODO: remove this once b/20049306 is fixed
+  if (result == nullptr) {
+    printf("Cannot find the answer\n");
+  }
+  return result;
+}
+
diff --git a/tests/pthread_test.cpp b/tests/pthread_test.cpp
index 16bf9c0..5ab1f11 100644
--- a/tests/pthread_test.cpp
+++ b/tests/pthread_test.cpp
@@ -1326,14 +1326,17 @@
 }
 
 TEST(pthread, pthread_mutex_owner_tid_limit) {
+#if defined(__BIONIC__) && !defined(__LP64__)
   FILE* fp = fopen("/proc/sys/kernel/pid_max", "r");
   ASSERT_TRUE(fp != NULL);
   long pid_max;
   ASSERT_EQ(1, fscanf(fp, "%ld", &pid_max));
   fclose(fp);
-  // Current pthread_mutex uses 16 bits to represent owner tid.
-  // Change the implementation if we need to support higher value than 65535.
+  // Bionic's pthread_mutex implementation on 32-bit devices uses 16 bits to represent the owner tid.
   ASSERT_LE(pid_max, 65536);
+#else
+  GTEST_LOG_(INFO) << "This test does nothing because pthread_mutex supports 32-bit owner tids in this configuration.\n";
+#endif
 }
 
 class StrictAlignmentAllocator {