Revert "Revert "Lose the hand-written futex assembler.""

The problem with the original patch was that using syscall(3) means that
errno can be set, but pthread_create(3) was abusing the TLS errno slot as
a pthread_mutex_t for the thread startup handshake.

There was also a mistake in the check for syscall failures --- it should
have checked against -1 instead of 0 (not just because that's the default
idiom, but also here because futex(2) can legitimately return values > 0).

This patch stops abusing the TLS errno slot and adds a pthread_mutex_t to
pthread_internal_t instead. (Note that for LP64 sizeof(pthread_mutex_t) >
sizeof(uintptr_t), so we could potentially clobber other TLS slots too.)

I've also rewritten the LP32 compatibility stubs to directly reuse the
code from the .h file.

This reverts commit 75c55ff84ebfa686c7ae2cc8ee431c6a33bd46b4.

Bug: 15195455
Change-Id: I6ffb13e5cf6a35d8f59f692d94192aae9ab4593d
diff --git a/libc/bionic/ndk_cruft.cpp b/libc/bionic/ndk_cruft.cpp
index 4900a8a..1284b9a 100644
--- a/libc/bionic/ndk_cruft.cpp
+++ b/libc/bionic/ndk_cruft.cpp
@@ -31,7 +31,6 @@
 
 #include <ctype.h>
 #include <inttypes.h>
-#include <linux/futex.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -201,25 +200,20 @@
   return vdprintf(fd, fmt, ap);
 }
 
-static inline int __futex(volatile void* ftx, int op, int value, const struct timespec* timeout) {
-  // Our generated syscall assembler sets errno, but our callers (pthread functions) don't want to.
-  int saved_errno = errno;
-  if (syscall(__NR_futex, ftx, op, value, timeout) == 0) {
-    return 0;
-  }
-  int result = -errno;
-  errno = saved_errno;
-  return result;
-}
+#define __futex_wake __real_futex_wake
+#define __futex_wait __real_futex_wait
+#include "private/bionic_futex.h"
+#undef __futex_wake
+#undef __futex_wait
 
 // This used to be in <sys/atomics.h>.
 extern "C" int __futex_wake(volatile void* ftx, int count) {
-  return __futex(ftx, FUTEX_WAKE, count, NULL);
+  return __real_futex_wake(ftx, count);
 }
 
 // This used to be in <sys/atomics.h>.
 extern "C" int __futex_wait(volatile void* ftx, int value, const struct timespec* timeout) {
-  return __futex(ftx, FUTEX_WAIT, value, timeout);
+  return __real_futex_wait(ftx, value, timeout);
 }
 
 // Unity's libmono uses this.
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 303af81..c4cb262 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -144,10 +144,8 @@
   // notify gdb about this thread before we start doing anything.
   // This also provides the memory barrier needed to ensure that all memory
   // accesses previously made by the creating thread are visible to us.
-  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
-  pthread_mutex_lock(start_mutex);
-  pthread_mutex_destroy(start_mutex);
-  thread->tls[TLS_SLOT_START_MUTEX] = NULL;
+  pthread_mutex_lock(&thread->startup_handshake_mutex);
+  pthread_mutex_destroy(&thread->startup_handshake_mutex);
 
   __init_alternate_signal_stack(thread);
 
@@ -204,7 +202,8 @@
   // The child stack is the same address, just growing in the opposite direction.
   // At offsets >= 0, we have the TLS slots.
   // At offsets < 0, we have the child stack.
-  thread->tls = (void**)((uint8_t*)(thread->attr.stack_base) + thread->attr.stack_size - BIONIC_TLS_SLOTS * sizeof(void*));
+  thread->tls = reinterpret_cast<void**>(reinterpret_cast<uint8_t*>(thread->attr.stack_base) +
+                                         thread->attr.stack_size - BIONIC_TLS_SLOTS * sizeof(void*));
   void* child_stack = thread->tls;
   __init_tls(thread);
 
@@ -214,9 +213,8 @@
   // This also provides the memory barrier we need to ensure that all
   // memory accesses previously performed by this thread are visible to
   // the new thread.
-  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
-  pthread_mutex_init(start_mutex, NULL);
-  pthread_mutex_lock(start_mutex);
+  pthread_mutex_init(&thread->startup_handshake_mutex, NULL);
+  pthread_mutex_lock(&thread->startup_handshake_mutex);
 
   thread->start_routine = start_routine;
   thread->start_routine_arg = arg;
@@ -237,7 +235,7 @@
     // We don't have to unlock the mutex at all because clone(2) failed so there's no child waiting to
     // be unblocked, but we're about to unmap the memory the mutex is stored in, so this serves as a
     // reminder that you can't rewrite this function to use a ScopedPthreadMutexLocker.
-    pthread_mutex_unlock(start_mutex);
+    pthread_mutex_unlock(&thread->startup_handshake_mutex);
     if ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) == 0) {
       munmap(thread->attr.stack_base, thread->attr.stack_size);
     }
@@ -252,7 +250,7 @@
     // Letting the thread run is the easiest way to clean up its resources.
     thread->attr.flags |= PTHREAD_ATTR_FLAG_DETACHED;
     thread->start_routine = __do_nothing;
-    pthread_mutex_unlock(start_mutex);
+    pthread_mutex_unlock(&thread->startup_handshake_mutex);
     return init_errno;
   }
 
@@ -264,7 +262,7 @@
 
   // Publish the pthread_t and unlock the mutex to let the new thread start running.
   *thread_out = reinterpret_cast<pthread_t>(thread);
-  pthread_mutex_unlock(start_mutex);
+  pthread_mutex_unlock(&thread->startup_handshake_mutex);
 
   return 0;
 }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 295d9d6..490ae86 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -48,6 +48,8 @@
 
   void* alternate_signal_stack;
 
+  pthread_mutex_t startup_handshake_mutex;
+
   /*
    * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
    * per-thread buffer by simply using malloc(3) and free(3).