Align the child stack in clone(2).

Also let clone(2) set the TLS for x86.

Also ensure we initialize the TLS before we clone(2) for all architectures.

Change-Id: Ie5fa4466e1c9ee116a281dfedef574c5ba60c0b5
diff --git a/libc/arch-arm64/bionic/__bionic_clone.S b/libc/arch-arm64/bionic/__bionic_clone.S
index 74db790..ddd8ee0 100644
--- a/libc/arch-arm64/bionic/__bionic_clone.S
+++ b/libc/arch-arm64/bionic/__bionic_clone.S
@@ -35,9 +35,6 @@
     mov     x29,  sp
     str     x8,       [sp, #-16]!
 
-    # Align 'child_stack' to 16 bytes.
-    and     x1, x1, #~0xf
-
     # Copy 'fn' and 'arg' onto the child stack.
     stp     x5, x6, [x1, #-16]
 
diff --git a/libc/arch-x86/bionic/__bionic_clone.S b/libc/arch-x86/bionic/__bionic_clone.S
index cb0a363..af6ef19 100644
--- a/libc/arch-x86/bionic/__bionic_clone.S
+++ b/libc/arch-x86/bionic/__bionic_clone.S
@@ -6,9 +6,12 @@
         pushl   %esi
         pushl   %edi
 
-        # Align 'child_stack' to 16 bytes.
-        movl    20(%esp), %ecx
-        andl    $~15, %ecx
+        # Load system call arguments into registers.
+        movl    16(%esp), %ebx   # flags
+        movl    20(%esp), %ecx   # child_stack
+        movl    24(%esp), %edx   # parent_tid
+        movl    28(%esp), %esi   # tls
+        movl    32(%esp), %edi   # child_tid
 
         # Copy 'fn' and 'arg' onto the child stack
         movl    36(%esp), %eax   # Read 'fn'.
@@ -19,11 +22,6 @@
 
         # Make the system call.
         movl    $__NR_clone, %eax
-        movl    16(%esp), %ebx  # flags
-        #movl   %ecx, %ecx      # child stack (already there)
-        movl    24(%esp), %edx  # parent_tid
-        movl    28(%esp), %esi  # tls
-        movl    32(%esp), %edi  # child_tid
         int     $0x80
 
         # Check result.
diff --git a/libc/arch-x86/bionic/__set_tls.c b/libc/arch-x86/bionic/__set_tls.c
index 7ed4b01..722ec6f 100644
--- a/libc/arch-x86/bionic/__set_tls.c
+++ b/libc/arch-x86/bionic/__set_tls.c
@@ -25,77 +25,50 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+
+#include <limits.h>
 #include <pthread.h>
+#include <stdbool.h>
 
+#include <asm/ldt.h>
 
-struct user_desc {
-    unsigned int    entry_number;
-    unsigned long   base_addr;
-    unsigned int    limit;
-    unsigned int    seg_32bit:1;
-    unsigned int    contents:2;
-    unsigned int    read_exec_only:1;
-    unsigned int    limit_in_pages:1;
-    unsigned int    seg_not_present:1;
-    unsigned int    useable:1;
-    unsigned int    empty:25;
-};
+extern int __set_thread_area(struct user_desc*);
 
-extern int __set_thread_area(struct user_desc *u_info);
+__LIBC_HIDDEN__ void __init_user_desc(struct user_desc* result, bool allocate, void* base_addr) {
+  if (allocate) {
+    // Let the kernel choose; __set_tls reads the chosen index back from entry_number.
+    result->entry_number = -1;
+  } else {
+    // Get the existing entry number from %gs (selector bits 3 and up).
+    uint32_t gs;
+    __asm__ __volatile__("movw %%gs, %w0" : "=q"(gs) /*output*/);
+    result->entry_number = (gs & 0xffff) >> 3;  // Mask first: movw only wrote the low 16 bits.
+  }
 
-/* the following can't be const, since the first call will
- * update the 'entry_number' field
- */
-static struct user_desc  _tls_desc =
-{
-    -1,
-    0,
-    0x1000,
-    1,
-    0,
-    0,
-    1,
-    0,
-    1,
-    0
-};
+  result->base_addr = (uintptr_t) base_addr;  // The segment starts at the caller's TLS area.
 
-static pthread_mutex_t  _tls_desc_lock = PTHREAD_MUTEX_INITIALIZER;
+  result->limit = PAGE_SIZE;  // In units of pages, given limit_in_pages = 1.
 
-struct _thread_area_head {
-    void *self;
-};
-
-/* we implement thread local storage through the gs: segment descriptor
- * we create a segment descriptor for the tls
- */
-int __set_tls(void *ptr)
-{
-    int   rc, segment;
-
-    pthread_mutex_lock(&_tls_desc_lock);
-    _tls_desc.base_addr = (unsigned long)ptr;
-
-    /* We also need to write the location of the tls to ptr[0] */
-    ((struct _thread_area_head *)ptr)->self = ptr;
-
-    rc = __set_thread_area( &_tls_desc );
-    if (rc != 0)
-    {
-        /* could not set thread local area */
-        pthread_mutex_unlock(&_tls_desc_lock);
-        return -1;
-    }
-
-    /* this weird computation comes from GLibc */
-    segment = _tls_desc.entry_number*8 + 3;
-    asm __volatile__ (
-        "   movw %w0, %%gs" :: "q"(segment)
-    );
-    pthread_mutex_unlock(&_tls_desc_lock);
-
-    return 0;
+  result->seg_32bit = 1;
+  result->contents = MODIFY_LDT_CONTENTS_DATA;
+  result->read_exec_only = 0;  // Writable.
+  result->limit_in_pages = 1;
+  result->seg_not_present = 0;
+  result->useable = 1;
 }
 
+int __set_tls(void* ptr) {  // Points %gs at a segment based at ptr; returns __set_thread_area's result.
+  struct user_desc tls_descriptor;
+  __init_user_desc(&tls_descriptor, true, ptr);  // true: ask for a newly chosen entry number.
 
+  int rc = __set_thread_area(&tls_descriptor);
+  if (rc != -1) {
+    // Change %gs to be new GDT entry, using the entry number now held in tls_descriptor.
+    uint16_t table_indicator = 0;  // 0 selects the GDT.
+    uint16_t rpl = 3;  // Requested privilege level (low two selector bits).
+    uint16_t selector = (tls_descriptor.entry_number << 3) | table_indicator | rpl;
+    __asm__ __volatile__("movw %w0, %%gs" : /*output*/ : "q"(selector) /*input*/ : /*clobber*/);
+  }
 
+  return rc;
+}
diff --git a/libc/arch-x86_64/bionic/__bionic_clone.S b/libc/arch-x86_64/bionic/__bionic_clone.S
index 62c9666..db7d05c 100644
--- a/libc/arch-x86_64/bionic/__bionic_clone.S
+++ b/libc/arch-x86_64/bionic/__bionic_clone.S
@@ -30,9 +30,6 @@
 
 // pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 ENTRY(__bionic_clone)
-        # Align 'child_stack' to 16 bytes.
-        andq    $~15, %rsi
-
         # Copy 'fn' and 'arg' onto the child stack.
         movq    %r9, -16(%rsi)  # fn
         movq    8(%rsp), %rax   # Read 'arg'.
diff --git a/libc/arch-x86_64/bionic/__set_tls.c b/libc/arch-x86_64/bionic/__set_tls.c
index 9a69449..cc7a5f4 100644
--- a/libc/arch-x86_64/bionic/__set_tls.c
+++ b/libc/arch-x86_64/bionic/__set_tls.c
@@ -32,7 +32,5 @@
 extern int __arch_prctl(int, unsigned long);
 
 int __set_tls(void* ptr) {
-  // We also need to write the location of the tls to ptr[0].
-  *(void**) ptr = ptr;
   return __arch_prctl(ARCH_SET_FS, (uintptr_t) ptr);
 }
diff --git a/libc/bionic/clone.cpp b/libc/bionic/clone.cpp
index 4ea3c70..2c507c4 100644
--- a/libc/bionic/clone.cpp
+++ b/libc/bionic/clone.cpp
@@ -59,5 +59,10 @@
   }
   va_end(args);
 
+  // Align 'child_stack' to 16 bytes.
+  uintptr_t child_stack_addr = reinterpret_cast<uintptr_t>(child_stack);
+  child_stack_addr &= ~0xf;
+  child_stack = reinterpret_cast<void*>(child_stack_addr);
+
   return __bionic_clone(flags, child_stack, parent_tid, new_tls, child_tid, fn, arg);
 }
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 3d0daf7..f62dc15 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -39,9 +39,6 @@
 #include "private/ErrnoRestorer.h"
 #include "private/ScopedPthreadMutexLocker.h"
 
-extern "C" pid_t __bionic_clone(uint32_t flags, void* child_stack, int* parent_tid, void* tls, int* child_tid, int (*fn)(void*), void* arg);
-extern "C" int __set_tls(void*);
-
 // Used by gdb to track thread creation. See libthread_db.
 #ifdef __i386__
 extern "C" __attribute__((noinline)) __attribute__((fastcall)) void _thread_created_hook(pid_t) {}
@@ -49,6 +46,12 @@
 extern "C" __attribute__((noinline)) void _thread_created_hook(pid_t) {}
 #endif
 
+// x86 uses a segment descriptor, not a direct TLS pointer. Must match arch-x86/bionic/__set_tls.c.
+#if defined(__i386__)
+#include <asm/ldt.h>
+extern "C" __LIBC_HIDDEN__ void __init_user_desc(struct user_desc*, bool, void*);
+#endif
+
 static pthread_mutex_t gPthreadStackCreationLock = PTHREAD_MUTEX_INITIALIZER;
 
 static pthread_mutex_t gDebuggerNotificationLock = PTHREAD_MUTEX_INITIALIZER;
@@ -62,10 +65,6 @@
     thread->tls[i] = NULL;
   }
 
-#if defined(__i386__)
-  __set_tls(thread->tls);
-#endif
-
   // Slot 0 must point to itself. The x86 Linux kernel reads the TLS from %fs:0.
   thread->tls[TLS_SLOT_SELF] = thread->tls;
   thread->tls[TLS_SLOT_THREAD_ID] = thread;
@@ -148,8 +147,7 @@
   pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
   pthread_mutex_lock(start_mutex);
   pthread_mutex_destroy(start_mutex);
-
-  __init_tls(thread);
+  thread->tls[TLS_SLOT_START_MUTEX] = NULL;
 
   __init_alternate_signal_stack(thread);
 
@@ -208,6 +206,7 @@
   // At offsets < 0, we have the child stack.
   thread->tls = (void**)((uint8_t*)(thread->attr.stack_base) + thread->attr.stack_size - BIONIC_TLS_SLOTS * sizeof(void*));
   void* child_stack = thread->tls;
+  __init_tls(thread);
 
   // Create a mutex for the thread in TLS to wait on once it starts so we can keep
   // it from doing anything until after we notify the debugger about it
@@ -219,20 +218,20 @@
   pthread_mutex_init(start_mutex, NULL);
   pthread_mutex_lock(start_mutex);
 
-  thread->tls[TLS_SLOT_THREAD_ID] = thread;
-
   thread->start_routine = start_routine;
   thread->start_routine_arg = arg;
 
   int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
       CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
+  void* tls = thread->tls;
 #if defined(__i386__)
   // On x86 (but not x86-64), CLONE_SETTLS takes a pointer to a struct user_desc rather than
-  // a pointer to the TLS itself. Rather than try to deal with that here, we just let x86 set
-  // the TLS manually in __init_tls, like all architectures used to.
-  flags &= ~CLONE_SETTLS;
+  // a pointer to the TLS itself.
+  user_desc tls_descriptor;
+  __init_user_desc(&tls_descriptor, false, tls);
+  tls = &tls_descriptor;
 #endif
-  int rc = __bionic_clone(flags, child_stack, &(thread->tid), thread->tls, &(thread->tid), __pthread_start, thread);
+  int rc = clone(__pthread_start, child_stack, flags, thread, &(thread->tid), tls, &(thread->tid));
   if (rc == -1) {
     int clone_errno = errno;
     // We don't have to unlock the mutex at all because clone(2) failed so there's no child waiting to