Switch pthread_create over to __bionic_clone.

Bug: 8206355
Bug: 11693195
Change-Id: I04aadbc36c87e1b7e33324b9a930a1e441fbfed6
diff --git a/libc/arch-arm/bionic/_exit_with_stack_teardown.S b/libc/arch-arm/bionic/_exit_with_stack_teardown.S
index c430edb..0d97f06 100644
--- a/libc/arch-arm/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-arm/bionic/_exit_with_stack_teardown.S
@@ -33,12 +33,11 @@
 ENTRY(_exit_with_stack_teardown)
     mov     lr, r2
     ldr     r7, =__NR_munmap
-    swi     #0              // the stack is destroyed by this call
+    swi     #0
+    // If munmap failed, we ignore the failure and exit anyway.
+
     mov     r0, lr
     ldr     r7, =__NR_exit
     swi     #0
-
-    // exit() should never return, cause a crash if it does
-    mov     r0, #0
-    ldr     r0, [r0]
+    // The exit syscall does not return.
 END(_exit_with_stack_teardown)
diff --git a/libc/arch-arm/bionic/clone.S b/libc/arch-arm/bionic/clone.S
index 3556b8e..0782abe 100644
--- a/libc/arch-arm/bionic/clone.S
+++ b/libc/arch-arm/bionic/clone.S
@@ -28,53 +28,7 @@
 
 #include <private/bionic_asm.h>
 
-// int  __pthread_clone(void* (*fn)(void*), void* child_stack, int flags, void* arg);
-ENTRY(__pthread_clone)
-    # Push 'fn' and 'arg' onto 'child_stack'.
-    stmdb   r1!, {r0, r3}
-
-    # The sys_clone system call only takes two arguments: 'flags' and 'child_stack'.
-    # 'child_stack' is already in r1, but we need to move 'flags' into position.
-    mov     r0, r2
-
-    # System call.
-    mov     ip, r7
-    ldr     r7, =__NR_clone
-    swi     #0
-
-    # Child?
-    movs    r0, r0
-    beq     1f
-
-    # Parent.
-    mov     r7, ip
-    cmn     r0, #(MAX_ERRNO + 1)
-    bxls    lr
-    neg     r0, r0
-    b       __set_errno
-
-1:  # Child.
-    # Pop 'fn' and 'arg' back off the stack and call __thread_entry.
-    pop     {r0, r1}
-    # __thread_entry also needs our stack pointer.
-    mov     r2, sp
-    b       __thread_entry
-END(__pthread_clone)
-
-
-    #
-    # This function is defined as:
-    #
-    #   pid_t  __bionic_clone( int  flags, void *child_stack,
-    #                          pid_t *pid, void *tls, pid_t *ctid,
-    #                          int  (*fn)(void *), void* arg );
-    #
-    # NOTE: This is not the same signature as the glibc
-    #       __clone function. Placing 'fn' and 'arg'
-    #       at the end of the parameter list makes the
-    #       implementation much simpler.
-    #
-
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 ENTRY(__bionic_clone)
     mov     ip, sp
     .save   {r4, r5, r6, r7}
diff --git a/libc/arch-mips/bionic/_exit_with_stack_teardown.S b/libc/arch-mips/bionic/_exit_with_stack_teardown.S
index 8351e2e..9cab52b 100644
--- a/libc/arch-mips/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-mips/bionic/_exit_with_stack_teardown.S
@@ -30,7 +30,7 @@
 
 	.text
 
-// void _exit_with_stack_teardown(void * stackBase, size_t stackSize, int status)
+// void _exit_with_stack_teardown(void* stackBase, size_t stackSize, int status)
 
 	.type	_exit_with_stack_teardown, @function
 	.global	_exit_with_stack_teardown
@@ -40,12 +40,11 @@
 	move	$s0,$a2		/* preserve status for exit() call */
 
 	li	$v0,__NR_munmap
-	syscall			/* the stack is destroyed by this call */
+	syscall
+	// If munmap failed, we ignore the failure and exit anyway.
+
 	move	$a0,$s0
 	li	$v0,__NR_exit
 	syscall
-
-	/* exit() should never return, cause a crash if it does */
-	move	$a0,$0
-	lw	$a0,($a0)
+        // The exit syscall does not return.
 	.end	_exit_with_stack_teardown
diff --git a/libc/arch-mips/bionic/clone.S b/libc/arch-mips/bionic/clone.S
index 94e18a2..8970b6e 100644
--- a/libc/arch-mips/bionic/clone.S
+++ b/libc/arch-mips/bionic/clone.S
@@ -30,78 +30,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 
-	.text
-	.type __pthread_clone, @function
-	.global __pthread_clone
-	.align 4
-        .ent __pthread_clone
-
-/*
- * int __pthread_clone(void* (*fn)(void*), void *child_stack,
- *			 int flags, void *arg);
- */
-
-__pthread_clone:
-        .set	noreorder
-        .cpload $t9
-        .set	reorder
-
-	# set up child stack
-	subu	$a1,16
-	sw	$a0,0($a1)	# fn
-	sw	$a3,4($a1)	# arg
-#	sw	$a1+16,8($a1)	# tls
-
-	/*
-	 * int sys_clone(int flags, void *child_stack, int *parent_tidptr,
-	 *	 struct user_desc *newtls, int *child_tidptr);
-	 */
-
-	move	$a0,$a2		# flags
-#	move	$a1,$a1		# child_stack
-	move	$a2,$0		# parent_tidptr
-	move	$a3,$0		# user_desc
-	and	$a0,~(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)
-				# make sure the kernel doesn't access child_tidptr
-
-        li	$v0,__NR_clone
-        syscall
-
-        bnez	$a3,.L__error
-
-        beqz	$v0,.L__thread_start
-
-        j $ra
-
-.L__thread_start:
-        lw	$a0,0($sp)	#  fn
-        lw	$a1,4($sp)	#  arg
-        addu	$a2,$sp,16	#  tls
-
-	# void __thread_entry(void* (*func)(void*), void *arg, void *tls)
-        la	$t9, __thread_entry
-        j	$t9
-
-.L__error:
-	move	$a0,$v0
-	la	$t9,__set_errno
-	j	$t9
-
-        .end __pthread_clone
-
-
-    #
-    # This function is defined as:
-    #
-    #   pid_t  __bionic_clone( int  flags, void *child_stack,
-    #                          pid_t *pid, void *tls, pid_t *ctid,
-    #                          int  (*fn)(void *), void* arg );
-    #
-    # NOTE: This is not the same signature than the GLibc
-    #       __clone function here !! Placing 'fn' and 'arg'
-    #       at the end of the parameter list makes the
-    #       implementation much simpler.
-    #
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 	.text
 	.type __bionic_clone, @function
 	.global __bionic_clone
diff --git a/libc/arch-x86/bionic/_exit_with_stack_teardown.S b/libc/arch-x86/bionic/_exit_with_stack_teardown.S
index 1c6d48a..9128f10 100644
--- a/libc/arch-x86/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-x86/bionic/_exit_with_stack_teardown.S
@@ -6,17 +6,15 @@
     // We can trash %ebx here since this call should never return.
     // We can also take advantage of the fact that the linux syscall trap
     // handler saves all the registers, so we don't need a stack to keep
-    // the status argument for exit while doing the munmap */
+    // the status argument for exit while doing the munmap.
     mov     4(%esp), %ebx             // stackBase
     mov     8(%esp), %ecx             // stackSize
     mov     $__NR_munmap, %eax
     int     $0x80
-
     // If munmap failed, we ignore the failure and exit anyway.
 
     mov     %edx, %ebx                // status
     movl    $__NR_exit, %eax
     int     $0x80
-
     // The exit syscall does not return.
 END(_exit_with_stack_teardown)
diff --git a/libc/arch-x86/bionic/clone.S b/libc/arch-x86/bionic/clone.S
index 457cb4a..eb9f545 100644
--- a/libc/arch-x86/bionic/clone.S
+++ b/libc/arch-x86/bionic/clone.S
@@ -1,68 +1,7 @@
 #include <asm/unistd.h>
 #include <machine/asm.h>
 
-// int  __pthread_clone(void* (*fn)(void*), void* tls, int flags, void* arg);
-ENTRY(__pthread_clone)
-        pushl   %ebx
-        pushl   %ecx
-        movl    16(%esp), %ecx
-
-        # save tls
-        movl    %ecx, %ebx
-        # 16-byte alignment on child stack
-        andl    $~15, %ecx
-
-        # insert arguments onto the child stack
-        movl    12(%esp), %eax
-        movl    %eax, -16(%ecx)
-        movl    24(%esp), %eax
-        movl    %eax, -12(%ecx)
-        movl    %ebx, -8(%ecx)
-
-        subl    $16, %ecx
-        movl    20(%esp), %ebx
-
-        # make system call
-        movl    $__NR_clone, %eax
-        int     $0x80
-
-        cmpl    $0, %eax
-        je      pc_child
-        jg      pc_parent
-
-        # an error occurred, set errno and return -1
-        negl    %eax
-        pushl   %eax
-        call    __set_errno
-        addl    $4, %esp
-        orl     $-1, %eax
-        jmp     pc_return
-
-pc_child:
-        # we're in the child thread now, call __thread_entry
-        # with the appropriate arguments on the child stack
-        # we already placed most of them
-        call    __thread_entry
-        hlt
-
-pc_parent:
-        # we're the parent; nothing to do.
-pc_return:
-        popl    %ecx
-        popl    %ebx
-        ret
-END(__pthread_clone)
-
-
-/*
- * int  __bionic_clone(unsigned long clone_flags,
- *                     void*         newsp,
- *                     int           *parent_tidptr,
- *                     void          *new_tls,
- *                     int           *child_tidptr,
- *                     int           (*fn)(void *),
- *                     void          *arg);
- */
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 ENTRY(__bionic_clone)
         pushl   %ebx
         pushl   %esi
diff --git a/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S b/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S
index a09babe..eca3b68 100644
--- a/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S
+++ b/libc/arch-x86_64/bionic/_exit_with_stack_teardown.S
@@ -36,12 +36,10 @@
     // the status argument for exit(2) while doing the munmap(2).
     mov     $__NR_munmap, %eax
     syscall
-
-    // If munmap failed, ignore the failure and exit anyway.
+    // If munmap failed, we ignore the failure and exit anyway.
 
     mov     %rdx, %rdi        // status
     mov     $__NR_exit, %eax
     syscall
-
     // The exit syscall does not return.
 END(_exit_with_stack_teardown)
diff --git a/libc/arch-x86_64/bionic/clone.S b/libc/arch-x86_64/bionic/clone.S
index 7511e86..b37416b 100644
--- a/libc/arch-x86_64/bionic/clone.S
+++ b/libc/arch-x86_64/bionic/clone.S
@@ -29,53 +29,7 @@
 #include <asm/unistd.h>
 #include <machine/asm.h>
 
-// int  __pthread_clone(void* (*fn)(void*), void* tls, int flags, void* arg);
-ENTRY(__pthread_clone)
-        # Save tls.
-        movq    %rsi, %r11
-        # Enforce 16-byte alignment for child stack.
-        andq    $~15, %rsi
-
-        # Copy 'fn', 'arg', and 'tls' onto the child stack.
-        movq    %rdi, -32(%rsi)  # fn
-        movq    %rcx, -24(%rsi)  # arg
-        movq    %r11, -16(%rsi)  # tls
-        subq    $32, %rsi
-
-        movq    %rdx, %rdi
-        movl    $__NR_clone, %eax
-        syscall
-        testl   %eax, %eax
-        jns     1f
-
-        # An error occurred, set errno and return -1.
-        negl    %eax
-        movl    %eax, %edi
-        call    __set_errno
-        orl     $-1, %eax
-        jmp     2f
-1:
-        jnz     2f
-
-        # We're in the child now, so call __thread_entry
-        # with the arguments from the child stack moved into
-        # the appropriate registers. We avoid pop here to keep
-        # the required 16-byte stack alignment.
-        movq    (%rsp), %rdi    # fn
-        movq    8(%rsp), %rsi   # arg
-        movq    16(%rsp), %rdx  # tls
-        call    __thread_entry
-        hlt
-2:
-        ret
-
-// int __bionic_clone(unsigned long clone_flags,
-//                    void* new_sp,
-//                    int* parent_tid_ptr,
-//                    void* new_tls,
-//                    int* child_tid_ptr,
-//                    int (*fn)(void*),
-//                    void* arg);
+// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
 ENTRY(__bionic_clone)
         # Enforce 16-byte alignment for child stack.
         andq    $~15, %rsi
diff --git a/libc/bionic/__thread_entry.cpp b/libc/bionic/__thread_entry.cpp
index 8300a64..3505f8b 100644
--- a/libc/bionic/__thread_entry.cpp
+++ b/libc/bionic/__thread_entry.cpp
@@ -32,26 +32,28 @@
 
 #include "private/bionic_tls.h"
 
-// This trampoline is called from the assembly _pthread_clone function.
-// Our 'tls' and __pthread_clone's 'child_stack' are one and the same, just growing in
-// opposite directions.
-extern "C" void __thread_entry(void* (*func)(void*), void* arg, void** tls) {
+// This trampoline is called from the assembly __bionic_clone function.
+int __thread_entry(void* arg) {
+  pthread_internal_t* thread = reinterpret_cast<pthread_internal_t*>(arg);
+
   // Wait for our creating thread to release us. This lets it have time to
   // notify gdb about this thread before we start doing anything.
   // This also provides the memory barrier needed to ensure that all memory
   // accesses previously made by the creating thread are visible to us.
-  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &tls[TLS_SLOT_SELF];
+  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
   pthread_mutex_lock(start_mutex);
   pthread_mutex_destroy(start_mutex);
 
-  pthread_internal_t* thread = (pthread_internal_t*) tls[TLS_SLOT_THREAD_ID];
-  thread->tls = tls;
   __init_tls(thread);
 
+  __init_alternate_signal_stack(thread);
+
   if ((thread->internal_flags & PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED) != 0) {
     pthread_exit(NULL);
   }
 
-  void* result = func(arg);
+  void* result = thread->start_routine(thread->start_routine_arg);
   pthread_exit(result);
+
+  return 0;
 }
diff --git a/libc/bionic/bionic_clone.c b/libc/bionic/bionic_clone.c
index 8a17e13..518d996 100644
--- a/libc/bionic/bionic_clone.c
+++ b/libc/bionic/bionic_clone.c
@@ -31,14 +31,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 
-extern int  __bionic_clone(unsigned long   clone_flags,
-                           void*           newsp,
-                           int            *parent_tidptr,
-                           void           *new_tls,
-                           int            *child_tidptr,
-                           int            (*fn)(void *),
-                           void          *arg);
-
+extern pid_t __bionic_clone(uint32_t flags, void* child_stack, int* parent_tid, void* tls, int* child_tid, int (*fn)(void*), void* arg);
 extern void __exit(int status);
 
 /* this function is called from the __bionic_clone
diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index f88a26d..3e092ae 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -49,6 +49,7 @@
 extern "C" abort_msg_t** __abort_message_ptr;
 extern "C" uintptr_t __get_sp(void);
 extern "C" int __system_properties_init(void);
+extern "C" int __set_tls(void* ptr);
 
 // Not public, but well-known in the BSDs.
 const char* __progname;
@@ -96,7 +97,10 @@
   pthread_attr_setstack(&thread.attr, (void*) stack_bottom, stack_size);
   _init_thread(&thread, false);
   __init_tls(&thread);
+  __set_tls(thread.tls);
   tls[TLS_SLOT_BIONIC_PREINIT] = &args;
+
+  __init_alternate_signal_stack(&thread);
 }
 
 void __libc_init_common(KernelArgumentBlock& args) {
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 2153310..386e8d1 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -40,7 +40,7 @@
 #include "private/ErrnoRestorer.h"
 #include "private/ScopedPthreadMutexLocker.h"
 
-extern "C" int __pthread_clone(void* (*fn)(void*), void* child_stack, int flags, void* arg);
+extern "C" pid_t __bionic_clone(uint32_t flags, void* child_stack, int* parent_tid, void* tls, int* child_tid, int (*fn)(void*), void* arg);
 
 #ifdef __i386__
 #define ATTRIBUTES __attribute__((noinline)) __attribute__((fastcall))
@@ -50,15 +50,14 @@
 
 extern "C" void ATTRIBUTES _thread_created_hook(pid_t thread_id);
 
-extern "C" int __set_tls(void* ptr);
-
 static pthread_mutex_t gPthreadStackCreationLock = PTHREAD_MUTEX_INITIALIZER;
 
 static pthread_mutex_t gDebuggerNotificationLock = PTHREAD_MUTEX_INITIALIZER;
 
+// This code is used both by each new pthread and the code that initializes the main thread.
 void  __init_tls(pthread_internal_t* thread) {
-  // Zero-initialize all the slots.
-  for (size_t i = 0; i < BIONIC_TLS_SLOTS; ++i) {
+  // Zero-initialize all the slots after TLS_SLOT_SELF and TLS_SLOT_THREAD_ID.
+  for (size_t i = TLS_SLOT_ERRNO; i < BIONIC_TLS_SLOTS; ++i) {
     thread->tls[i] = NULL;
   }
 
@@ -67,11 +66,10 @@
   thread->tls[TLS_SLOT_THREAD_ID] = thread;
   // GCC looks in the TLS for the stack guard on x86, so copy it there from our global.
   thread->tls[TLS_SLOT_STACK_GUARD] = (void*) __stack_chk_guard;
+}
 
-  __set_tls(thread->tls);
-
+void __init_alternate_signal_stack(pthread_internal_t* thread) {
   // Create and set an alternate signal stack.
-  // This must happen after __set_tls, in case a system call fails and tries to set errno.
   stack_t ss;
   ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
   if (ss.ss_sp != MAP_FAILED) {
@@ -181,24 +179,26 @@
   // The child stack is the same address, just growing in the opposite direction.
   // At offsets >= 0, we have the TLS slots.
   // At offsets < 0, we have the child stack.
-  void** tls = (void**)((uint8_t*)(thread->attr.stack_base) + thread->attr.stack_size - BIONIC_TLS_SLOTS * sizeof(void*));
-  void* child_stack = tls;
+  thread->tls = (void**)((uint8_t*)(thread->attr.stack_base) + thread->attr.stack_size - BIONIC_TLS_SLOTS * sizeof(void*));
+  void* child_stack = thread->tls;
 
-  // Create a mutex for the thread in TLS_SLOT_SELF to wait on once it starts so we can keep
+  // Create a mutex for the thread in TLS to wait on once it starts so we can keep
   // it from doing anything until after we notify the debugger about it
   //
   // This also provides the memory barrier we need to ensure that all
   // memory accesses previously performed by this thread are visible to
   // the new thread.
-  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &tls[TLS_SLOT_SELF];
+  pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
   pthread_mutex_init(start_mutex, NULL);
   ScopedPthreadMutexLocker start_locker(start_mutex);
 
-  tls[TLS_SLOT_THREAD_ID] = thread;
+  thread->tls[TLS_SLOT_THREAD_ID] = thread;
 
-  int flags = CLONE_FILES | CLONE_FS | CLONE_VM | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM;
+  thread->start_routine = start_routine;
+  thread->start_routine_arg = arg;
 
-  int tid = __pthread_clone(start_routine, child_stack, flags, arg);
+  int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | CLONE_SETTLS;
+  int tid = __bionic_clone(flags, child_stack, NULL, thread->tls, NULL, __thread_entry, thread);
   if (tid < 0) {
     int clone_errno = errno;
     if ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) == 0) {
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index 8cca83a..52cfbce 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -42,6 +42,9 @@
     __pthread_cleanup_t*        cleanup_stack;
     void**                      tls;         /* thread-local storage area */
 
+    void* (*start_routine)(void*);
+    void* start_routine_arg;
+
     void* alternate_signal_stack;
 
     /*
@@ -52,8 +55,12 @@
     char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
 };
 
+extern "C" {
+  __LIBC_HIDDEN__ int __thread_entry(void* arg); // Called from assembler.
+}
 __LIBC_HIDDEN__ int _init_thread(pthread_internal_t* thread, bool add_to_thread_list);
 __LIBC_HIDDEN__ void __init_tls(pthread_internal_t* thread);
+__LIBC_HIDDEN__ void __init_alternate_signal_stack(pthread_internal_t*);
 __LIBC_HIDDEN__ void _pthread_internal_add(pthread_internal_t* thread);
 __LIBC_HIDDEN__ pthread_internal_t* __get_thread(void);
 
diff --git a/libc/private/bionic_tls.h b/libc/private/bionic_tls.h
index a14bd3c..094c71c 100644
--- a/libc/private/bionic_tls.h
+++ b/libc/private/bionic_tls.h
@@ -51,6 +51,9 @@
   TLS_SLOT_THREAD_ID,
   TLS_SLOT_ERRNO,
 
+  /* This slot is used when starting a new thread, before any code that needs errno runs. */
+  TLS_SLOT_START_MUTEX = TLS_SLOT_ERRNO,
+
   /* These two aren't used by bionic itself, but allow the graphics code to
    * access TLS directly rather than using the pthread API. */
   TLS_SLOT_OPENGL_API = 3,