Align the child stack in clone(2).

Also let clone(2) set the TLS for x86.

Also ensure, on all architectures, that the TLS is initialized before we call clone(2).

Change-Id: Ie5fa4466e1c9ee116a281dfedef574c5ba60c0b5
diff --git a/libc/arch-x86/bionic/__bionic_clone.S b/libc/arch-x86/bionic/__bionic_clone.S
index cb0a363..af6ef19 100644
--- a/libc/arch-x86/bionic/__bionic_clone.S
+++ b/libc/arch-x86/bionic/__bionic_clone.S
@@ -6,9 +6,12 @@
         pushl   %esi
         pushl   %edi
 
-        # Align 'child_stack' to 16 bytes.
-        movl    20(%esp), %ecx
-        andl    $~15, %ecx
+        # Load system call arguments into registers.
+        movl    16(%esp), %ebx   # flags
+        movl    20(%esp), %ecx   # child_stack
+        movl    24(%esp), %edx   # parent_tid
+        movl    28(%esp), %esi   # tls
+        movl    32(%esp), %edi   # child_tid
 
         # Copy 'fn' and 'arg' onto the child stack
         movl    36(%esp), %eax   # Read 'fn'.
@@ -19,11 +22,6 @@
 
         # Make the system call.
         movl    $__NR_clone, %eax
-        movl    16(%esp), %ebx  # flags
-        #movl   %ecx, %ecx      # child stack (already there)
-        movl    24(%esp), %edx  # parent_tid
-        movl    28(%esp), %esi  # tls
-        movl    32(%esp), %edi  # child_tid
         int     $0x80
 
         # Check result.