Align the child stack to 16 bytes in clone(2).

Also let clone(2) set the TLS for x86.

Also ensure we initialize the TLS before we clone(2) for all architectures.

Change-Id: Ie5fa4466e1c9ee116a281dfedef574c5ba60c0b5
diff --git a/libc/bionic/clone.cpp b/libc/bionic/clone.cpp
index 4ea3c70..2c507c4 100644
--- a/libc/bionic/clone.cpp
+++ b/libc/bionic/clone.cpp
@@ -59,5 +59,10 @@
   }
   va_end(args);
 
+  // Align 'child_stack' to 16 bytes.
+  uintptr_t child_stack_addr = reinterpret_cast<uintptr_t>(child_stack);
+  child_stack_addr &= ~0xf;
+  child_stack = reinterpret_cast<void*>(child_stack_addr);
+
   return __bionic_clone(flags, child_stack, parent_tid, new_tls, child_tid, fn, arg);
 }