Align arm64 stacks to 16 bytes in __bionic_clone.

Also ensure that arm64/x86-64/x86 assembler uses local labels.
(There are so many non-local labels in arm that fixing them
seems out of scope.)

Also synchronize the __bionic_clone.S comments.

Change-Id: I03b4f84780d996b54d6637a074638196bbb01cd4
diff --git a/libc/arch-arm64/bionic/__bionic_clone.S b/libc/arch-arm64/bionic/__bionic_clone.S
index af91320..74db790 100644
--- a/libc/arch-arm64/bionic/__bionic_clone.S
+++ b/libc/arch-arm64/bionic/__bionic_clone.S
@@ -35,29 +35,36 @@
     mov     x29,  sp
     str     x8,       [sp, #-16]!
 
-    /* store thread pointer & args in child stack */
+    # Align 'child_stack' to 16 bytes.
+    and     x1, x1, #~0xf
+
+    # Copy 'fn' and 'arg' onto the child stack.
     stp     x5, x6, [x1, #-16]
 
-    /* sys_clone */
+    # Zero out the top 32 bits of 'flags'. (Is this necessary?)
     uxtw    x0, w0
+
+    # Make the system call.
     mov     x8, __NR_clone
     svc     #0
 
-    /* check for child/parent */
-    cbz     x0,1f
+    # Are we the child?
+    cbz     x0, .L_bc_child
 
     ldr     x8,       [sp], #16
     ldp     x29, x30, [sp], #16
 
+    # Set errno if something went wrong.
     cmn     x0, #(MAX_ERRNO + 1)
     cneg    x0, x0, hi
     b.hi    __set_errno
 
     ret
 
-    /* thread initialization - set the end of the frame record chain */
-1:
+.L_bc_child:
+    # We're in the child now. Set the end of the frame record chain...
     mov     x29, xzr
+    # ...and call __bionic_clone_entry with the 'fn' and 'arg' we stored on the child stack.
     ldp     x0, x1, [sp, #-16]
     b       __bionic_clone_entry
 END(__bionic_clone)
diff --git a/libc/arch-arm64/bionic/_setjmp.S b/libc/arch-arm64/bionic/_setjmp.S
index dfa861b..3836899 100644
--- a/libc/arch-arm64/bionic/_setjmp.S
+++ b/libc/arch-arm64/bionic/_setjmp.S
@@ -73,7 +73,7 @@
     ldr     w9, .L_setjmp_magic
     ldr     w10, [x0, #(_JB_MAGIC * 4)]
     cmp     w9, w10
-    b.ne    botch
+    b.ne    .L_fail
 
     /* restore core registers */
     ldp     x30, x10, [x0, #(_JB_CORE_BASE * 4 + 16 * 0)]
@@ -93,10 +93,10 @@
 
     /* validate sp (sp mod 16 = 0) and lr (lr mod 4 = 0) */
     tst     x30, #3
-    b.ne    botch
+    b.ne    .L_fail
     mov     x10, sp
     tst     x10, #15
-    b.ne    botch
+    b.ne    .L_fail
 
     /* set return value */
     cmp     w1, wzr
@@ -104,7 +104,7 @@
     ret
 
     /* validation failed, die die die */
-botch:
+.L_fail:
     bl      PIC_SYM(longjmperror, PLT)
     bl      PIC_SYM(abort, PLT)
     b        . - 8       /* Cannot get here */
diff --git a/libc/arch-arm64/bionic/setjmp.S b/libc/arch-arm64/bionic/setjmp.S
index 9a68d86..f9d2266 100644
--- a/libc/arch-arm64/bionic/setjmp.S
+++ b/libc/arch-arm64/bionic/setjmp.S
@@ -85,7 +85,7 @@
     ldr     w9, .L_setjmp_magic
     ldr     w10, [x0, #(_JB_MAGIC * 4)]
     cmp     w9, w10
-    b.ne    botch
+    b.ne    .L_fail
 
     /* restore core registers */
     ldp     x30, x10, [x0, #(_JB_CORE_BASE * 4 + 16 * 0)]
@@ -105,10 +105,10 @@
 
     /* validate sp (sp mod 16 = 0) and lr (lr mod 4 = 0) */
     tst     x30, #3
-    b.ne    botch
+    b.ne    .L_fail
     mov     x10, sp
     tst     x10, #15
-    b.ne    botch
+    b.ne    .L_fail
 
     /* set return value */
     cmp     w1, wzr
@@ -116,7 +116,7 @@
     ret
 
     /* validation failed, die die die */
-botch:
+.L_fail:
     bl      PIC_SYM(longjmperror, PLT)
     bl      PIC_SYM(abort, PLT)
     b       . - 8       /* Cannot get here */