Align arm64 stacks to 16 bytes in __bionic_clone.

Also ensure that arm64/x86-64/x86 assembler uses local labels.
(There are so many non-local labels in arm that fixing them
seems out of scope.)

Also synchronize the __bionic_clone.S comments.

Change-Id: I03b4f84780d996b54d6637a074638196bbb01cd4
diff --git a/libc/arch-arm64/bionic/setjmp.S b/libc/arch-arm64/bionic/setjmp.S
index 9a68d86..f9d2266 100644
--- a/libc/arch-arm64/bionic/setjmp.S
+++ b/libc/arch-arm64/bionic/setjmp.S
@@ -85,7 +85,7 @@
     ldr     w9, .L_setjmp_magic
     ldr     w10, [x0, #(_JB_MAGIC * 4)]
     cmp     w9, w10
-    b.ne    botch
+    b.ne    .L_fail
 
     /* restore core registers */
     ldp     x30, x10, [x0, #(_JB_CORE_BASE * 4 + 16 * 0)]
@@ -105,10 +105,10 @@
 
     /* validate sp (sp mod 16 = 0) and lr (lr mod 4 = 0) */
     tst     x30, #3
-    b.ne    botch
+    b.ne    .L_fail
     mov     x10, sp
     tst     x10, #15
-    b.ne    botch
+    b.ne    .L_fail
 
     /* set return value */
     cmp     w1, wzr
@@ -116,7 +116,7 @@
     ret
 
     /* validation failed, die die die */
-botch:
+.L_fail:
     bl      PIC_SYM(longjmperror, PLT)
     bl      PIC_SYM(abort, PLT)
     b       . - 8       /* Cannot get here */