Add stack unwinding directives to assembly leaf functions.

This allows the real culprit of a native crash to surface in the stack trace.
diff --git a/libc/arch-arm/bionic/atomics_arm.S b/libc/arch-arm/bionic/atomics_arm.S
index b2da09f..f8b23e6 100644
--- a/libc/arch-arm/bionic/atomics_arm.S
+++ b/libc/arch-arm/bionic/atomics_arm.S
@@ -41,6 +41,8 @@
    .equ     kernel_cmpxchg, 0xFFFF0FC0
    .equ     kernel_atomic_base, 0xFFFF0FFF
 __atomic_dec:
+    .fnstart
+    .save {r4, lr}
     stmdb   sp!, {r4, lr}
     mov     r2, r0
 1: @ atomic_dec
@@ -53,8 +55,11 @@
     add     r0, r1, #1
     ldmia   sp!, {r4, lr}
     bx      lr
+    .fnend
 
 __atomic_inc:
+    .fnstart
+    .save {r4, lr}
     stmdb   sp!, {r4, lr}
     mov     r2, r0
 1: @ atomic_inc
@@ -67,9 +72,12 @@
     sub     r0, r1, #1
     ldmia   sp!, {r4, lr}
     bx      lr
+    .fnend
 
 /* r0(old) r1(new) r2(addr) -> r0(zero_if_succeeded) */
 __atomic_cmpxchg:
+    .fnstart
+    .save {r4, lr}
     stmdb   sp!, {r4, lr}
     mov     r4, r0          /* r4 = save oldvalue */
 1: @ atomic_cmpxchg
@@ -84,6 +92,7 @@
 2: @ atomic_cmpxchg
     ldmia   sp!, {r4, lr}
     bx      lr
+    .fnend
 #else
 #define KUSER_CMPXCHG 0xffffffc0
 
diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S
index f45b56b..67dcddc 100644
--- a/libc/arch-arm/bionic/memcmp.S
+++ b/libc/arch-arm/bionic/memcmp.S
@@ -44,6 +44,7 @@
  */
 
 memcmp:
+        .fnstart
         PLD         (r0, #0)
         PLD         (r1, #0)
 
@@ -53,6 +54,7 @@
         moveq       r0, #0
         bxeq        lr
 
+        .save {r4, lr}
         /* save registers */
         stmfd       sp!, {r4, lr}
         
@@ -174,6 +176,7 @@
 9:      /* restore registers and return */
         ldmfd       sp!, {r4, lr}
         bx          lr
+        .fnend
 
 
 
diff --git a/libc/arch-arm/bionic/memcmp16.S b/libc/arch-arm/bionic/memcmp16.S
index 38d8b62..f398588 100644
--- a/libc/arch-arm/bionic/memcmp16.S
+++ b/libc/arch-arm/bionic/memcmp16.S
@@ -44,6 +44,7 @@
  */
 
 __memcmp16:
+        .fnstart
         PLD         (r0, #0)
         PLD         (r1, #0)
 
@@ -79,6 +80,7 @@
         bx          lr
 
 
+        .save {r4, lr}
         /* save registers */
 0:      stmfd       sp!, {r4, lr}
         
@@ -93,6 +95,7 @@
         /* restore registers and return */
         ldmnefd     sp!, {r4, lr}
         bxne        lr
+        .fnend
 
 
 
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 97331d3..4ea2c6d 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -40,6 +40,7 @@
 #define NEON_MAX_PREFETCH_DISTANCE 320
 
 memcpy:
+        .fnstart
 		mov	ip, r0
 		cmp	r2, #16
 		blt     4f	@ Have less than 16 bytes to copy
@@ -128,6 +129,7 @@
 		ldrmib	r3, [r1], #1
 		strmib	r3, [ip], #1
 		bx	lr
+        .fnend
 
 #else	/* __ARM_ARCH__ < 7 */