Code drop from //branches/cupcake/...@124589
diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S
index d19dfb9..f45b56b 100644
--- a/libc/arch-arm/bionic/memcmp.S
+++ b/libc/arch-arm/bionic/memcmp.S
@@ -25,6 +25,9 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+
+#include <machine/cpu-features.h>
+
     .text
 
     .global memcmp
@@ -41,8 +44,8 @@
  */
 
 memcmp:
-        pld         [r0, #0]
-        pld         [r1, #0]
+        PLD         (r0, #0)
+        PLD         (r1, #0)
 
         /* take of the case where length is 0 or the buffers are the same */
         cmp         r0, r1
@@ -53,8 +56,8 @@
         /* save registers */
         stmfd       sp!, {r4, lr}
         
-        pld         [r0, #32]
-        pld         [r1, #32]
+        PLD         (r0, #32)
+        PLD         (r1, #32)
 
         /* since r0 hold the result, move the first source
          * pointer somewhere else
@@ -104,8 +107,8 @@
         subs        r2, r2, #(32 + 4)
         bmi         1f
         
-0:      pld         [r4, #64]
-        pld         [r1, #64]
+0:      PLD         (r4, #64)
+        PLD         (r1, #64)
         ldr         r0, [r4], #4
         ldr         lr, [r1, #4]!
         eors        r0, r0, ip
@@ -192,8 +195,8 @@
         bic         r1, r1, #3
         ldr         lr, [r1], #4
 
-6:      pld         [r1, #64]
-        pld         [r4, #64]
+6:      PLD         (r1, #64)
+        PLD         (r4, #64)
         mov         ip, lr, lsr #16
         ldr         lr, [r1], #4
         ldr         r0, [r4], #4
diff --git a/libc/arch-arm/bionic/memcmp16.S b/libc/arch-arm/bionic/memcmp16.S
index c6e6d39..38d8b62 100644
--- a/libc/arch-arm/bionic/memcmp16.S
+++ b/libc/arch-arm/bionic/memcmp16.S
@@ -25,6 +25,9 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+
+#include <machine/cpu-features.h>
+
     .text
 
     .global __memcmp16
@@ -41,8 +44,8 @@
  */
 
 __memcmp16:
-        pld         [r0, #0]
-        pld         [r1, #0]
+        PLD         (r0, #0)
+        PLD         (r1, #0)
 
         /* take of the case where length is nul or the buffers are the same */
         cmp         r0, r1
@@ -64,8 +67,8 @@
         bpl         0f
 
         /* small blocks (less then 12 words) */
-        pld         [r0, #32]
-        pld         [r1, #32]
+        PLD         (r0, #32)
+        PLD         (r1, #32)
 
 1:      ldrh        r0, [r3], #2
         ldrh        ip, [r1], #2
@@ -113,8 +116,8 @@
         bmi         1f
         
 0:
-        pld         [r3, #64]
-        pld         [r1, #64]
+        PLD         (r3, #64)
+        PLD         (r1, #64)
         ldr         r0, [r3], #4
         ldr         lr, [r1, #4]!
         eors        r0, r0, ip
@@ -195,8 +198,8 @@
         sub         r2, r2, #8
 
 6:
-        pld         [r3, #64]
-        pld         [r1, #64]
+        PLD         (r3, #64)
+        PLD         (r1, #64)
         mov         ip, lr, lsr #16
         ldr         lr, [r1], #4
         ldr         r0, [r3], #4
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index f6e4a7d..fcb58cd 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -25,6 +25,9 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+
+#include <machine/cpu-features.h>
+
 	.text
 
     .global memcpy
@@ -52,9 +55,9 @@
 
         // preload the destination because we'll align it to a cache line
         // with small writes. Also start the source "pump".
-        pld         [r0, #0]
-        pld         [r1, #0]
-        pld         [r1, #32]
+        PLD         (r0, #0)
+        PLD         (r1, #0)
+        PLD         (r1, #32)
 
 		/* it simplifies things to take care of len<4 early */
 		cmp			r2, #4
@@ -141,8 +144,8 @@
         bic         r12, r1, #0x1F
         add         r12, r12, #64
 
-1:      ldmia		r1!, { r4-r11 }
-        pld         [r12, #64]
+1:      ldmia       r1!, { r4-r11 }
+        PLD         (r12, #64)
         subs        r2, r2, #32
 
         // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
@@ -263,8 +266,8 @@
         ldr         r12, [r1], #4
 1:      mov         r4, r12
 		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        pld         [r1, #64]
-		subs		r2, r2, #32
+        PLD         (r1, #64)
+        subs        r2, r2, #32
         ldrhs       r12, [r1], #4
 		orr			r3, r3, r4,		lsl #16
 		mov			r4, r4,			lsr #16
@@ -290,7 +293,7 @@
         ldr         r12, [r1], #4
 1:      mov         r4, r12
 		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        pld         [r1, #64]
+        PLD         (r1, #64)
 		subs		r2, r2, #32
         ldrhs       r12, [r1], #4
 		orr			r3, r3, r4,		lsl #24
@@ -317,7 +320,7 @@
         ldr         r12, [r1], #4
 1:      mov         r4, r12
 		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        pld         [r1, #64]
+        PLD         (r1, #64)
 		subs		r2, r2, #32
         ldrhs       r12, [r1], #4
 		orr			r3, r3, r4,		lsl #8
diff --git a/libc/arch-arm/bionic/strlen.c b/libc/arch-arm/bionic/strlen.c
index 3d1fe45..01632e3 100644
--- a/libc/arch-arm/bionic/strlen.c
+++ b/libc/arch-arm/bionic/strlen.c
@@ -27,6 +27,7 @@
  */
 #include <string.h>
 #include <stdint.h>
+#include <machine/cpu-features.h>
 
 size_t strlen(const char *s)
 {
@@ -62,7 +63,9 @@
         "ldr     %[v], [ %[s] ], #4         \n"
         "sub     %[l], %[l], %[s]           \n"
         "0:                                 \n"
+#if __ARM_HAVE_PLD
         "pld     [ %[s], #64 ]              \n"
+#endif
         "sub     %[t], %[v], %[mask], lsr #7\n"
         "and     %[t], %[t], %[mask]        \n"
         "bics    %[t], %[t], %[v]           \n"