Fixes for the ARM-specific bswap_16, bswap_32, and bswap_64.

1. Make the feature test work by excluding known-deficient processors, so
we don't have to maintain a complete list of all the processors that support
REV and REV16.

2. Don't abuse 'register' (pinning the operand to r4 in Thumb mode) to get
an effect similar to GCC's +l constraint; the old approach was unnecessarily
restrictive.

3. Fix __swap64md so its local _x is no longer shadowed by the _x declared
inside __swap32md, which broke 64-bit swaps.

4. Make <byteswap.h> (which declares bswap_16 and friends) use <endian.h>
rather than <sys/endian.h>, so we get the machine-dependent implementations.

Change-Id: I6a38fad7a9fbe394aff141489617eb3883e1e944
diff --git a/libc/arch-arm/include/endian.h b/libc/arch-arm/include/endian.h
index e34872f..6de0889 100644
--- a/libc/arch-arm/include/endian.h
+++ b/libc/arch-arm/include/endian.h
@@ -33,14 +33,14 @@
 
 #ifdef __GNUC__
 
-/* NOTE: header <machine/cpu-features.h> could not be included directly
- * since it defines extra macros, such as PLD.
+/*
+ * REV and REV16 weren't available on ARM5 or ARM4.
+ * We don't include <machine/cpu-features.h> because it pollutes the
+ * namespace with macros like PLD.
  */
-#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) ||	\
-    defined(__ARM_ARCH_7__) ||					\
-    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || 	\
-    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || 	\
-    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)
+#if !defined __ARM_ARCH_5__ && !defined __ARM_ARCH_5T__ && \
+    !defined __ARM_ARCH_5TE__ && !defined __ARM_ARCH_5TEJ__ && \
+    !defined __ARM_ARCH_4T__ && !defined __ARM_ARCH_4__
 
 /* According to RealView Assembler User's Guide, REV and REV16 are available
  * in Thumb code and 16-bit instructions when used in Thumb-2 code.
@@ -50,44 +50,40 @@
  *
  * REV16 Rd, Rm
  *   Rd and Rm must both be Lo registers.
+ *
+ * The +l constraint takes care of this without constraining us in ARM mode.
  */
-#ifdef __thumb__
-#define REV_LO_REG	asm("r4")
-#else
-#define REV_LO_REG
-#endif
-
-#define __swap16md(x) ({						\
-	register u_int16_t _x REV_LO_REG = (x);				\
-	__asm volatile ("rev16 %0, %0" : "+r" (_x));			\
-	_x;								\
+#define __swap16md(x) ({                                        \
+    register u_int16_t _x = (x);                                \
+    __asm volatile ("rev16 %0, %0" : "+l" (_x));                \
+    _x;                                                         \
 })
 
-#define __swap32md(x) ({						\
-	register u_int32_t _x REV_LO_REG = (x);				\
-	__asm volatile ("rev %0, %0" : "+r" (_x));			\
-	_x;								\
+#define __swap32md(x) ({                                        \
+    register u_int32_t _x = (x);                                \
+    __asm volatile ("rev %0, %0" : "+l" (_x));                  \
+    _x;                                                         \
 })
 
-#define __swap64md(x) ({						\
-	u_int64_t _x = (x);						\
-	(u_int64_t) __swap32md(_x >> 32) |				\
-	(u_int64_t) __swap32md(_x & 0xffffffff) << 32;			\
+#define __swap64md(x) ({                                        \
+    u_int64_t _swap64md_x = (x);                                \
+    (u_int64_t) __swap32md(_swap64md_x >> 32) |                 \
+        (u_int64_t) __swap32md(_swap64md_x & 0xffffffff) << 32; \
 })
 
 /* Tell sys/endian.h we have MD variants of the swap macros.  */
 #define MD_SWAP
 
-#endif	/* __ARM_ARCH__ */
-#endif	/* __GNUC__ */
+#endif  /* __ARM_ARCH__ */
+#endif  /* __GNUC__ */
 
 #ifdef __ARMEB__
 #define _BYTE_ORDER _BIG_ENDIAN
 #else
 #define _BYTE_ORDER _LITTLE_ENDIAN
 #endif
-#define	__STRICT_ALIGNMENT
+#define __STRICT_ALIGNMENT
 #include <sys/types.h>
 #include <sys/endian.h>
 
-#endif	/* !_ARM_ENDIAN_H_ */
+#endif  /* !_ARM_ENDIAN_H_ */
diff --git a/libc/include/byteswap.h b/libc/include/byteswap.h
index 16d2ad4..74b0e91 100644
--- a/libc/include/byteswap.h
+++ b/libc/include/byteswap.h
@@ -28,7 +28,8 @@
 #ifndef _BYTESWAP_H_
 #define _BYTESWAP_H_
 
-#include <sys/endian.h>
+/* endian.h rather than sys/endian.h so we get the machine-specific file. */
+#include <endian.h>
 
 #define  bswap_16(x)   swap16(x)
 #define  bswap_32(x)   swap32(x)