Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now.

We'll need the fallbacks in any case. However, once we've
decided how to specify the CPU features of the user's
machine, we can use the nicer LZCNT/TZCNT/POPCNT as well.

Adds cmov, bsf, and bsr instructions.

Calls a popcount helper function for machines without SSE4.2.

Not handling bswap yet (which can also take i16 params).
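
For reference, a rough C++ sketch of what the non-LZCNT ctlz fallback
computes (illustrative only; the function below is not part of this CL,
which instead emits the bsr/cmov/xor sequence directly in lowerCountZeros):

  #include <stdint.h>

  static uint32_t ctlz32_fallback(uint32_t x) {
    // bsr leaves its destination unchanged when the input is 0, so the
    // lowering seeds the result with 63 and only cmovne's the bsr result
    // over it when the input was nonzero.
    uint32_t bit_pos = 63;
    for (int i = 31; i >= 0; --i) {  // models bsr: index of highest set bit
      if (x & (1u << i)) {
        bit_pos = static_cast<uint32_t>(i);
        break;
      }
    }
    // xor with 31 maps bit position N (N <= 31) to 31 - N leading zeros,
    // and maps the all-zero sentinel 63 to 32.
    return bit_pos ^ 31;
  }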

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org, wala@chromium.org

Review URL: https://codereview.chromium.org/390443005
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index bba53d7..913cc5b 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -39,6 +39,13 @@
     ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
         --dir="${OUTDIR}" \
         --llvm-bin-path="${LLVM_BIN_PATH}" \
+        --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
+        --driver=test_bitmanip_main.cpp \
+        --output=test_bitmanip_O${optlevel}
+
+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+        --dir="${OUTDIR}" \
+        --llvm-bin-path="${LLVM_BIN_PATH}" \
         --test=test_cast.cpp --test=test_cast_to_u1.ll \
         --driver=test_cast_main.cpp \
         --output=test_cast_O${optlevel}
@@ -81,6 +88,7 @@
     "${OUTDIR}"/simple_loop_O${optlevel}
     "${OUTDIR}"/mem_intrin_O${optlevel}
     "${OUTDIR}"/test_arith_O${optlevel}
+    "${OUTDIR}"/test_bitmanip_O${optlevel}
     "${OUTDIR}"/test_cast_O${optlevel}
     "${OUTDIR}"/test_fcmp_O${optlevel}
     "${OUTDIR}"/test_global_O${optlevel}
diff --git a/crosstest/test_bitmanip.cpp b/crosstest/test_bitmanip.cpp
new file mode 100644
index 0000000..2ebe8a4
--- /dev/null
+++ b/crosstest/test_bitmanip.cpp
@@ -0,0 +1,40 @@
+//===- subzero/crosstest/test_bitmanip.cpp - Implementation for tests. ----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This aims to test that all the bit manipulation intrinsics work, via
+// cross-testing. This calls wrappers (my_{ctlz,cttz,ctpop}) around the
+// intrinsics (llvm.{ctlz,cttz,ctpop}.*).
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#include <cstdlib>
+
+#include "test_bitmanip.h"
+
+#define X(inst, type)                                                          \
+  type test_##inst(type a) { return my_##inst(a); }                            \
+  type test_alloca_##inst(type a) {                                            \
+    const size_t buf_size = 8;                                                 \
+    type buf[buf_size];                                                        \
+    for (size_t i = 0; i < buf_size; ++i) {                                    \
+      buf[i] = my_##inst(a);                                                   \
+    }                                                                          \
+    type sum = 0;                                                              \
+    for (size_t i = 0; i < buf_size; ++i) {                                    \
+      sum += buf[i];                                                           \
+    }                                                                          \
+    return sum;                                                                \
+  }                                                                            \
+  type test_const_##inst(type ignored) {                                       \
+    return my_##inst(static_cast<type>(0x12340));                              \
+  }
+
+FOR_ALL_BMI_OP_TYPES(X)
+#undef X
diff --git a/crosstest/test_bitmanip.def b/crosstest/test_bitmanip.def
new file mode 100644
index 0000000..b164ab7
--- /dev/null
+++ b/crosstest/test_bitmanip.def
@@ -0,0 +1,42 @@
+//===- subzero/crosstest/test_bitmanip.def - macros for tests -*- C++ -*---===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for testing bit manipulation intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_BIT_MANIP_DEF
+#define TEST_BIT_MANIP_DEF
+
+#define STR(s) #s
+
+#define BMI_OPS  \
+  /* inst */     \
+  X(ctlz)        \
+  X(cttz)        \
+  X(ctpop)
+// #define X(inst)
+
+#define BMI_TYPES \
+  /* type */      \
+  X(uint32_t)     \
+  X(uint64_t)
+// #define X(type)
+
+#define FOR_ALL_BMI_TYPES_INST(F, inst) \
+  F(inst, uint32_t)                     \
+  F(inst, uint64_t)
+
+#define FOR_ALL_BMI_OP_TYPES(X) \
+  FOR_ALL_BMI_TYPES_INST(X, ctlz)     \
+  FOR_ALL_BMI_TYPES_INST(X, cttz)     \
+  FOR_ALL_BMI_TYPES_INST(X, ctpop)
+//#define X(inst, type)
+
+#endif // TEST_BIT_MANIP_DEF
diff --git a/crosstest/test_bitmanip.h b/crosstest/test_bitmanip.h
new file mode 100644
index 0000000..7c4efdb
--- /dev/null
+++ b/crosstest/test_bitmanip.h
@@ -0,0 +1,24 @@
+//===- subzero/crosstest/test_bitmanip.h - Test prototypes ---*- C++ -*----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for cross testing bit
+// manipulation intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "test_bitmanip.def"
+
+#define X(inst, type)                                                         \
+  type test_##inst(type a);                                                   \
+  type test_alloca_##inst(type a);                                            \
+  type test_const_##inst(type ignored);                                       \
+  type my_##inst(type a);
+
+FOR_ALL_BMI_OP_TYPES(X)
+#undef X
diff --git a/crosstest/test_bitmanip_intrin.ll b/crosstest/test_bitmanip_intrin.ll
new file mode 100644
index 0000000..23df538
--- /dev/null
+++ b/crosstest/test_bitmanip_intrin.ll
@@ -0,0 +1,46 @@
+; Wrappers around the bit manipulation intrinsics, which use name mangling
+; for encoding the type in the name instead of plain "C" suffixes.
+; E.g., my_ctpop(unsigned long long) vs __builtin_popcountll(...)
+; Also, normalize the wrappers to take a single parameter, even when the
+; underlying intrinsic takes two, as is the case for ctlz and cttz.
+
+target triple = "i686-pc-linux-gnu"
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
+
+define i32 @_Z7my_ctlzj(i32 %a) {
+  %x = call i32 @llvm.ctlz.i32(i32 %a, i1 0)
+  ret i32 %x
+}
+
+define i64 @_Z7my_ctlzy(i64 %a) {
+  %x = call i64 @llvm.ctlz.i64(i64 %a, i1 0)
+  ret i64 %x
+}
+
+define i32 @_Z7my_cttzj(i32 %a) {
+  %x = call i32 @llvm.cttz.i32(i32 %a, i1 0)
+  ret i32 %x
+}
+
+define i64 @_Z7my_cttzy(i64 %a) {
+  %x = call i64 @llvm.cttz.i64(i64 %a, i1 0)
+  ret i64 %x
+}
+
+define i32 @_Z8my_ctpopj(i32 %a) {
+  %x = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %x
+}
+
+define i64 @_Z8my_ctpopy(i64 %a) {
+  %x = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %x
+}
diff --git a/crosstest/test_bitmanip_main.cpp b/crosstest/test_bitmanip_main.cpp
new file mode 100644
index 0000000..b3ad585
--- /dev/null
+++ b/crosstest/test_bitmanip_main.cpp
@@ -0,0 +1,110 @@
+//===- subzero/crosstest/test_bitmanip_main.cpp - Driver for tests. -------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing bit manipulation intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
+   --driver=test_bitmanip_main.cpp --prefix=Subzero_ --output=test_bitmanip */
+
+#include <stdint.h>
+
+#include <climits>
+#include <iostream>
+
+// Include test_bitmanip.h twice - once normally, and once within the
+// Subzero_ namespace, corresponding to the llc and Subzero translated
+// object files, respectively.
+#include "test_bitmanip.h"
+namespace Subzero_ {
+#include "test_bitmanip.h"
+}
+
+volatile uint64_t Values[] = {
+    0,                    1,                    0x7e,
+    0x7f,                 0x80,                 0x81,
+    0xfe,                 0xff,                 0x7ffe,
+    0x7fff,               0x8000,               0x8001,
+    0xfffe,               0xffff,
+    0x007fffff /*Max subnormal + */,
+    0x00800000 /*Min+ */, 0x7f7fffff /*Max+ */,
+    0x7f800000 /*+Inf*/,  0xff800000 /*-Inf*/,
+    0x7fa00000 /*SNaN*/,  0x7fc00000 /*QNaN*/,
+    0x7ffffffe,           0x7fffffff,           0x80000000,
+    0x80000001,           0xfffffffe,           0xffffffff,
+    0x100000000ll,        0x100000001ll,
+    0x000fffffffffffffll /*Max subnormal + */,
+    0x0010000000000000ll /*Min+ */,
+    0x7fefffffffffffffll /*Max+ */,
+    0x7ff0000000000000ll /*+Inf*/,
+    0xfff0000000000000ll /*-Inf*/,
+    0x7ff0000000000001ll /*SNaN*/,
+    0x7ff8000000000000ll /*QNaN*/,
+    0x7ffffffffffffffell, 0x7fffffffffffffffll, 0x8000000000000000ll,
+    0x8000000000000001ll, 0xfffffffffffffffell, 0xffffffffffffffffll };
+
+const static size_t NumValues = sizeof(Values) / sizeof(*Values);
+
+template <typename Type>
+void testBitManip(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef Type (*FuncType)(Type);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {
+#define X(inst)                                                             \
+  {                                                                         \
+    STR(inst), test_##inst, Subzero_::test_##inst                           \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst   \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_const", test_const_##inst, Subzero_::test_const_##inst      \
+  },
+      BMI_OPS
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    for (size_t i = 0; i < NumValues; ++i) {
+      Type Value = static_cast<Type>(Values[i]);
+      ++TotalTests;
+      Type ResultSz = Funcs[f].FuncSz(Value);
+      Type ResultLlc = Funcs[f].FuncLlc(Value);
+      if (ResultSz == ResultLlc) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "test_" << Funcs[f].Name
+                  << (CHAR_BIT * sizeof(Type)) << "("
+                  << static_cast<uint64_t>(Value)
+                  << "): sz=" << static_cast<uint64_t>(ResultSz)
+                  << " llc=" << static_cast<uint64_t>(ResultLlc)
+                  << "\n";
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  size_t TotalTests = 0;
+  size_t Passes = 0;
+  size_t Failures = 0;
+
+  testBitManip<uint32_t>(TotalTests, Passes, Failures);
+  testBitManip<uint64_t>(TotalTests, Passes, Failures);
+
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+  return Failures;
+}
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index c0e8c8d..57fb179 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -94,6 +94,11 @@
   addSource(Source2);
 }
 
+InstX8632Neg::InstX8632Neg(Cfg *Func, Operand *SrcDest)
+    : InstX8632(Func, InstX8632::Neg, 1, llvm::dyn_cast<Variable>(SrcDest)) {
+  addSource(SrcDest);
+}
+
 InstX8632Shld::InstX8632Shld(Cfg *Func, Variable *Dest, Variable *Source1,
                              Variable *Source2)
     : InstX8632(Func, InstX8632::Shld, 3, Dest) {
@@ -121,7 +126,7 @@
 }
 
 InstX8632Br::InstX8632Br(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
-                         InstX8632Label *Label, InstX8632Br::BrCond Condition)
+                         InstX8632Label *Label, InstX8632::BrCond Condition)
     : InstX8632(Func, InstX8632::Br, 0, NULL), Condition(Condition),
       TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label) {}
 
@@ -139,6 +144,15 @@
   addSource(Source);
 }
 
+InstX8632Cmov::InstX8632Cmov(Cfg *Func, Variable *Dest, Operand *Source,
+                             InstX8632::BrCond Condition)
+    : InstX8632(Func, InstX8632::Cmov, 2, Dest), Condition(Condition) {
+  // The final result is either the original Dest, or Source, so mark
+  // both as sources.
+  addSource(Dest);
+  addSource(Source);
+}
+
 InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
                                    Variable *Eax, Variable *Desired,
                                    bool Locked)
@@ -297,11 +311,6 @@
   return false;
 }
 
-InstX8632Sqrtss::InstX8632Sqrtss(Cfg *Func, Variable *Dest, Operand *Source)
-    : InstX8632(Func, InstX8632::Sqrtss, 1, Dest) {
-  addSource(Source);
-}
-
 InstX8632Ret::InstX8632Ret(Cfg *Func, Variable *Source)
     : InstX8632(Func, InstX8632::Ret, Source ? 1 : 0, NULL) {
   if (Source)
@@ -429,7 +438,9 @@
   Str << "\n";
 }
 
-template <> const char *InstX8632Neg::Opcode = "neg";
+template <> const char *InstX8632Bsf::Opcode = "bsf";
+template <> const char *InstX8632Bsr::Opcode = "bsr";
+template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
 template <> const char *InstX8632Add::Opcode = "add";
 template <> const char *InstX8632Addps::Opcode = "addps";
 template <> const char *InstX8632Adc::Opcode = "adc";
@@ -453,6 +464,18 @@
 template <> const char *InstX8632Shr::Opcode = "shr";
 template <> const char *InstX8632Sar::Opcode = "sar";
 
+template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Type Ty = getSrc(0)->getType();
+  assert(Ty == IceType_f32 || Ty == IceType_f64);
+  Str << "\tsqrt" << TypeX8632Attributes[Ty].SdSsString << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << "\n";
+}
+
 template <> void InstX8632Addss::emit(const Cfg *Func) const {
   char buf[30];
   snprintf(buf, llvm::array_lengthof(buf), "add%s",
@@ -523,6 +546,21 @@
   dumpSources(Func);
 }
 
+void InstX8632Neg::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Str << "\tneg\t";
+  getSrc(0)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Neg::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = neg." << getDest()->getType() << " ";
+  dumpSources(Func);
+}
+
 void InstX8632Shld::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 3);
@@ -586,6 +624,27 @@
   dumpSources(Func);
 }
 
+void InstX8632Cmov::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t";
+  assert(Condition != Br_None);
+  assert(getDest()->hasReg());
+  Str << "cmov" << InstX8632BrAttributes[Condition].DisplayString << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Cmov::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "cmov" << InstX8632BrAttributes[Condition].DisplayString << ".";
+  Str << getDest()->getType() << " ";
+  dumpDest(Func);
+  Str << ", ";
+  dumpSources(Func);
+}
+
 void InstX8632Cmpxchg::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 3);
@@ -1007,25 +1066,6 @@
   dumpSources(Func);
 }
 
-void InstX8632Sqrtss::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 1);
-  Type Ty = getSrc(0)->getType();
-  assert(Ty == IceType_f32 || Ty == IceType_f64);
-  Str << "\tsqrt" << TypeX8632Attributes[Ty].SdSsString << "\t";
-  getDest()->emit(Func);
-  Str << ", ";
-  getSrc(0)->emit(Func);
-  Str << "\n";
-}
-
-void InstX8632Sqrtss::dump(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = sqrt." << getDest()->getType() << " ";
-  dumpSources(Func);
-}
-
 void InstX8632Xadd::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   if (Locked) {
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 25beb6d..3f40614 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -139,8 +139,11 @@
     Addss,
     And,
     Br,
+    Bsf,
+    Bsr,
     Call,
     Cdq,
+    Cmov,
     Cmpxchg,
     Cmpxchg8b,
     Cvt,
@@ -188,6 +191,14 @@
     Xchg,
     Xor
   };
+
+  enum BrCond {
+#define X(tag, dump, emit) tag,
+    ICEINSTX8632BR_TABLE
+#undef X
+        Br_None
+  };
+
   static const char *getWidthString(Type Ty);
   virtual void emit(const Cfg *Func) const = 0;
   virtual void dump(const Cfg *Func) const;
@@ -262,13 +273,6 @@
 // Conditional and unconditional branch instruction.
 class InstX8632Br : public InstX8632 {
 public:
-  enum BrCond {
-#define X(tag, dump, emit) tag,
-    ICEINSTX8632BR_TABLE
-#undef X
-        Br_None
-  };
-
   // Create a conditional branch to a node.
   static InstX8632Br *create(Cfg *Func, CfgNode *TargetTrue,
                              CfgNode *TargetFalse, BrCond Condition) {
@@ -334,16 +338,16 @@
 template <InstX8632::InstKindX8632 K>
 class InstX8632Unaryop : public InstX8632 {
 public:
-  // Create an unary-op instruction like neg.
-  // The source and dest are the same variable.
-  static InstX8632Unaryop *create(Cfg *Func, Operand *SrcDest) {
+  static InstX8632Unaryop *create(Cfg *Func, Variable *Dest, Operand *Src) {
     return new (Func->allocate<InstX8632Unaryop>())
-        InstX8632Unaryop(Func, SrcDest);
+        InstX8632Unaryop(Func, Dest, Src);
   }
   virtual void emit(const Cfg *Func) const {
     Ostream &Str = Func->getContext()->getStrEmit();
     assert(getSrcSize() == 1);
     Str << "\t" << Opcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
     getSrc(0)->emit(Func);
     Str << "\n";
   }
@@ -356,9 +360,9 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
-  InstX8632Unaryop(Cfg *Func, Operand *SrcDest)
-      : InstX8632(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
-    addSource(SrcDest);
+  InstX8632Unaryop(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstX8632(Func, K, 1, Dest) {
+    addSource(Src);
   }
   InstX8632Unaryop(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
   InstX8632Unaryop &operator=(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
@@ -438,7 +442,9 @@
   static const char *Opcode;
 };
 
-typedef InstX8632Unaryop<InstX8632::Neg> InstX8632Neg;
+typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf;
+typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
+typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
 typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
 typedef InstX8632Binop<InstX8632::Adc> InstX8632Adc;
@@ -503,6 +509,23 @@
   virtual ~InstX8632Mul() {}
 };
 
+// Neg instruction - Two's complement negation.
+class InstX8632Neg : public InstX8632 {
+public:
+  static InstX8632Neg *create(Cfg *Func, Operand *SrcDest) {
+    return new (Func->allocate<InstX8632Neg>()) InstX8632Neg(Func, SrcDest);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Neg); }
+
+private:
+  InstX8632Neg(Cfg *Func, Operand *SrcDest);
+  InstX8632Neg(const InstX8632Neg &) LLVM_DELETED_FUNCTION;
+  InstX8632Neg &operator=(const InstX8632Neg &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Neg() {}
+};
+
 // Shld instruction - shift across a pair of operands.  TODO: Verify
 // that the validator accepts the shld instruction.
 class InstX8632Shld : public InstX8632 {
@@ -563,6 +586,27 @@
   virtual ~InstX8632Cdq() {}
 };
 
+// Conditional move instruction.
+class InstX8632Cmov : public InstX8632 {
+public:
+  static InstX8632Cmov *create(Cfg *Func, Variable *Dest, Operand *Source,
+                               BrCond Cond) {
+    return new (Func->allocate<InstX8632Cmov>())
+        InstX8632Cmov(Func, Dest, Source, Cond);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmov); }
+
+private:
+  InstX8632Cmov(Cfg *Func, Variable *Dest, Operand *Source, BrCond Cond);
+  InstX8632Cmov(const InstX8632Cmov &) LLVM_DELETED_FUNCTION;
+  InstX8632Cmov &operator=(const InstX8632Cmov &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cmov() {}
+
+  BrCond Condition;
+};
+
 // Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
 // equals eax. If so, the ZF is set and <desired> is stored in <dest>.
 // If not, ZF is cleared and <dest> is copied to eax (or subregister).
@@ -948,24 +992,6 @@
   virtual ~InstX8632Ret() {}
 };
 
-// Sqrtss - Scalar sqrt of a float or double.
-class InstX8632Sqrtss : public InstX8632 {
-public:
-  static InstX8632Sqrtss *create(Cfg *Func, Variable *Dest, Operand *Source) {
-    return new (Func->allocate<InstX8632Sqrtss>())
-        InstX8632Sqrtss(Func, Dest, Source);
-  }
-  virtual void emit(const Cfg *Func) const;
-  virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Sqrtss); }
-
-private:
-  InstX8632Sqrtss(Cfg *Func, Variable *Dest, Operand *Source);
-  InstX8632Sqrtss(const InstX8632Sqrtss &) LLVM_DELETED_FUNCTION;
-  InstX8632Sqrtss &operator=(const InstX8632Sqrtss &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Sqrtss() {}
-};
-
 // Exchanging Add instruction.  Exchanges the first operand (destination
 // operand) with the second operand (source operand), then loads the sum
 // of the two values into the destination operand. The destination may be
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 38b6fc6..45c3151 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -39,7 +39,7 @@
 const struct TableFcmp_ {
   uint32_t Default;
   bool SwapOperands;
-  InstX8632Br::BrCond C1, C2;
+  InstX8632::BrCond C1, C2;
 } TableFcmp[] = {
 #define X(val, dflt, swap, C1, C2)                                             \
   { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 }                             \
@@ -54,7 +54,7 @@
 // x86 conditional branch instruction.
 
 const struct TableIcmp32_ {
-  InstX8632Br::BrCond Mapping;
+  InstX8632::BrCond Mapping;
 } TableIcmp32[] = {
 #define X(val, C_32, C1_64, C2_64, C3_64)                                      \
   { InstX8632Br::C_32 }                                                        \
@@ -69,7 +69,7 @@
 // conditional branches are needed.  For the other conditions, three separate
 // conditional branches are needed.
 const struct TableIcmp64_ {
-  InstX8632Br::BrCond C1, C2, C3;
+  InstX8632::BrCond C1, C2, C3;
 } TableIcmp64[] = {
 #define X(val, C_32, C1_64, C2_64, C3_64)                                      \
   { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 }               \
@@ -79,7 +79,7 @@
   };
 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
 
-InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
+InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
   size_t Index = static_cast<size_t>(Cond);
   assert(Index < TableIcmp32Size);
   return TableIcmp32[Index].Mapping;
@@ -2109,12 +2109,61 @@
     return;
   }
   case Intrinsics::Bswap:
-  case Intrinsics::Ctlz:
-  case Intrinsics::Ctpop:
-  case Intrinsics::Cttz:
-    // TODO(jvoung): fill it in.
     Func->setError("Unhandled intrinsic");
     return;
+  case Intrinsics::Ctpop: {
+    Variable *Dest = Instr->getDest();
+    Operand *Val = Instr->getArg(0);
+    InstCall *Call = makeHelperCall(Val->getType() == IceType_i64 ?
+        "__popcountdi2" : "__popcountsi2", Dest, 1);
+    Call->addArg(Val);
+    lowerCall(Call);
+    // The popcount helpers always return 32-bit values, while the intrinsic's
+    // signature matches the native POPCNT instruction and fills a 64-bit reg
+    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
+    // the user doesn't do that in the IR. If the user does that in the IR,
+    // then this zeroing instruction is dead and gets optimized out.
+    if (Val->getType() == IceType_i64) {
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      Constant *Zero = Ctx->getConstantZero(IceType_i32);
+      _mov(DestHi, Zero);
+    }
+    return;
+  }
+  case Intrinsics::Ctlz: {
+    // The "is zero undef" parameter is ignored and we always return
+    // a well-defined value.
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = NULL;
+    if (Val->getType() == IceType_i64) {
+      FirstVal = loOperand(Val);
+      SecondVal = hiOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    const bool IsCttz = false;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
+  case Intrinsics::Cttz: {
+    // The "is zero undef" parameter is ignored and we always return
+    // a well-defined value.
+    Operand *Val = legalize(Instr->getArg(0));
+    Operand *FirstVal;
+    Operand *SecondVal = NULL;
+    if (Val->getType() == IceType_i64) {
+      FirstVal = hiOperand(Val);
+      SecondVal = loOperand(Val);
+    } else {
+      FirstVal = Val;
+    }
+    const bool IsCttz = true;
+    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+                    SecondVal);
+    return;
+  }
   case Intrinsics::Longjmp: {
     InstCall *Call = makeHelperCall("longjmp", NULL, 2);
     Call->addArg(Instr->getArg(0));
@@ -2408,6 +2457,81 @@
   _mov(Dest, T_eax);
 }
 
+// Lowers the count {trailing, leading} zeros intrinsics.
+//
+// We could do constant folding here, but that should have
+// been done by the front-end/middle-end optimizations.
+void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
+                                  Operand *FirstVal, Operand *SecondVal) {
+  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
+  // Then the instructions will handle the Val == 0 case much more simply
+  // and won't require conversion from bit position to number of zeros.
+  //
+  // Otherwise:
+  //   bsr IF_NOT_ZERO, Val
+  //   mov T_DEST, 63
+  //   cmovne T_DEST, IF_NOT_ZERO
+  //   xor T_DEST, 31
+  //   mov DEST, T_DEST
+  //
+  // NOTE: T_DEST must be a register because cmov requires its dest to be a
+  // register. Also, bsf and bsr require their dest to be a register.
+  //
+  // The xor DEST, 31 converts a bit position to # of leading zeroes.
+  // E.g., for 000... 00001100, bsr will say that the most significant bit
+  // set is at position 3, while the number of leading zeros is 28. Xor is
+  // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
+  //
+  // The 64-bit case is similar, but it starts by speculating that the upper
+  // 32 bits are all zero, and computes the result for that case (checking
+  // the lower 32 bits). It then actually computes the result for the upper
+  // bits and cmovs in the result from the lower computation if the earlier
+  // speculation was correct.
+  //
+  // Cttz is similar, but uses bsf instead, doesn't require the xor
+  // bit-position conversion, and reverses the speculation (it starts by
+  // assuming the lower 32 bits are all zero).
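+  //
+  // For example, cttz(0x18): bsf reports bit position 3, which is already
+  // the trailing-zero count; for an input of 0, cmovne does not fire and
+  // the seeded constant 32 is kept.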
+  assert(Ty == IceType_i32 || Ty == IceType_i64);
+  Variable *T = makeReg(IceType_i32);
+  if (Cttz) {
+    _bsf(T, FirstVal);
+  } else {
+    _bsr(T, FirstVal);
+  }
+  Variable *T_Dest = makeReg(IceType_i32);
+  Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);
+  Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31);
+  if (Cttz) {
+    _mov(T_Dest, ThirtyTwo);
+  } else {
+    Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63);
+    _mov(T_Dest, SixtyThree);
+  }
+  _cmov(T_Dest, T, InstX8632::Br_ne);
+  if (!Cttz) {
+    _xor(T_Dest, ThirtyOne);
+  }
+  if (Ty == IceType_i32) {
+    _mov(Dest, T_Dest);
+    return;
+  }
+  _add(T_Dest, ThirtyTwo);
+  Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+  Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+  // Will be using "test" on this, so we need a registerized variable.
+  Variable *SecondVar = legalizeToVar(SecondVal);
+  Variable *T_Dest2 = makeReg(IceType_i32);
+  if (Cttz) {
+    _bsf(T_Dest2, SecondVar);
+  } else {
+    _bsr(T_Dest2, SecondVar);
+    _xor(T_Dest2, ThirtyOne);
+  }
+  _test(SecondVar, SecondVar);
+  _cmov(T_Dest2, T_Dest, InstX8632::Br_e);
+  _mov(DestLo, T_Dest2);
+  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+}
+
 namespace {
 
 bool isAdd(const Inst *Inst) {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 1408873..6e21781 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -99,6 +99,8 @@
                           Operand *Desired);
   void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                       Operand *Val);
+  void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
+                       Operand *SecondVal);
 
   typedef void (TargetX8632::*LowerBinOp)(Variable *, Operand *);
   void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
@@ -164,7 +166,7 @@
   void _and(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632And::create(Func, Dest, Src0));
   }
-  void _br(InstX8632Br::BrCond Condition, CfgNode *TargetTrue,
+  void _br(InstX8632::BrCond Condition, CfgNode *TargetTrue,
            CfgNode *TargetFalse) {
     Context.insert(
         InstX8632Br::create(Func, TargetTrue, TargetFalse, Condition));
@@ -172,15 +174,24 @@
   void _br(CfgNode *Target) {
     Context.insert(InstX8632Br::create(Func, Target));
   }
-  void _br(InstX8632Br::BrCond Condition, CfgNode *Target) {
+  void _br(InstX8632::BrCond Condition, CfgNode *Target) {
     Context.insert(InstX8632Br::create(Func, Target, Condition));
   }
-  void _br(InstX8632Br::BrCond Condition, InstX8632Label *Label) {
+  void _br(InstX8632::BrCond Condition, InstX8632Label *Label) {
     Context.insert(InstX8632Br::create(Func, Label, Condition));
   }
+  void _bsf(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Bsf::create(Func, Dest, Src0));
+  }
+  void _bsr(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Bsr::create(Func, Dest, Src0));
+  }
   void _cdq(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Cdq::create(Func, Dest, Src0));
   }
+  void _cmov(Variable *Dest, Operand *Src0, InstX8632::BrCond Condition) {
+    Context.insert(InstX8632Cmov::create(Func, Dest, Src0, Condition));
+  }
   void _cmp(Operand *Src0, Operand *Src1) {
     Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
   }
diff --git a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
index 5ed776b..cebf923 100644
--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -1,10 +1,14 @@
 ; This tests the NaCl intrinsics not related to atomic operations.
 
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 
+; Do another run with O2 and a different check-prefix (otherwise the O2 and
+; Om1 runs would share the same "CHECK" prefix). This separate run helps
+; check that some code is optimized out.
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
+
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
 ; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
 ; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
 ; RUN:                           | FileCheck --check-prefix=DUMP %s
@@ -18,6 +22,12 @@
 declare float @llvm.sqrt.f32(float)
 declare double @llvm.sqrt.f64(double)
 declare void @llvm.trap()
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
 
 define i32 @test_nacl_read_tp() {
 entry:
@@ -232,5 +242,128 @@
 ; CHECK-LABEL: test_trap
 ; CHECK: ud2
 
+define i32 @test_ctlz_32(i32 %x) {
+entry:
+  %r = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %r
+}
+; CHECK-LABEL: test_ctlz_32
+; TODO(jvoung): If we detect that LZCNT is supported, use it instead and
+; avoid needing the cmovne and xor to guarantee that the result is
+; well-defined when the input is 0.
+; CHECK: bsr [[REG_TMP:e.*]], {{.*}}
+; CHECK: mov [[REG_RES:e.*]], 63
+; CHECK: cmovne [[REG_RES]], [[REG_TMP]]
+; CHECK: xor [[REG_RES]], 31
+
+define i32 @test_ctlz_32_const() {
+entry:
+  %r = call i32 @llvm.ctlz.i32(i32 123456, i1 0)
+  ret i32 %r
+}
+; Could potentially constant fold this, but the front-end should have done that.
+; CHECK-LABEL: test_ctlz_32_const
+; CHECK: bsr
+
+define i32 @test_ctlz_32_ignored(i32 %x) {
+entry:
+  %ignored = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 1
+}
+; CHECKO2REM-LABEL: test_ctlz_32_ignored
+; CHECKO2REM-NOT: bsr
+
+define i64 @test_ctlz_64(i64 %x) {
+entry:
+  %r = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+  ret i64 %r
+}
+; CHECKO2REM-LABEL: test_ctlz_64
+; CHECK-LABEL: test_ctlz_64
+; CHECK: bsr [[REG_TMP1:e.*]], {{.*}}
+; CHECK: mov [[REG_RES1:e.*]], 63
+; CHECK: cmovne [[REG_RES1]], [[REG_TMP1]]
+; CHECK: xor [[REG_RES1]], 31
+; CHECK: add [[REG_RES1]], 32
+; CHECK: bsr [[REG_RES2:e.*]], {{.*}}
+; CHECK: xor [[REG_RES2]], 31
+; CHECK: test [[REG_UPPER:.*]], [[REG_UPPER]]
+; CHECK: cmove [[REG_RES2]], [[REG_RES1]]
+; CHECK: mov {{.*}}, 0
+
+define i32 @test_ctlz_64_const(i64 %x) {
+entry:
+  %r = call i64 @llvm.ctlz.i64(i64 123456789012, i1 0)
+  %r2 = trunc i64 %r to i32
+  ret i32 %r2
+}
+; CHECK-LABEL: test_ctlz_64_const
+; CHECK: bsr
+; CHECK: bsr
+
+define i32 @test_ctlz_64_ignored(i64 %x) {
+entry:
+  %ignored = call i64 @llvm.ctlz.i64(i64 1234567890, i1 0)
+  ret i32 2
+}
+; CHECKO2REM-LABEL: test_ctlz_64_ignored
+; CHECKO2REM-NOT: bsr
+
+define i32 @test_cttz_32(i32 %x) {
+entry:
+  %r = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+  ret i32 %r
+}
+; CHECK-LABEL: test_cttz_32
+; CHECK: bsf [[REG_IF_NOTZERO:e.*]], {{.*}}
+; CHECK: mov [[REG_IF_ZERO:e.*]], 32
+; CHECK: cmovne [[REG_IF_ZERO]], [[REG_IF_NOTZERO]]
+
+define i64 @test_cttz_64(i64 %x) {
+entry:
+  %r = call i64 @llvm.cttz.i64(i64 %x, i1 0)
+  ret i64 %r
+}
+; CHECK-LABEL: test_cttz_64
+; CHECK: bsf [[REG_IF_NOTZERO:e.*]], {{.*}}
+; CHECK: mov [[REG_RES1:e.*]], 32
+; CHECK: cmovne [[REG_RES1]], [[REG_IF_NOTZERO]]
+; CHECK: add [[REG_RES1]], 32
+; CHECK: bsf [[REG_RES2:e.*]], [[REG_LOWER:.*]]
+; CHECK: test [[REG_LOWER]], [[REG_LOWER]]
+; CHECK: cmove [[REG_RES2]], [[REG_RES1]]
+; CHECK: mov {{.*}}, 0
+
+define i32 @test_popcount_32(i32 %x) {
+entry:
+  %r = call i32 @llvm.ctpop.i32(i32 %x)
+  ret i32 %r
+}
+; CHECK-LABEL: test_popcount_32
+; CHECK: call __popcountsi2
+
+define i64 @test_popcount_64(i64 %x) {
+entry:
+  %r = call i64 @llvm.ctpop.i64(i64 %x)
+  ret i64 %r
+}
+; CHECK-LABEL: test_popcount_64
+; CHECK: call __popcountdi2
+; __popcountdi2 only returns a 32-bit result, so clear the upper bits of
+; the return value just in case.
+; CHECK: mov {{.*}}, 0
+
+define i32 @test_popcount_64_ret_i32(i64 %x) {
+entry:
+  %r_i64 = call i64 @llvm.ctpop.i64(i64 %x)
+  %r = trunc i64 %r_i64 to i32
+  ret i32 %r
+}
+; If there is a trunc, then the mov {{.*}}, 0 is dead and gets optimized out.
+; CHECKO2REM-LABEL: test_popcount_64_ret_i32
+; CHECKO2REM: call __popcountdi2
+; CHECKO2REM-NOT: mov {{.*}}, 0
+
+
 ; ERRORS-NOT: ICE translation error
 ; DUMP-NOT: SZ