Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now.
We'll need the fallbacks in any case. However, once we've
decided on how to specify the CPU features of the user
machine we can use the nicer LZCNT/TZCNT/POPCNT as well.
Adds cmov, bsf, and bsr instructions.
Calls a popcount helper function for machines without SSE4.2.
Not handling bswap yet (which can also take i16 params).
BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org, wala@chromium.org
Review URL: https://codereview.chromium.org/390443005
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index bba53d7..913cc5b 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -39,6 +39,13 @@
./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
--dir="${OUTDIR}" \
--llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
+ --driver=test_bitmanip_main.cpp \
+ --output=test_bitmanip_O${optlevel}
+
+ ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
--test=test_cast.cpp --test=test_cast_to_u1.ll \
--driver=test_cast_main.cpp \
--output=test_cast_O${optlevel}
@@ -81,6 +88,7 @@
"${OUTDIR}"/simple_loop_O${optlevel}
"${OUTDIR}"/mem_intrin_O${optlevel}
"${OUTDIR}"/test_arith_O${optlevel}
+ "${OUTDIR}"/test_bitmanip_O${optlevel}
"${OUTDIR}"/test_cast_O${optlevel}
"${OUTDIR}"/test_fcmp_O${optlevel}
"${OUTDIR}"/test_global_O${optlevel}
diff --git a/crosstest/test_bitmanip.cpp b/crosstest/test_bitmanip.cpp
new file mode 100644
index 0000000..2ebe8a4
--- /dev/null
+++ b/crosstest/test_bitmanip.cpp
@@ -0,0 +1,40 @@
+//===- subzero/crosstest/test_bitmanip.cpp - Implementation for tests. ----===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This aims to test that all the bit manipulation intrinsics work, via
+// cross-testing. This calls wrappers (my_{ctlz,cttz,ctpop}) around the
+// intrinsics (llvm.{ctlz,cttz,ctpop}.*).
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#include <cstdlib>
+
+#include "test_bitmanip.h"
+
+#define X(inst, type) \
+ type test_##inst(type a) { return my_##inst(a); } \
+ type test_alloca_##inst(type a) { \
+ const size_t buf_size = 8; \
+ type buf[buf_size]; \
+ for (size_t i = 0; i < buf_size; ++i) { \
+ buf[i] = my_##inst(a); \
+ } \
+ type sum = 0; \
+ for (size_t i = 0; i < buf_size; ++i) { \
+ sum += buf[i]; \
+ } \
+ return sum; \
+ } \
+ type test_const_##inst(type ignored) { \
+ return my_##inst(static_cast<type>(0x12340)); \
+ }
+
+FOR_ALL_BMI_OP_TYPES(X)
+#undef X
diff --git a/crosstest/test_bitmanip.def b/crosstest/test_bitmanip.def
new file mode 100644
index 0000000..b164ab7
--- /dev/null
+++ b/crosstest/test_bitmanip.def
@@ -0,0 +1,42 @@
+//===- subzero/crosstest/test_bitmanip.def - macros for tests -*- C++ -*---===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for testing bit manipulation intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_BIT_MANIP_DEF
+#define TEST_BIT_MANIP_DEF
+
+#define STR(s) #s
+
+#define BMI_OPS \
+ /* inst */ \
+ X(ctlz) \
+ X(cttz) \
+ X(ctpop)
+// #define X(inst)
+
+#define BMI_TYPES \
+ /* type */ \
+ X(uint32_t) \
+ X(uint64_t)
+// #define X(type)
+
+#define FOR_ALL_BMI_TYPES_INST(F, inst) \
+ F(inst, uint32_t) \
+ F(inst, uint64_t)
+
+#define FOR_ALL_BMI_OP_TYPES(X) \
+ FOR_ALL_BMI_TYPES_INST(X, ctlz) \
+ FOR_ALL_BMI_TYPES_INST(X, cttz) \
+ FOR_ALL_BMI_TYPES_INST(X, ctpop)
+// #define X(inst, type)
+
+#endif // TEST_BIT_MANIP_DEF
diff --git a/crosstest/test_bitmanip.h b/crosstest/test_bitmanip.h
new file mode 100644
index 0000000..7c4efdb
--- /dev/null
+++ b/crosstest/test_bitmanip.h
@@ -0,0 +1,24 @@
+//===- subzero/crosstest/test_bitmanip.h - Test prototypes ---*- C++ -*----===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for cross testing bit
+// manipulation intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "test_bitmanip.def"
+
+#define X(inst, type) \
+ type test_##inst(type a); \
+ type test_alloca_##inst(type a); \
+ type test_const_##inst(type ignored); \
+ type my_##inst(type a);
+
+FOR_ALL_BMI_OP_TYPES(X)
+#undef X
diff --git a/crosstest/test_bitmanip_intrin.ll b/crosstest/test_bitmanip_intrin.ll
new file mode 100644
index 0000000..23df538
--- /dev/null
+++ b/crosstest/test_bitmanip_intrin.ll
@@ -0,0 +1,46 @@
+; Wrappers around the bit manipulation intrinsics, which use name mangling
+; for encoding the type in the name instead of plain "C" suffixes.
+; E.g., my_ctpop(unsigned long long) vs __builtin_popcountll(...)
+; Also, normalize the intrinsic to take a single parameter when there
+; can be two, as is the case for ctlz and cttz.
+
+target triple = "i686-pc-linux-gnu"
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
+
+define i32 @_Z7my_ctlzj(i32 %a) {
+ %x = call i32 @llvm.ctlz.i32(i32 %a, i1 0)
+ ret i32 %x
+}
+
+define i64 @_Z7my_ctlzy(i64 %a) {
+ %x = call i64 @llvm.ctlz.i64(i64 %a, i1 0)
+ ret i64 %x
+}
+
+define i32 @_Z7my_cttzj(i32 %a) {
+ %x = call i32 @llvm.cttz.i32(i32 %a, i1 0)
+ ret i32 %x
+}
+
+define i64 @_Z7my_cttzy(i64 %a) {
+ %x = call i64 @llvm.cttz.i64(i64 %a, i1 0)
+ ret i64 %x
+}
+
+define i32 @_Z8my_ctpopj(i32 %a) {
+ %x = call i32 @llvm.ctpop.i32(i32 %a)
+ ret i32 %x
+}
+
+define i64 @_Z8my_ctpopy(i64 %a) {
+ %x = call i64 @llvm.ctpop.i64(i64 %a)
+ ret i64 %x
+}
diff --git a/crosstest/test_bitmanip_main.cpp b/crosstest/test_bitmanip_main.cpp
new file mode 100644
index 0000000..b3ad585
--- /dev/null
+++ b/crosstest/test_bitmanip_main.cpp
@@ -0,0 +1,110 @@
+//===- subzero/crosstest/test_bitmanip_main.cpp - Driver for tests. -------===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing bit manipulation intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_bitmanip.cpp --test=test_bitmanip_intrin.ll \
+ --driver=test_bitmanip_main.cpp --prefix=Subzero_ --output=test_bitmanip */
+
+#include <stdint.h>
+
+#include <climits>
+#include <iostream>
+
+// Include test_bitmanip.h twice - once normally, and once within the
+// Subzero_ namespace, corresponding to the llc and Subzero translated
+// object files, respectively.
+#include "test_bitmanip.h"
+namespace Subzero_ {
+#include "test_bitmanip.h"
+}
+
+volatile uint64_t Values[] = {
+ 0, 1, 0x7e,
+ 0x7f, 0x80, 0x81,
+ 0xfe, 0xff, 0x7ffe,
+ 0x7fff, 0x8000, 0x8001,
+ 0xfffe, 0xffff,
+ 0x007fffff /*Max subnormal + */,
+ 0x00800000 /*Min+ */, 0x7f7fffff /*Max+ */,
+ 0x7f800000 /*+Inf*/, 0xff800000 /*-Inf*/,
+ 0x7fa00000 /*SNaN*/, 0x7fc00000 /*QNaN*/,
+ 0x7ffffffe, 0x7fffffff, 0x80000000,
+ 0x80000001, 0xfffffffe, 0xffffffff,
+ 0x100000000ll, 0x100000001ll,
+ 0x000fffffffffffffll /*Max subnormal + */,
+ 0x0010000000000000ll /*Min+ */,
+ 0x7fefffffffffffffll /*Max+ */,
+ 0x7ff0000000000000ll /*+Inf*/,
+ 0xfff0000000000000ll /*-Inf*/,
+ 0x7ff0000000000001ll /*SNaN*/,
+ 0x7ff8000000000000ll /*QNaN*/,
+ 0x7ffffffffffffffell, 0x7fffffffffffffffll, 0x8000000000000000ll,
+ 0x8000000000000001ll, 0xfffffffffffffffell, 0xffffffffffffffffll };
+
+const static size_t NumValues = sizeof(Values) / sizeof(*Values);
+
+template <typename Type>
+void testBitManip(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+ typedef Type (*FuncType)(Type);
+ static struct {
+ const char *Name;
+ FuncType FuncLlc;
+ FuncType FuncSz;
+ } Funcs[] = {
+#define X(inst) \
+ { \
+ STR(inst), test_##inst, Subzero_::test_##inst \
+ }, \
+ { \
+ STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst \
+ }, \
+ { \
+ STR(inst) "_const", test_const_##inst, Subzero_::test_const_##inst \
+ },
+ BMI_OPS
+#undef X
+ };
+ const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+ for (size_t f = 0; f < NumFuncs; ++f) {
+ for (size_t i = 0; i < NumValues; ++i) {
+ Type Value = static_cast<Type>(Values[i]);
+ ++TotalTests;
+ Type ResultSz = Funcs[f].FuncSz(Value);
+ Type ResultLlc = Funcs[f].FuncLlc(Value);
+ if (ResultSz == ResultLlc) {
+ ++Passes;
+ } else {
+ ++Failures;
+ std::cout << "test_" << Funcs[f].Name
+ << (CHAR_BIT * sizeof(Type)) << "("
+ << static_cast<uint64_t>(Value)
+ << "): sz=" << static_cast<uint64_t>(ResultSz)
+ << " llc=" << static_cast<uint64_t>(ResultLlc)
+ << "\n";
+ }
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ size_t TotalTests = 0;
+ size_t Passes = 0;
+ size_t Failures = 0;
+
+ testBitManip<uint32_t>(TotalTests, Passes, Failures);
+ testBitManip<uint64_t>(TotalTests, Passes, Failures);
+
+ std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+ << " Failures=" << Failures << "\n";
+ return Failures;
+}
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index c0e8c8d..57fb179 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -94,6 +94,11 @@
addSource(Source2);
}
+InstX8632Neg::InstX8632Neg(Cfg *Func, Operand *SrcDest)
+ : InstX8632(Func, InstX8632::Neg, 1, llvm::dyn_cast<Variable>(SrcDest)) {
+ addSource(SrcDest);
+}
+
InstX8632Shld::InstX8632Shld(Cfg *Func, Variable *Dest, Variable *Source1,
Variable *Source2)
: InstX8632(Func, InstX8632::Shld, 3, Dest) {
@@ -121,7 +126,7 @@
}
InstX8632Br::InstX8632Br(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
- InstX8632Label *Label, InstX8632Br::BrCond Condition)
+ InstX8632Label *Label, InstX8632::BrCond Condition)
: InstX8632(Func, InstX8632::Br, 0, NULL), Condition(Condition),
TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label) {}
@@ -139,6 +144,15 @@
addSource(Source);
}
+InstX8632Cmov::InstX8632Cmov(Cfg *Func, Variable *Dest, Operand *Source,
+ InstX8632::BrCond Condition)
+ : InstX8632(Func, InstX8632::Cmov, 2, Dest), Condition(Condition) {
+ // The final result is either the original Dest, or Source, so mark
+ // both as sources.
+ addSource(Dest);
+ addSource(Source);
+}
+
InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
Variable *Eax, Variable *Desired,
bool Locked)
@@ -297,11 +311,6 @@
return false;
}
-InstX8632Sqrtss::InstX8632Sqrtss(Cfg *Func, Variable *Dest, Operand *Source)
- : InstX8632(Func, InstX8632::Sqrtss, 1, Dest) {
- addSource(Source);
-}
-
InstX8632Ret::InstX8632Ret(Cfg *Func, Variable *Source)
: InstX8632(Func, InstX8632::Ret, Source ? 1 : 0, NULL) {
if (Source)
@@ -429,7 +438,9 @@
Str << "\n";
}
-template <> const char *InstX8632Neg::Opcode = "neg";
+template <> const char *InstX8632Bsf::Opcode = "bsf";
+template <> const char *InstX8632Bsr::Opcode = "bsr";
+template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
template <> const char *InstX8632Add::Opcode = "add";
template <> const char *InstX8632Addps::Opcode = "addps";
template <> const char *InstX8632Adc::Opcode = "adc";
@@ -453,6 +464,18 @@
template <> const char *InstX8632Shr::Opcode = "shr";
template <> const char *InstX8632Sar::Opcode = "sar";
+template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 1);
+ Type Ty = getSrc(0)->getType();
+ assert(Ty == IceType_f32 || Ty == IceType_f64);
+ Str << "\tsqrt" << TypeX8632Attributes[Ty].SdSsString << "\t";
+ getDest()->emit(Func);
+ Str << ", ";
+ getSrc(0)->emit(Func);
+ Str << "\n";
+}
+
template <> void InstX8632Addss::emit(const Cfg *Func) const {
char buf[30];
snprintf(buf, llvm::array_lengthof(buf), "add%s",
@@ -523,6 +546,21 @@
dumpSources(Func);
}
+void InstX8632Neg::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 1);
+ Str << "\tneg\t";
+ getSrc(0)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Neg::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ dumpDest(Func);
+ Str << " = neg." << getDest()->getType() << " ";
+ dumpSources(Func);
+}
+
void InstX8632Shld::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 3);
@@ -586,6 +624,27 @@
dumpSources(Func);
}
+void InstX8632Cmov::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ Str << "\t";
+ assert(Condition != Br_None);
+ assert(getDest()->hasReg());
+ Str << "cmov" << InstX8632BrAttributes[Condition].DisplayString << "\t";
+ getDest()->emit(Func);
+ Str << ", ";
+ getSrc(1)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Cmov::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ Str << "cmov" << InstX8632BrAttributes[Condition].DisplayString << ".";
+ Str << getDest()->getType() << " ";
+ dumpDest(Func);
+ Str << ", ";
+ dumpSources(Func);
+}
+
void InstX8632Cmpxchg::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 3);
@@ -1007,25 +1066,6 @@
dumpSources(Func);
}
-void InstX8632Sqrtss::emit(const Cfg *Func) const {
- Ostream &Str = Func->getContext()->getStrEmit();
- assert(getSrcSize() == 1);
- Type Ty = getSrc(0)->getType();
- assert(Ty == IceType_f32 || Ty == IceType_f64);
- Str << "\tsqrt" << TypeX8632Attributes[Ty].SdSsString << "\t";
- getDest()->emit(Func);
- Str << ", ";
- getSrc(0)->emit(Func);
- Str << "\n";
-}
-
-void InstX8632Sqrtss::dump(const Cfg *Func) const {
- Ostream &Str = Func->getContext()->getStrDump();
- dumpDest(Func);
- Str << " = sqrt." << getDest()->getType() << " ";
- dumpSources(Func);
-}
-
void InstX8632Xadd::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
if (Locked) {
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 25beb6d..3f40614 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -139,8 +139,11 @@
Addss,
And,
Br,
+ Bsf,
+ Bsr,
Call,
Cdq,
+ Cmov,
Cmpxchg,
Cmpxchg8b,
Cvt,
@@ -188,6 +191,14 @@
Xchg,
Xor
};
+
+ enum BrCond {
+#define X(tag, dump, emit) tag,
+ ICEINSTX8632BR_TABLE
+#undef X
+ Br_None
+ };
+
static const char *getWidthString(Type Ty);
virtual void emit(const Cfg *Func) const = 0;
virtual void dump(const Cfg *Func) const;
@@ -262,13 +273,6 @@
// Conditional and unconditional branch instruction.
class InstX8632Br : public InstX8632 {
public:
- enum BrCond {
-#define X(tag, dump, emit) tag,
- ICEINSTX8632BR_TABLE
-#undef X
- Br_None
- };
-
// Create a conditional branch to a node.
static InstX8632Br *create(Cfg *Func, CfgNode *TargetTrue,
CfgNode *TargetFalse, BrCond Condition) {
@@ -334,16 +338,16 @@
template <InstX8632::InstKindX8632 K>
class InstX8632Unaryop : public InstX8632 {
public:
- // Create an unary-op instruction like neg.
- // The source and dest are the same variable.
- static InstX8632Unaryop *create(Cfg *Func, Operand *SrcDest) {
+ static InstX8632Unaryop *create(Cfg *Func, Variable *Dest, Operand *Src) {
return new (Func->allocate<InstX8632Unaryop>())
- InstX8632Unaryop(Func, SrcDest);
+ InstX8632Unaryop(Func, Dest, Src);
}
virtual void emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
Str << "\t" << Opcode << "\t";
+ getDest()->emit(Func);
+ Str << ", ";
getSrc(0)->emit(Func);
Str << "\n";
}
@@ -356,9 +360,9 @@
static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
private:
- InstX8632Unaryop(Cfg *Func, Operand *SrcDest)
- : InstX8632(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
- addSource(SrcDest);
+ InstX8632Unaryop(Cfg *Func, Variable *Dest, Operand *Src)
+ : InstX8632(Func, K, 1, Dest) {
+ addSource(Src);
}
InstX8632Unaryop(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
InstX8632Unaryop &operator=(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
@@ -438,7 +442,9 @@
static const char *Opcode;
};
-typedef InstX8632Unaryop<InstX8632::Neg> InstX8632Neg;
+typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf;
+typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
+typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
typedef InstX8632Binop<InstX8632::Adc> InstX8632Adc;
@@ -503,6 +509,23 @@
virtual ~InstX8632Mul() {}
};
+// Neg instruction - Two's complement negation.
+class InstX8632Neg : public InstX8632 {
+public:
+ static InstX8632Neg *create(Cfg *Func, Operand *SrcDest) {
+ return new (Func->allocate<InstX8632Neg>()) InstX8632Neg(Func, SrcDest);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Neg); }
+
+private:
+ InstX8632Neg(Cfg *Func, Operand *SrcDest);
+ InstX8632Neg(const InstX8632Neg &) LLVM_DELETED_FUNCTION;
+ InstX8632Neg &operator=(const InstX8632Neg &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Neg() {}
+};
+
// Shld instruction - shift across a pair of operands. TODO: Verify
// that the validator accepts the shld instruction.
class InstX8632Shld : public InstX8632 {
@@ -563,6 +586,27 @@
virtual ~InstX8632Cdq() {}
};
+// Conditional move instruction.
+class InstX8632Cmov : public InstX8632 {
+public:
+ static InstX8632Cmov *create(Cfg *Func, Variable *Dest, Operand *Source,
+ BrCond Cond) {
+ return new (Func->allocate<InstX8632Cmov>())
+ InstX8632Cmov(Func, Dest, Source, Cond);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Cmov); }
+
+private:
+ InstX8632Cmov(Cfg *Func, Variable *Dest, Operand *Source, BrCond Cond);
+ InstX8632Cmov(const InstX8632Cmov &) LLVM_DELETED_FUNCTION;
+ InstX8632Cmov &operator=(const InstX8632Cmov &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Cmov() {}
+
+ BrCond Condition;
+};
+
// Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
// equals eax. If so, the ZF is set and <desired> is stored in <dest>.
// If not, ZF is cleared and <dest> is copied to eax (or subregister).
@@ -948,24 +992,6 @@
virtual ~InstX8632Ret() {}
};
-// Sqrtss - Scalar sqrt of a float or double.
-class InstX8632Sqrtss : public InstX8632 {
-public:
- static InstX8632Sqrtss *create(Cfg *Func, Variable *Dest, Operand *Source) {
- return new (Func->allocate<InstX8632Sqrtss>())
- InstX8632Sqrtss(Func, Dest, Source);
- }
- virtual void emit(const Cfg *Func) const;
- virtual void dump(const Cfg *Func) const;
- static bool classof(const Inst *Inst) { return isClassof(Inst, Sqrtss); }
-
-private:
- InstX8632Sqrtss(Cfg *Func, Variable *Dest, Operand *Source);
- InstX8632Sqrtss(const InstX8632Sqrtss &) LLVM_DELETED_FUNCTION;
- InstX8632Sqrtss &operator=(const InstX8632Sqrtss &) LLVM_DELETED_FUNCTION;
- virtual ~InstX8632Sqrtss() {}
-};
-
// Exchanging Add instruction. Exchanges the first operand (destination
// operand) with the second operand (source operand), then loads the sum
// of the two values into the destination operand. The destination may be
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 38b6fc6..45c3151 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -39,7 +39,7 @@
const struct TableFcmp_ {
uint32_t Default;
bool SwapOperands;
- InstX8632Br::BrCond C1, C2;
+ InstX8632::BrCond C1, C2;
} TableFcmp[] = {
#define X(val, dflt, swap, C1, C2) \
{ dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \
@@ -54,7 +54,7 @@
// x86 conditional branch instruction.
const struct TableIcmp32_ {
- InstX8632Br::BrCond Mapping;
+ InstX8632::BrCond Mapping;
} TableIcmp32[] = {
#define X(val, C_32, C1_64, C2_64, C3_64) \
{ InstX8632Br::C_32 } \
@@ -69,7 +69,7 @@
// conditional branches are needed. For the other conditions, three separate
// conditional branches are needed.
const struct TableIcmp64_ {
- InstX8632Br::BrCond C1, C2, C3;
+ InstX8632::BrCond C1, C2, C3;
} TableIcmp64[] = {
#define X(val, C_32, C1_64, C2_64, C3_64) \
{ InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \
@@ -79,7 +79,7 @@
};
const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
-InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
+InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
size_t Index = static_cast<size_t>(Cond);
assert(Index < TableIcmp32Size);
return TableIcmp32[Index].Mapping;
@@ -2109,12 +2109,61 @@
return;
}
case Intrinsics::Bswap:
- case Intrinsics::Ctlz:
- case Intrinsics::Ctpop:
- case Intrinsics::Cttz:
- // TODO(jvoung): fill it in.
Func->setError("Unhandled intrinsic");
return;
+ case Intrinsics::Ctpop: {
+ Variable *Dest = Instr->getDest();
+ Operand *Val = Instr->getArg(0);
+ InstCall *Call = makeHelperCall(Val->getType() == IceType_i64 ?
+ "__popcountdi2" : "__popcountsi2", Dest, 1);
+ Call->addArg(Val);
+ lowerCall(Call);
+ // The popcount helpers always return 32-bit values, while the intrinsic's
+ // signature matches the native POPCNT instruction and fills a 64-bit reg
+ // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
+ // the user doesn't do that in the IR. If the user does that in the IR,
+ // then this zero'ing instruction is dead and gets optimized out.
+ if (Val->getType() == IceType_i64) {
+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+ Constant *Zero = Ctx->getConstantZero(IceType_i32);
+ _mov(DestHi, Zero);
+ }
+ return;
+ }
+ case Intrinsics::Ctlz: {
+ // The "is zero undef" parameter is ignored and we always return
+ // a well-defined value.
+ Operand *Val = legalize(Instr->getArg(0));
+ Operand *FirstVal;
+ Operand *SecondVal = NULL;
+ if (Val->getType() == IceType_i64) {
+ FirstVal = loOperand(Val);
+ SecondVal = hiOperand(Val);
+ } else {
+ FirstVal = Val;
+ }
+ const bool IsCttz = false;
+ lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+ SecondVal);
+ return;
+ }
+ case Intrinsics::Cttz: {
+ // The "is zero undef" parameter is ignored and we always return
+ // a well-defined value.
+ Operand *Val = legalize(Instr->getArg(0));
+ Operand *FirstVal;
+ Operand *SecondVal = NULL;
+ if (Val->getType() == IceType_i64) {
+ FirstVal = hiOperand(Val);
+ SecondVal = loOperand(Val);
+ } else {
+ FirstVal = Val;
+ }
+ const bool IsCttz = true;
+ lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
+ SecondVal);
+ return;
+ }
case Intrinsics::Longjmp: {
InstCall *Call = makeHelperCall("longjmp", NULL, 2);
Call->addArg(Instr->getArg(0));
@@ -2408,6 +2457,81 @@
_mov(Dest, T_eax);
}
+// Lowers count {trailing, leading} zeros intrinsic.
+//
+// We could do constant folding here, but that should have
+// been done by the front-end/middle-end optimizations.
+void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
+ Operand *FirstVal, Operand *SecondVal) {
+ // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
+ // Then the instructions will handle the Val == 0 case much more simply
+ // and won't require conversion from bit position to number of zeros.
+ //
+ // Otherwise:
+ // bsr IF_NOT_ZERO, Val
+ // mov T_DEST, 63
+ // cmovne T_DEST, IF_NOT_ZERO
+ // xor T_DEST, 31
+ // mov DEST, T_DEST
+ //
+ // NOTE: T_DEST must be a register because cmov requires its dest to be a
+ // register. Also, bsf and bsr require their dest to be a register.
+ //
+ // The xor DEST, 31 converts a bit position to # of leading zeroes.
+ // E.g., for 000... 00001100, bsr will say that the most significant bit
+ // set is at position 3, while the number of leading zeros is 28. Xor is
+ // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
+ //
+ // Similar for 64-bit, but start w/ speculating that the upper 32 bits
+ // are all zero, and compute the result for that case (checking the lower
+ // 32 bits). Then actually compute the result for the upper bits and
+ // cmov in the result from the lower computation if the earlier speculation
+ // was correct.
+ //
+ // Cttz, is similar, but uses bsf instead, and doesn't require the xor
+ // bit position conversion, and the speculation is reversed.
+ assert(Ty == IceType_i32 || Ty == IceType_i64);
+ Variable *T = makeReg(IceType_i32);
+ if (Cttz) {
+ _bsf(T, FirstVal);
+ } else {
+ _bsr(T, FirstVal);
+ }
+ Variable *T_Dest = makeReg(IceType_i32);
+ Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);
+ Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31);
+ if (Cttz) {
+ _mov(T_Dest, ThirtyTwo);
+ } else {
+ Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63);
+ _mov(T_Dest, SixtyThree);
+ }
+ _cmov(T_Dest, T, InstX8632::Br_ne);
+ if (!Cttz) {
+ _xor(T_Dest, ThirtyOne);
+ }
+ if (Ty == IceType_i32) {
+ _mov(Dest, T_Dest);
+ return;
+ }
+ _add(T_Dest, ThirtyTwo);
+ Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+ // Will be using "test" on this, so we need a registerized variable.
+ Variable *SecondVar = legalizeToVar(SecondVal);
+ Variable *T_Dest2 = makeReg(IceType_i32);
+ if (Cttz) {
+ _bsf(T_Dest2, SecondVar);
+ } else {
+ _bsr(T_Dest2, SecondVar);
+ _xor(T_Dest2, ThirtyOne);
+ }
+ _test(SecondVar, SecondVar);
+ _cmov(T_Dest2, T_Dest, InstX8632::Br_e);
+ _mov(DestLo, T_Dest2);
+ _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+}
+
namespace {
bool isAdd(const Inst *Inst) {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 1408873..6e21781 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -99,6 +99,8 @@
Operand *Desired);
void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
Operand *Val);
+ void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
+ Operand *SecondVal);
typedef void (TargetX8632::*LowerBinOp)(Variable *, Operand *);
void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
@@ -164,7 +166,7 @@
void _and(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632And::create(Func, Dest, Src0));
}
- void _br(InstX8632Br::BrCond Condition, CfgNode *TargetTrue,
+ void _br(InstX8632::BrCond Condition, CfgNode *TargetTrue,
CfgNode *TargetFalse) {
Context.insert(
InstX8632Br::create(Func, TargetTrue, TargetFalse, Condition));
@@ -172,15 +174,24 @@
void _br(CfgNode *Target) {
Context.insert(InstX8632Br::create(Func, Target));
}
- void _br(InstX8632Br::BrCond Condition, CfgNode *Target) {
+ void _br(InstX8632::BrCond Condition, CfgNode *Target) {
Context.insert(InstX8632Br::create(Func, Target, Condition));
}
- void _br(InstX8632Br::BrCond Condition, InstX8632Label *Label) {
+ void _br(InstX8632::BrCond Condition, InstX8632Label *Label) {
Context.insert(InstX8632Br::create(Func, Label, Condition));
}
+ void _bsf(Variable *Dest, Operand *Src0) {
+ Context.insert(InstX8632Bsf::create(Func, Dest, Src0));
+ }
+ void _bsr(Variable *Dest, Operand *Src0) {
+ Context.insert(InstX8632Bsr::create(Func, Dest, Src0));
+ }
void _cdq(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Cdq::create(Func, Dest, Src0));
}
+ void _cmov(Variable *Dest, Operand *Src0, InstX8632::BrCond Condition) {
+ Context.insert(InstX8632Cmov::create(Func, Dest, Src0, Condition));
+ }
void _cmp(Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
}
diff --git a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
index 5ed776b..cebf923 100644
--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -1,10 +1,14 @@
; This tests the NaCl intrinsics not related to atomic operations.
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; Do another run w/ O2 and a different check-prefix (otherwise O2 and Om1
+; share the same "CHECK" prefix). This separate run helps check that
+; some code is optimized out.
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
+
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
; RUN: | FileCheck --check-prefix=DUMP %s
@@ -18,6 +22,12 @@
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
declare void @llvm.trap()
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
define i32 @test_nacl_read_tp() {
entry:
@@ -232,5 +242,128 @@
; CHECK-LABEL: test_trap
; CHECK: ud2
+define i32 @test_ctlz_32(i32 %x) {
+entry:
+ %r = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 %r
+}
+; CHECK-LABEL: test_ctlz_32
+; TODO(jvoung): If we detect that LZCNT is supported, then use that
+; and avoid the need to do the cmovne and xor stuff to guarantee that
+; the result is well-defined w/ input == 0.
+; CHECK: bsr [[REG_TMP:e.*]], {{.*}}
+; CHECK: mov [[REG_RES:e.*]], 63
+; CHECK: cmovne [[REG_RES]], [[REG_TMP]]
+; CHECK: xor [[REG_RES]], 31
+
+define i32 @test_ctlz_32_const() {
+entry:
+ %r = call i32 @llvm.ctlz.i32(i32 123456, i1 0)
+ ret i32 %r
+}
+; Could potentially constant fold this, but the front-end should have done that.
+; CHECK-LABEL: test_ctlz_32_const
+; CHECK: bsr
+
+define i32 @test_ctlz_32_ignored(i32 %x) {
+entry:
+ %ignored = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 1
+}
+; CHECKO2REM-LABEL: test_ctlz_32_ignored
+; CHECKO2REM-NOT: bsr
+
+define i64 @test_ctlz_64(i64 %x) {
+entry:
+ %r = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+ ret i64 %r
+}
+; CHECKO2REM-LABEL: test_ctlz_64
+; CHECK-LABEL: test_ctlz_64
+; CHECK: bsr [[REG_TMP1:e.*]], {{.*}}
+; CHECK: mov [[REG_RES1:e.*]], 63
+; CHECK: cmovne [[REG_RES1]], [[REG_TMP1]]
+; CHECK: xor [[REG_RES1]], 31
+; CHECK: add [[REG_RES1]], 32
+; CHECK: bsr [[REG_RES2:e.*]], {{.*}}
+; CHECK: xor [[REG_RES2]], 31
+; CHECK: test [[REG_UPPER:.*]], [[REG_UPPER]]
+; CHECK: cmove [[REG_RES2]], [[REG_RES1]]
+; CHECK: mov {{.*}}, 0
+
+define i32 @test_ctlz_64_const(i64 %x) {
+entry:
+ %r = call i64 @llvm.ctlz.i64(i64 123456789012, i1 0)
+ %r2 = trunc i64 %r to i32
+ ret i32 %r2
+}
+; CHECK-LABEL: test_ctlz_64_const
+; CHECK: bsr
+; CHECK: bsr
+
+define i32 @test_ctlz_64_ignored(i64 %x) {
+entry:
+ %ignored = call i64 @llvm.ctlz.i64(i64 1234567890, i1 0)
+ ret i32 2
+}
+; CHECKO2REM-LABEL: test_ctlz_64_ignored
+; CHECKO2REM-NOT: bsr
+
+define i32 @test_cttz_32(i32 %x) {
+entry:
+ %r = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+ ret i32 %r
+}
+; CHECK-LABEL: test_cttz_32
+; CHECK: bsf [[REG_IF_NOTZERO:e.*]], {{.*}}
+; CHECK: mov [[REG_IF_ZERO:e.*]], 32
+; CHECK: cmovne [[REG_IF_ZERO]], [[REG_IF_NOTZERO]]
+
+define i64 @test_cttz_64(i64 %x) {
+entry:
+ %r = call i64 @llvm.cttz.i64(i64 %x, i1 0)
+ ret i64 %r
+}
+; CHECK-LABEL: test_cttz_64
+; CHECK: bsf [[REG_IF_NOTZERO:e.*]], {{.*}}
+; CHECK: mov [[REG_RES1:e.*]], 32
+; CHECK: cmovne [[REG_RES1]], [[REG_IF_NOTZERO]]
+; CHECK: add [[REG_RES1]], 32
+; CHECK: bsf [[REG_RES2:e.*]], [[REG_LOWER:.*]]
+; CHECK: test [[REG_LOWER]], [[REG_LOWER]]
+; CHECK: cmove [[REG_RES2]], [[REG_RES1]]
+; CHECK: mov {{.*}}, 0
+
+define i32 @test_popcount_32(i32 %x) {
+entry:
+ %r = call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %r
+}
+; CHECK-LABEL: test_popcount_32
+; CHECK: call __popcountsi2
+
+define i64 @test_popcount_64(i64 %x) {
+entry:
+ %r = call i64 @llvm.ctpop.i64(i64 %x)
+ ret i64 %r
+}
+; CHECK-LABEL: test_popcount_64
+; CHECK: call __popcountdi2
+; __popcountdi2 only returns a 32-bit result, so clear the upper bits of
+; the return value just in case.
+; CHECK: mov {{.*}}, 0
+
+define i32 @test_popcount_64_ret_i32(i64 %x) {
+entry:
+ %r_i64 = call i64 @llvm.ctpop.i64(i64 %x)
+ %r = trunc i64 %r_i64 to i32
+ ret i32 %r
+}
+; If there is a trunc, then the mov {{.*}}, 0 is dead and gets optimized out.
+; CHECKO2REM-LABEL: test_popcount_64_ret_i32
+; CHECKO2REM: call __popcountdi2
+; CHECKO2REM-NOT: mov {{.*}}, 0
+
+
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ