Lower the fcmp instruction for <4 x float> operands.
Most fcmp conditions map directly to single x86 instructions. For
these, the lowering is table driven.
BUG=none
R=jvoung@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/413053002
diff --git a/crosstest/test_fcmp.pnacl.ll b/crosstest/test_fcmp.pnacl.ll
index 7c4d42e..a175b1d 100644
--- a/crosstest/test_fcmp.pnacl.ll
+++ b/crosstest/test_fcmp.pnacl.ll
@@ -1,7 +1,7 @@
target triple = "i686-pc-linux-gnu"
-; This file is extracted from fp.pnacl.ll in the lit tests, with
-; the "internal" attribute removed from the functions.
+; This file is extracted from fp.pnacl.ll and vector-fcmp.ll in the lit
+; tests, with the "internal" attribute removed from the functions.
define i32 @fcmpFalseFloat(float %a, float %b) {
entry:
@@ -322,3 +322,151 @@
}
; CHECK: fcmpTrueDouble:
; CHECK: mov {{.*}}, 1
+
+define <4 x i32> @fcmpFalseVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp false <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpFalseVector:
+; CHECK: pxor
+}
+
+define <4 x i32> @fcmpOeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp oeq <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOeqVector:
+; CHECK: cmpeqps
+}
+
+define <4 x i32> @fcmpOgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp oge <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgeVector:
+; CHECK: cmpleps
+}
+
+define <4 x i32> @fcmpOgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ogt <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgtVector:
+; CHECK: cmpltps
+}
+
+define <4 x i32> @fcmpOleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ole <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOleVector:
+; CHECK: cmpleps
+}
+
+define <4 x i32> @fcmpOltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp olt <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOltVector:
+; CHECK: cmpltps
+}
+
+define <4 x i32> @fcmpOneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp one <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOneVector:
+; CHECK: cmpneqps
+; CHECK: cmpordps
+; CHECK: pand
+}
+
+define <4 x i32> @fcmpOrdVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ord <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOrdVector:
+; CHECK: cmpordps
+}
+
+define <4 x i32> @fcmpTrueVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp true <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpTrueVector:
+; CHECK: pcmpeqd
+}
+
+define <4 x i32> @fcmpUeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ueq <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUeqVector:
+; CHECK: cmpeqps
+; CHECK: cmpunordps
+; CHECK: por
+}
+
+define <4 x i32> @fcmpUgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp uge <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgeVector:
+; CHECK: cmpnltps
+}
+
+define <4 x i32> @fcmpUgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ugt <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgtVector:
+; CHECK: cmpnleps
+}
+
+define <4 x i32> @fcmpUleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ule <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUleVector:
+; CHECK: cmpnltps
+}
+
+define <4 x i32> @fcmpUltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ult <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUltVector:
+; CHECK: cmpnleps
+}
+
+define <4 x i32> @fcmpUneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp une <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUneVector:
+; CHECK: cmpneqps
+}
+
+define <4 x i32> @fcmpUnoVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp uno <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUnoVector:
+; CHECK: cmpunordps
+}
diff --git a/crosstest/test_fcmp_main.cpp b/crosstest/test_fcmp_main.cpp
index 0c98c0a..f04b2e5 100644
--- a/crosstest/test_fcmp_main.cpp
+++ b/crosstest/test_fcmp_main.cpp
@@ -1,22 +1,42 @@
+//===- subzero/crosstest/test_fcmp_main.cpp - Driver for tests ------------===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing the fcmp bitcode instruction
+//
+//===----------------------------------------------------------------------===//
+
/* crosstest.py --test=test_fcmp.pnacl.ll --driver=test_fcmp_main.cpp \
--prefix=Subzero_ --output=test_fcmp */
#include <cassert>
#include <cfloat>
#include <cmath>
+#include <cstring>
#include <iostream>
+#include "vectors.h"
#include "test_fcmp.def"
#define X(cmp) \
extern "C" bool fcmp##cmp##Float(float a, float b); \
extern "C" bool fcmp##cmp##Double(double a, double b); \
+ extern "C" v4si32 fcmp##cmp##Vector(v4f32 a, v4f32 b); \
extern "C" bool Subzero_fcmp##cmp##Float(float a, float b); \
- extern "C" bool Subzero_fcmp##cmp##Double(double a, double b);
+ extern "C" bool Subzero_fcmp##cmp##Double(double a, double b); \
+ extern "C" v4si32 Subzero_fcmp##cmp##Vector(v4f32 a, v4f32 b);
FCMP_TABLE;
#undef X
-int main(int argc, char **argv) {
+volatile double *Values;
+size_t NumValues;
+
+void initializeValues() {
static const double NegInf = -1.0 / 0.0;
static const double Zero = 0.0;
static const double Ten = 10.0;
@@ -30,12 +50,14 @@
assert(NegInf < Zero);
assert(NegInf < PosInf);
assert(Zero < PosInf);
+ static volatile double InitValues[] = {NegInf, -Zero, Zero, DBL_MIN,
+ FLT_MIN, Ten, FLT_MAX, DBL_MAX,
+ PosInf, Nan, NegNan};
+ NumValues = sizeof(InitValues) / sizeof(*InitValues);
+ Values = InitValues;
+}
- volatile double Values[] = { NegInf, -Zero, Zero, DBL_MIN, FLT_MIN,
- Ten, FLT_MAX, DBL_MAX, PosInf, Nan,
- NegNan };
- const static size_t NumValues = sizeof(Values) / sizeof(*Values);
-
+void testsScalar(size_t &TotalTests, size_t &Passes, size_t &Failures) {
typedef bool (*FuncTypeFloat)(float, float);
typedef bool (*FuncTypeDouble)(double, double);
static struct {
@@ -58,9 +80,7 @@
bool ResultSz, ResultLlc;
- size_t TotalTests = 0;
- size_t Passes = 0;
- size_t Failures = 0;
+ assert(Values && NumValues);
for (size_t f = 0; f < NumFuncs; ++f) {
for (size_t i = 0; i < NumValues; ++i) {
@@ -76,7 +96,7 @@
++Failures;
std::cout << Funcs[f].Name << "Float(" << Value1Float << ", "
<< Value2Float << "): sz=" << ResultSz
- << " llc=" << ResultLlc << std::endl;
+ << " llc=" << ResultLlc << "\n";
}
++TotalTests;
double Value1Double = Values[i];
@@ -89,11 +109,66 @@
++Failures;
std::cout << Funcs[f].Name << "Double(" << Value1Double << ", "
<< Value2Double << "): sz=" << ResultSz
- << " llc=" << ResultLlc << std::endl;
+ << " llc=" << ResultLlc << "\n";
}
}
}
}
+}
+
+void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+ typedef v4si32 (*FuncTypeVector)(v4f32, v4f32);
+ static struct {
+ const char *Name;
+ FuncTypeVector FuncVectorSz;
+ FuncTypeVector FuncVectorLlc;
+ } Funcs[] = {
+#define X(cmp) \
+ { "fcmp" STR(cmp), Subzero_fcmp##cmp##Vector, fcmp##cmp##Vector } \
+ ,
+ FCMP_TABLE
+#undef X
+ };
+ const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+ const static size_t NumElementsInType = 4;
+ const static size_t MaxTestsPerFunc = 100000;
+
+ assert(Values && NumValues);
+
+ for (size_t f = 0; f < NumFuncs; ++f) {
+ PRNG Index;
+ for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+ v4f32 Value1, Value2;
+ for (size_t j = 0; j < NumElementsInType; ++j) {
+ Value1[j] = Values[Index() % NumValues];
+ Value2[j] = Values[Index() % NumValues];
+ }
+ ++TotalTests;
+ v4si32 ResultSz, ResultLlc;
+ ResultSz = Funcs[f].FuncVectorSz(Value1, Value2);
+ ResultLlc = Funcs[f].FuncVectorLlc(Value1, Value2);
+ if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+ ++Passes;
+ } else {
+ ++Failures;
+ std::cout << Funcs[f].Name << "Vector(" << vectAsString<v4f32>(Value1)
+ << ", " << vectAsString<v4f32>(Value2)
+ << "): sz=" << vectAsString<v4si32>(ResultSz)
+ << " llc=" << vectAsString<v4si32>(ResultLlc) << "\n";
+ }
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ size_t TotalTests = 0;
+ size_t Passes = 0;
+ size_t Failures = 0;
+
+ initializeValues();
+
+ testsScalar(TotalTests, Passes, Failures);
+ testsVector(TotalTests, Passes, Failures);
std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
<< " Failures=" << Failures << "\n";
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 93a872c..1698dfe 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -36,6 +36,18 @@
const size_t InstX8632BrAttributesSize =
llvm::array_lengthof(InstX8632BrAttributes);
+const struct InstX8632CmppsAttributes_ {
+ const char *EmitString;
+} InstX8632CmppsAttributes[] = {
+#define X(tag, emit) \
+ { emit } \
+ ,
+ ICEINSTX8632CMPPS_TABLE
+#undef X
+ };
+const size_t InstX8632CmppsAttributesSize =
+ llvm::array_lengthof(InstX8632CmppsAttributes);
+
const struct TypeX8632Attributes_ {
const char *CvtString; // i (integer), s (single FP), d (double FP)
const char *SdSsString; // ss, sd, or <blank>
@@ -149,6 +161,13 @@
addSource(Source);
}
+InstX8632Cmpps::InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source,
+ InstX8632Cmpps::CmppsCond Condition)
+ : InstX8632(Func, InstX8632::Cmpps, 2, Dest), Condition(Condition) {
+ addSource(Dest);
+ addSource(Source);
+}
+
InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
Variable *Eax, Variable *Desired,
bool Locked)
@@ -695,6 +714,28 @@
dumpSources(Func);
}
+void InstX8632Cmpps::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 2);
+ assert(Condition < InstX8632CmppsAttributesSize);
+ Str << "\t";
+ Str << "cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
+ << "\t";
+ getDest()->emit(Func);
+ Str << ", ";
+ getSrc(1)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Cmpps::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ assert(Condition < InstX8632CmppsAttributesSize);
+ dumpDest(Func);
+ Str << " = cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
+ << "\t";
+ dumpSources(Func);
+}
+
void InstX8632Cmpxchg::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 3);
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index be7aeb5..ece6a0a 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -66,6 +66,18 @@
X(Br_p, "p", "jp") \
//#define X(tag, dump, emit)
+#define ICEINSTX8632CMPPS_TABLE \
+ /* enum value, emit */ \
+ X(Cmpps_eq, "eq") \
+ X(Cmpps_lt, "lt") \
+ X(Cmpps_le, "le") \
+ X(Cmpps_unord, "unord") \
+ X(Cmpps_neq, "neq") \
+ X(Cmpps_nlt, "nlt") \
+ X(Cmpps_nle, "nle") \
+ X(Cmpps_ord, "ord") \
+//#define X(tag, emit)
+
#define ICETYPEX8632_TABLE \
/* tag, element type, cvt, sdss, pack, width */ \
X(IceType_void, IceType_void, "?" , "" , "" , "???") \
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index ce1cc65..6760057 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -145,6 +145,7 @@
Call,
Cdq,
Cmov,
+ Cmpps,
Cmpxchg,
Cmpxchg8b,
Cvt,
@@ -714,6 +715,35 @@
BrCond Condition;
};
+// Cmpps instruction - compare packed single-precision floating point
+// values
+class InstX8632Cmpps : public InstX8632 {
+public:
+ enum CmppsCond {
+#define X(tag, emit) tag,
+ ICEINSTX8632CMPPS_TABLE
+#undef X
+ Cmpps_Invalid
+ };
+
+ static InstX8632Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
+ CmppsCond Condition) {
+ return new (Func->allocate<InstX8632Cmpps>())
+ InstX8632Cmpps(Func, Dest, Source, Condition);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpps); }
+
+private:
+ InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
+ InstX8632Cmpps(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
+ InstX8632Cmpps &operator=(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Cmpps() {}
+
+ CmppsCond Condition;
+};
+
// Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
// equals eax. If so, the ZF is set and <desired> is stored in <dest>.
// If not, ZF is cleared and <dest> is copied to eax (or subregister).
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index c8cf170..4a719d4 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -27,26 +27,38 @@
namespace {
-// The following table summarizes the logic for lowering the fcmp instruction.
-// There is one table entry for each of the 16 conditions. A comment in
-// lowerFcmp() describes the lowering template. In the most general case, there
-// is a compare followed by two conditional branches, because some fcmp
-// conditions don't map to a single x86 conditional branch. However, in many
-// cases it is possible to swap the operands in the comparison and have a single
-// conditional branch. Since it's quite tedious to validate the table by hand,
-// good execution tests are helpful.
-
+// The following table summarizes the logic for lowering the fcmp
+// instruction. There is one table entry for each of the 16 conditions.
+//
+// The first four columns describe the case when the operands are
+// floating point scalar values. A comment in lowerFcmp() describes the
+// lowering template. In the most general case, there is a compare
+// followed by two conditional branches, because some fcmp conditions
+// don't map to a single x86 conditional branch. However, in many cases
+// it is possible to swap the operands in the comparison and have a
+// single conditional branch. Since it's quite tedious to validate the
+// table by hand, good execution tests are helpful.
+//
+// The last two columns describe the case when the operands are vectors
+// of floating point values. For most fcmp conditions, there is a clear
+// mapping to a single x86 cmpps instruction variant. Some fcmp
+// conditions require special code to handle and these are marked in the
+// table with a Cmpps_Invalid predicate.
const struct TableFcmp_ {
uint32_t Default;
- bool SwapOperands;
+ bool SwapScalarOperands;
InstX8632::BrCond C1, C2;
+ bool SwapVectorOperands;
+ InstX8632Cmpps::CmppsCond Predicate;
} TableFcmp[] = {
-#define X(val, dflt, swap, C1, C2) \
- { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \
+#define X(val, dflt, swapS, C1, C2, swapV, pred) \
+ { \
+ dflt, swapS, InstX8632Br::C1, InstX8632Br::C2, swapV, InstX8632Cmpps::pred \
+ } \
,
- FCMPX8632_TABLE
+ FCMPX8632_TABLE
#undef X
- };
+};
const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
// The following table summarizes the logic for lowering the icmp instruction
@@ -138,7 +150,7 @@
// Define a temporary set of enum values based on low-level
// table entries.
enum _tmp_enum {
-#define X(val, dflt, swap, C1, C2) _tmp_##val,
+#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
FCMPX8632_TABLE
#undef X
_num
@@ -149,7 +161,7 @@
#undef X
// Define a set of constants based on low-level table entries,
// and ensure the table entry keys are consistent.
-#define X(val, dflt, swap, C1, C2) \
+#define X(val, dflt, swapS, C1, C2, swapV, pred) \
static const int _table2_##val = _tmp_##val; \
STATIC_ASSERT(_table1_##val == _table2_##val);
FCMPX8632_TABLE;
@@ -2213,6 +2225,68 @@
Operand *Src0 = Inst->getSrc(0);
Operand *Src1 = Inst->getSrc(1);
Variable *Dest = Inst->getDest();
+
+ if (isVectorType(Dest->getType())) {
+ InstFcmp::FCond Condition = Inst->getCondition();
+ size_t Index = static_cast<size_t>(Condition);
+ assert(Index < TableFcmpSize);
+
+ if (TableFcmp[Index].SwapVectorOperands) {
+ Operand *T = Src0;
+ Src0 = Src1;
+ Src1 = T;
+ }
+
+ Variable *T = NULL;
+
+ // ALIGNHACK: Without support for stack alignment, both operands to
+ // cmpps need to be forced into registers. Once support for stack
+ // alignment is implemented, remove LEGAL_HACK.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+ switch (Condition) {
+ default: {
+ InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate;
+ assert(Predicate != InstX8632Cmpps::Cmpps_Invalid);
+ T = makeReg(Src0->getType());
+ _movp(T, Src0);
+ _cmpps(T, LEGAL_HACK(Src1), Predicate);
+ } break;
+ case InstFcmp::False:
+ T = makeVectorOfZeros(Src0->getType());
+ break;
+ case InstFcmp::One: {
+ // Check both unequal and ordered.
+ T = makeReg(Src0->getType());
+ Variable *T2 = makeReg(Src0->getType());
+ Src1 = LEGAL_HACK(Src1);
+ _movp(T, Src0);
+ _cmpps(T, Src1, InstX8632Cmpps::Cmpps_neq);
+ _movp(T2, Src0);
+ _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_ord);
+ _pand(T, T2);
+ } break;
+ case InstFcmp::Ueq: {
+ // Check either equal or unordered.
+ T = makeReg(Src0->getType());
+ Variable *T2 = makeReg(Src0->getType());
+ Src1 = LEGAL_HACK(Src1);
+ _movp(T, Src0);
+ _cmpps(T, Src1, InstX8632Cmpps::Cmpps_eq);
+ _movp(T2, Src0);
+ _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_unord);
+ _por(T, T2);
+ } break;
+ case InstFcmp::True:
+ T = makeVectorOfMinusOnes(IceType_v4i32);
+ break;
+ }
+#undef LEGAL_HACK
+
+ _movp(Dest, T);
+ eliminateNextVectorSextInstruction(Dest);
+ return;
+ }
+
// Lowering a = fcmp cond, b, c
// ucomiss b, c /* only if C1 != Br_None */
// /* but swap b,c order if SwapOperands==true */
@@ -2225,7 +2299,7 @@
InstFcmp::FCond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition);
assert(Index < TableFcmpSize);
- if (TableFcmp[Index].SwapOperands) {
+ if (TableFcmp[Index].SwapScalarOperands) {
Operand *Tmp = Src0;
Src0 = Src1;
Src1 = Tmp;
@@ -2356,26 +2430,7 @@
#undef LEGAL_HACK
_movp(Dest, T);
-
- // The following pattern occurs often in lowered C and C++ code:
- //
- // %cmp = icmp pred <n x ty> %src0, %src1
- // %cmp.ext = sext <n x i1> %cmp to <n x ty>
- //
- // We can avoid the sext operation by copying the result from pcmpgt
- // and pcmpeq, which is already sign extended, to the result of the
- // sext operation
- if (InstCast *NextCast =
- llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
- if (NextCast->getCastKind() == InstCast::Sext &&
- NextCast->getSrc(0) == Dest) {
- _movp(NextCast->getDest(), T);
- // Skip over the instruction.
- NextCast->setDeleted();
- Context.advanceNext();
- }
- }
-
+ eliminateNextVectorSextInstruction(Dest);
return;
}
@@ -3544,6 +3599,28 @@
_br(Inst->getLabelDefault());
}
+// The following pattern occurs often in lowered C and C++ code:
+//
+// %cmp = fcmp/icmp pred <n x ty> %src0, %src1
+// %cmp.ext = sext <n x i1> %cmp to <n x ty>
+//
+// We can eliminate the sext operation by copying the result of pcmpeqd,
+// pcmpgtd, or cmpps (which produce sign extended results) to the result
+// of the sext operation.
+void
+TargetX8632::eliminateNextVectorSextInstruction(Variable *SignExtendedResult) {
+ if (InstCast *NextCast =
+ llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+ if (NextCast->getCastKind() == InstCast::Sext &&
+ NextCast->getSrc(0) == SignExtendedResult) {
+ _movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
+ // Skip over the instruction.
+ NextCast->setDeleted();
+ Context.advanceNext();
+ }
+ }
+}
+
void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
const SizeT MaxSrcs = 0;
Variable *Dest = NULL;
diff --git a/src/IceTargetLoweringX8632.def b/src/IceTargetLoweringX8632.def
index b88091a..5dc1d13 100644
--- a/src/IceTargetLoweringX8632.def
+++ b/src/IceTargetLoweringX8632.def
@@ -15,25 +15,26 @@
#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_DEF
#define SUBZERO_SRC_ICETARGETLOWERINGX8632_DEF
-#define FCMPX8632_TABLE \
- /* val, dflt, swap, C1, C2 */ \
- X(False, 0, 0, Br_None, Br_None) \
- X(Oeq, 0, 0, Br_ne, Br_p) \
- X(Ogt, 1, 0, Br_a, Br_None) \
- X(Oge, 1, 0, Br_ae, Br_None) \
- X(Olt, 1, 1, Br_a, Br_None) \
- X(Ole, 1, 1, Br_ae, Br_None) \
- X(One, 1, 0, Br_ne, Br_None) \
- X(Ord, 1, 0, Br_np, Br_None) \
- X(Ueq, 1, 0, Br_e, Br_None) \
- X(Ugt, 1, 1, Br_b, Br_None) \
- X(Uge, 1, 1, Br_be, Br_None) \
- X(Ult, 1, 0, Br_b, Br_None) \
- X(Ule, 1, 0, Br_be, Br_None) \
- X(Une, 1, 0, Br_ne, Br_p) \
- X(Uno, 1, 0, Br_p, Br_None) \
- X(True, 1, 0, Br_None, Br_None) \
-//#define X(val, dflt, swap, C1, C2)
+#define FCMPX8632_TABLE \
+ /* <---- scalar comparison ----> <- vector comparison -> */ \
+ /* val, dflt, swap, C1, C2, swap, predicate */ \
+ X(False, 0, 0, Br_None, Br_None, 0, Cmpps_Invalid) \
+ X(Oeq, 0, 0, Br_ne, Br_p, 0, Cmpps_eq) \
+ X(Ogt, 1, 0, Br_a, Br_None, 1, Cmpps_lt) \
+ X(Oge, 1, 0, Br_ae, Br_None, 1, Cmpps_le) \
+ X(Olt, 1, 1, Br_a, Br_None, 0, Cmpps_lt) \
+ X(Ole, 1, 1, Br_ae, Br_None, 0, Cmpps_le) \
+ X(One, 1, 0, Br_ne, Br_None, 0, Cmpps_Invalid) \
+ X(Ord, 1, 0, Br_np, Br_None, 0, Cmpps_ord) \
+ X(Ueq, 1, 0, Br_e, Br_None, 0, Cmpps_Invalid) \
+ X(Ugt, 1, 1, Br_b, Br_None, 0, Cmpps_nle) \
+ X(Uge, 1, 1, Br_be, Br_None, 0, Cmpps_nlt) \
+ X(Ult, 1, 0, Br_b, Br_None, 1, Cmpps_nle) \
+ X(Ule, 1, 0, Br_be, Br_None, 1, Cmpps_nlt) \
+ X(Une, 1, 0, Br_ne, Br_p, 0, Cmpps_neq) \
+ X(Uno, 1, 0, Br_p, Br_None, 0, Cmpps_unord) \
+ X(True, 1, 0, Br_None, Br_None, 0, Cmpps_Invalid) \
+//#define X(val, dflt, swapS, C1, C2, swapV, pred)
#define ICMPX8632_TABLE \
/* val, C_32, C1_64, C2_64, C3_64 */ \
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index cbc98ce..6f09a90 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -108,6 +108,8 @@
void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
Variable *Dest, Operand *Ptr, Operand *Val);
+ void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
+
// Operand legalization helpers. To deal with address mode
// constraints, the helpers will create a new Operand and emit
// instructions that guarantee that the Operand kind is one of those
@@ -212,6 +214,10 @@
void _cmp(Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
}
+ void _cmpps(Variable *Dest, Operand *Src0,
+ InstX8632Cmpps::CmppsCond Condition) {
+ Context.insert(InstX8632Cmpps::create(Func, Dest, Src0, Condition));
+ }
void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
bool Locked) {
Context.insert(
diff --git a/tests_lit/llvm2ice_tests/vector-fcmp.ll b/tests_lit/llvm2ice_tests/vector-fcmp.ll
new file mode 100644
index 0000000..7fb717a
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-fcmp.ll
@@ -0,0 +1,178 @@
+; This file checks support for comparing vector values with the fcmp
+; instruction.
+
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN: | FileCheck --check-prefix=DUMP %s
+
+; Check that sext elimination occurs when the result of the comparison
+; instruction is already sign extended. Sign extension to 4 x i32 uses
+; the pslld instruction.
+define <4 x i32> @sextElimination(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp oeq <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: sextElimination:
+; CHECK: cmpeqps
+; CHECK-NOT: pslld
+}
+
+define <4 x i32> @fcmpFalseVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp false <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpFalseVector:
+; CHECK: pxor
+}
+
+define <4 x i32> @fcmpOeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp oeq <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOeqVector:
+; CHECK: cmpeqps
+}
+
+define <4 x i32> @fcmpOgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp oge <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgeVector:
+; CHECK: cmpleps
+}
+
+define <4 x i32> @fcmpOgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ogt <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOgtVector:
+; CHECK: cmpltps
+}
+
+define <4 x i32> @fcmpOleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ole <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOleVector:
+; CHECK: cmpleps
+}
+
+define <4 x i32> @fcmpOltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp olt <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOltVector:
+; CHECK: cmpltps
+}
+
+define <4 x i32> @fcmpOneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp one <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOneVector:
+; CHECK: cmpneqps
+; CHECK: cmpordps
+; CHECK: pand
+}
+
+define <4 x i32> @fcmpOrdVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ord <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpOrdVector:
+; CHECK: cmpordps
+}
+
+define <4 x i32> @fcmpTrueVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp true <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpTrueVector:
+; CHECK: pcmpeqd
+}
+
+define <4 x i32> @fcmpUeqVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ueq <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUeqVector:
+; CHECK: cmpeqps
+; CHECK: cmpunordps
+; CHECK: por
+}
+
+define <4 x i32> @fcmpUgeVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp uge <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgeVector:
+; CHECK: cmpnltps
+}
+
+define <4 x i32> @fcmpUgtVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ugt <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUgtVector:
+; CHECK: cmpnleps
+}
+
+define <4 x i32> @fcmpUleVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ule <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUleVector:
+; CHECK: cmpnltps
+}
+
+define <4 x i32> @fcmpUltVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp ult <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUltVector:
+; CHECK: cmpnleps
+}
+
+define <4 x i32> @fcmpUneVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp une <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUneVector:
+; CHECK: cmpneqps
+}
+
+define <4 x i32> @fcmpUnoVector(<4 x float> %a, <4 x float> %b) {
+entry:
+ %res.trunc = fcmp uno <4 x float> %a, %b
+ %res = sext <4 x i1> %res.trunc to <4 x i32>
+ ret <4 x i32> %res
+; CHECK-LABEL: fcmpUnoVector:
+; CHECK: cmpunordps
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
+