Lower the fcmp instruction for <4 x float> operands.

Most fcmp conditions map directly to single x86 instructions. For
these, the lowering is table driven.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/413053002
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 93a872c..1698dfe 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -36,6 +36,18 @@
 const size_t InstX8632BrAttributesSize =
     llvm::array_lengthof(InstX8632BrAttributes);
 
+const struct InstX8632CmppsAttributes_ {
+  const char *EmitString;
+} InstX8632CmppsAttributes[] = {
+#define X(tag, emit)                                                           \
+  { emit }                                                                     \
+  ,
+    ICEINSTX8632CMPPS_TABLE
+#undef X
+  };
+const size_t InstX8632CmppsAttributesSize =
+    llvm::array_lengthof(InstX8632CmppsAttributes);
+
 const struct TypeX8632Attributes_ {
   const char *CvtString;   // i (integer), s (single FP), d (double FP)
   const char *SdSsString;  // ss, sd, or <blank>
@@ -149,6 +161,13 @@
   addSource(Source);
 }
 
+InstX8632Cmpps::InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source,
+                               InstX8632Cmpps::CmppsCond Condition)
+    : InstX8632(Func, InstX8632::Cmpps, 2, Dest), Condition(Condition) {
+  addSource(Dest);
+  addSource(Source);
+}
+
 InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
                                    Variable *Eax, Variable *Desired,
                                    bool Locked)
@@ -695,6 +714,28 @@
   dumpSources(Func);
 }
 
+void InstX8632Cmpps::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  assert(Condition < InstX8632CmppsAttributesSize);
+  Str << "\t";
+  Str << "cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
+      << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+void InstX8632Cmpps::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  assert(Condition < InstX8632CmppsAttributesSize);
+  dumpDest(Func);
+  Str << " = cmp" << InstX8632CmppsAttributes[Condition].EmitString << "ps"
+      << "\t";
+  dumpSources(Func);
+}
+
 void InstX8632Cmpxchg::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 3);
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index be7aeb5..ece6a0a 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -66,6 +66,18 @@
   X(Br_p,        "p",  "jp")   \
 //#define X(tag, dump, emit)
 
+#define ICEINSTX8632CMPPS_TABLE \
+  /* enum value, emit */        \
+  X(Cmpps_eq,    "eq")          \
+  X(Cmpps_lt,    "lt")          \
+  X(Cmpps_le,    "le")          \
+  X(Cmpps_unord, "unord")       \
+  X(Cmpps_neq,   "neq")         \
+  X(Cmpps_nlt,   "nlt")         \
+  X(Cmpps_nle,   "nle")         \
+  X(Cmpps_ord,   "ord")         \
+//#define X(tag, emit)
+
 #define ICETYPEX8632_TABLE                                        \
   /* tag,          element type, cvt, sdss,  pack, width */       \
   X(IceType_void,  IceType_void, "?" , ""  , "" ,  "???")         \
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index ce1cc65..6760057 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -145,6 +145,7 @@
     Call,
     Cdq,
     Cmov,
+    Cmpps,
     Cmpxchg,
     Cmpxchg8b,
     Cvt,
@@ -714,6 +715,35 @@
   BrCond Condition;
 };
 
+// Cmpps instruction - compare packed single-precision floating point
+// values
+class InstX8632Cmpps : public InstX8632 {
+public:
+  enum CmppsCond {
+#define X(tag, emit) tag,
+    ICEINSTX8632CMPPS_TABLE
+#undef X
+    Cmpps_Invalid
+  };
+
+  static InstX8632Cmpps *create(Cfg *Func, Variable *Dest, Operand *Source,
+                                CmppsCond Condition) {
+    return new (Func->allocate<InstX8632Cmpps>())
+        InstX8632Cmpps(Func, Dest, Source, Condition);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpps); }
+
+private:
+  InstX8632Cmpps(Cfg *Func, Variable *Dest, Operand *Source, CmppsCond Cond);
+  InstX8632Cmpps(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
+  InstX8632Cmpps &operator=(const InstX8632Cmpps &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cmpps() {}
+
+  CmppsCond Condition;
+};
+
 // Cmpxchg instruction - cmpxchg <dest>, <desired> will compare if <dest>
 // equals eax. If so, the ZF is set and <desired> is stored in <dest>.
 // If not, ZF is cleared and <dest> is copied to eax (or subregister).
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index c8cf170..4a719d4 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -27,26 +27,38 @@
 
 namespace {
 
-// The following table summarizes the logic for lowering the fcmp instruction.
-// There is one table entry for each of the 16 conditions.  A comment in
-// lowerFcmp() describes the lowering template.  In the most general case, there
-// is a compare followed by two conditional branches, because some fcmp
-// conditions don't map to a single x86 conditional branch.  However, in many
-// cases it is possible to swap the operands in the comparison and have a single
-// conditional branch.  Since it's quite tedious to validate the table by hand,
-// good execution tests are helpful.
-
+// The following table summarizes the logic for lowering the fcmp
+// instruction.  There is one table entry for each of the 16 conditions.
+//
+// The first four columns describe the case when the operands are
+// floating point scalar values.  A comment in lowerFcmp() describes the
+// lowering template.  In the most general case, there is a compare
+// followed by two conditional branches, because some fcmp conditions
+// don't map to a single x86 conditional branch.  However, in many cases
+// it is possible to swap the operands in the comparison and have a
+// single conditional branch.  Since it's quite tedious to validate the
+// table by hand, good execution tests are helpful.
+//
+// The last two columns describe the case when the operands are vectors
+// of floating point values.  For most fcmp conditions, there is a clear
+// mapping to a single x86 cmpps instruction variant.  Some fcmp
+// conditions require special code to handle and these are marked in the
+// table with a Cmpps_Invalid predicate.
 const struct TableFcmp_ {
   uint32_t Default;
-  bool SwapOperands;
+  bool SwapScalarOperands;
   InstX8632::BrCond C1, C2;
+  bool SwapVectorOperands;
+  InstX8632Cmpps::CmppsCond Predicate;
 } TableFcmp[] = {
-#define X(val, dflt, swap, C1, C2)                                             \
-  { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 }                             \
+#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
+  {                                                                            \
+    dflt, swapS, InstX8632Br::C1, InstX8632Br::C2, swapV, InstX8632Cmpps::pred \
+  }                                                                            \
   ,
-    FCMPX8632_TABLE
+      FCMPX8632_TABLE
 #undef X
-  };
+};
 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
 
 // The following table summarizes the logic for lowering the icmp instruction
@@ -138,7 +150,7 @@
     // Define a temporary set of enum values based on low-level
     // table entries.
     enum _tmp_enum {
-#define X(val, dflt, swap, C1, C2) _tmp_##val,
+#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
       FCMPX8632_TABLE
 #undef X
           _num
@@ -149,7 +161,7 @@
 #undef X
 // Define a set of constants based on low-level table entries,
 // and ensure the table entry keys are consistent.
-#define X(val, dflt, swap, C1, C2)                                             \
+#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
   static const int _table2_##val = _tmp_##val;                                 \
   STATIC_ASSERT(_table1_##val == _table2_##val);
     FCMPX8632_TABLE;
@@ -2213,6 +2225,68 @@
   Operand *Src0 = Inst->getSrc(0);
   Operand *Src1 = Inst->getSrc(1);
   Variable *Dest = Inst->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    InstFcmp::FCond Condition = Inst->getCondition();
+    size_t Index = static_cast<size_t>(Condition);
+    assert(Index < TableFcmpSize);
+
+    if (TableFcmp[Index].SwapVectorOperands) {
+      Operand *T = Src0;
+      Src0 = Src1;
+      Src1 = T;
+    }
+
+    Variable *T = NULL;
+
+    // ALIGNHACK: Without support for stack alignment, both operands to
+    // cmpps need to be forced into registers.  Once support for stack
+    // alignment is implemented, remove LEGAL_HACK.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+    switch (Condition) {
+    default: {
+      InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate;
+      assert(Predicate != InstX8632Cmpps::Cmpps_Invalid);
+      T = makeReg(Src0->getType());
+      _movp(T, Src0);
+      _cmpps(T, LEGAL_HACK(Src1), Predicate);
+    } break;
+    case InstFcmp::False:
+      T = makeVectorOfZeros(Src0->getType());
+      break;
+    case InstFcmp::One: {
+      // Check both unequal and ordered.
+      T = makeReg(Src0->getType());
+      Variable *T2 = makeReg(Src0->getType());
+      Src1 = LEGAL_HACK(Src1);
+      _movp(T, Src0);
+      _cmpps(T, Src1, InstX8632Cmpps::Cmpps_neq);
+      _movp(T2, Src0);
+      _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_ord);
+      _pand(T, T2);
+    } break;
+    case InstFcmp::Ueq: {
+      // Check either equal or unordered.
+      T = makeReg(Src0->getType());
+      Variable *T2 = makeReg(Src0->getType());
+      Src1 = LEGAL_HACK(Src1);
+      _movp(T, Src0);
+      _cmpps(T, Src1, InstX8632Cmpps::Cmpps_eq);
+      _movp(T2, Src0);
+      _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_unord);
+      _por(T, T2);
+    } break;
+    case InstFcmp::True:
+      T = makeVectorOfMinusOnes(IceType_v4i32);
+      break;
+    }
+#undef LEGAL_HACK
+
+    _movp(Dest, T);
+    eliminateNextVectorSextInstruction(Dest);
+    return;
+  }
+
   // Lowering a = fcmp cond, b, c
   //   ucomiss b, c       /* only if C1 != Br_None */
   //                      /* but swap b,c order if SwapOperands==true */
@@ -2225,7 +2299,7 @@
   InstFcmp::FCond Condition = Inst->getCondition();
   size_t Index = static_cast<size_t>(Condition);
   assert(Index < TableFcmpSize);
-  if (TableFcmp[Index].SwapOperands) {
+  if (TableFcmp[Index].SwapScalarOperands) {
     Operand *Tmp = Src0;
     Src0 = Src1;
     Src1 = Tmp;
@@ -2356,26 +2430,7 @@
 #undef LEGAL_HACK
 
     _movp(Dest, T);
-
-    // The following pattern occurs often in lowered C and C++ code:
-    //
-    //   %cmp     = icmp pred <n x ty> %src0, %src1
-    //   %cmp.ext = sext <n x i1> %cmp to <n x ty>
-    //
-    // We can avoid the sext operation by copying the result from pcmpgt
-    // and pcmpeq, which is already sign extended, to the result of the
-    // sext operation
-    if (InstCast *NextCast =
-            llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
-      if (NextCast->getCastKind() == InstCast::Sext &&
-          NextCast->getSrc(0) == Dest) {
-        _movp(NextCast->getDest(), T);
-        // Skip over the instruction.
-        NextCast->setDeleted();
-        Context.advanceNext();
-      }
-    }
-
+    eliminateNextVectorSextInstruction(Dest);
     return;
   }
 
@@ -3544,6 +3599,28 @@
   _br(Inst->getLabelDefault());
 }
 
+// The following pattern occurs often in lowered C and C++ code:
+//
+//   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
+//   %cmp.ext = sext <n x i1> %cmp to <n x ty>
+//
+// We can eliminate the sext operation by copying the result of pcmpeqd,
+// pcmpgtd, or cmpps (which produce sign extended results) to the result
+// of the sext operation.
+void
+TargetX8632::eliminateNextVectorSextInstruction(Variable *SignExtendedResult) {
+  if (InstCast *NextCast =
+          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+    if (NextCast->getCastKind() == InstCast::Sext &&
+        NextCast->getSrc(0) == SignExtendedResult) {
+      _movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
+      // Skip over the instruction.
+      NextCast->setDeleted();
+      Context.advanceNext();
+    }
+  }
+}
+
 void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
   const SizeT MaxSrcs = 0;
   Variable *Dest = NULL;
diff --git a/src/IceTargetLoweringX8632.def b/src/IceTargetLoweringX8632.def
index b88091a..5dc1d13 100644
--- a/src/IceTargetLoweringX8632.def
+++ b/src/IceTargetLoweringX8632.def
@@ -15,25 +15,26 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_DEF
 #define SUBZERO_SRC_ICETARGETLOWERINGX8632_DEF
 
-#define FCMPX8632_TABLE                  \
-  /* val,  dflt, swap, C1,      C2 */    \
-  X(False, 0,    0,    Br_None, Br_None) \
-  X(Oeq,   0,    0,    Br_ne,   Br_p)    \
-  X(Ogt,   1,    0,    Br_a,    Br_None) \
-  X(Oge,   1,    0,    Br_ae,   Br_None) \
-  X(Olt,   1,    1,    Br_a,    Br_None) \
-  X(Ole,   1,    1,    Br_ae,   Br_None) \
-  X(One,   1,    0,    Br_ne,   Br_None) \
-  X(Ord,   1,    0,    Br_np,   Br_None) \
-  X(Ueq,   1,    0,    Br_e,    Br_None) \
-  X(Ugt,   1,    1,    Br_b,    Br_None) \
-  X(Uge,   1,    1,    Br_be,   Br_None) \
-  X(Ult,   1,    0,    Br_b,    Br_None) \
-  X(Ule,   1,    0,    Br_be,   Br_None) \
-  X(Une,   1,    0,    Br_ne,   Br_p)    \
-  X(Uno,   1,    0,    Br_p,    Br_None) \
-  X(True,  1,    0,    Br_None, Br_None) \
-//#define X(val, dflt, swap, C1, C2)
+#define FCMPX8632_TABLE                                              \
+  /*       <---- scalar comparison ---->  <- vector comparison -> */ \
+  /* val,  dflt, swap, C1,      C2,       swap,  predicate        */ \
+  X(False, 0,    0,    Br_None, Br_None,  0,     Cmpps_Invalid)      \
+  X(Oeq,   0,    0,    Br_ne,   Br_p,     0,     Cmpps_eq)           \
+  X(Ogt,   1,    0,    Br_a,    Br_None,  1,     Cmpps_lt)           \
+  X(Oge,   1,    0,    Br_ae,   Br_None,  1,     Cmpps_le)           \
+  X(Olt,   1,    1,    Br_a,    Br_None,  0,     Cmpps_lt)           \
+  X(Ole,   1,    1,    Br_ae,   Br_None,  0,     Cmpps_le)           \
+  X(One,   1,    0,    Br_ne,   Br_None,  0,     Cmpps_Invalid)      \
+  X(Ord,   1,    0,    Br_np,   Br_None,  0,     Cmpps_ord)          \
+  X(Ueq,   1,    0,    Br_e,    Br_None,  0,     Cmpps_Invalid)      \
+  X(Ugt,   1,    1,    Br_b,    Br_None,  0,     Cmpps_nle)          \
+  X(Uge,   1,    1,    Br_be,   Br_None,  0,     Cmpps_nlt)          \
+  X(Ult,   1,    0,    Br_b,    Br_None,  1,     Cmpps_nle)          \
+  X(Ule,   1,    0,    Br_be,   Br_None,  1,     Cmpps_nlt)          \
+  X(Une,   1,    0,    Br_ne,   Br_p,     0,     Cmpps_neq)          \
+  X(Uno,   1,    0,    Br_p,    Br_None,  0,     Cmpps_unord)        \
+  X(True,  1,    0,    Br_None, Br_None,  0,     Cmpps_Invalid)      \
+//#define X(val, dflt, swapS, C1, C2, swapV, pred)
 
 #define ICMPX8632_TABLE                     \
   /* val, C_32,  C1_64,   C2_64,   C3_64 */ \
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index cbc98ce..6f09a90 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -108,6 +108,8 @@
   void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
                                 Variable *Dest, Operand *Ptr, Operand *Val);
 
+  void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
+
   // Operand legalization helpers.  To deal with address mode
   // constraints, the helpers will create a new Operand and emit
   // instructions that guarantee that the Operand kind is one of those
@@ -212,6 +214,10 @@
   void _cmp(Operand *Src0, Operand *Src1) {
     Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
   }
+  void _cmpps(Variable *Dest, Operand *Src0,
+              InstX8632Cmpps::CmppsCond Condition) {
+    Context.insert(InstX8632Cmpps::create(Func, Dest, Src0, Condition));
+  }
   void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
                 bool Locked) {
     Context.insert(