Lower the fcmp instruction for <4 x float> operands.

Most fcmp conditions map directly to a single x86 cmpps instruction; for
these, the lowering is table-driven. The remaining conditions (False, One,
Ueq, True) are lowered by special-case code.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/413053002
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index c8cf170..4a719d4 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -27,26 +27,38 @@
 
 namespace {
 
-// The following table summarizes the logic for lowering the fcmp instruction.
-// There is one table entry for each of the 16 conditions.  A comment in
-// lowerFcmp() describes the lowering template.  In the most general case, there
-// is a compare followed by two conditional branches, because some fcmp
-// conditions don't map to a single x86 conditional branch.  However, in many
-// cases it is possible to swap the operands in the comparison and have a single
-// conditional branch.  Since it's quite tedious to validate the table by hand,
-// good execution tests are helpful.
-
+// The following table summarizes the logic for lowering the fcmp
+// instruction.  There is one table entry for each of the 16 conditions.
+//
+// The first four columns describe the case when the operands are
+// floating point scalar values.  A comment in lowerFcmp() describes the
+// lowering template.  In the most general case, there is a compare
+// followed by two conditional branches, because some fcmp conditions
+// don't map to a single x86 conditional branch.  However, in many cases
+// it is possible to swap the operands in the comparison and have a
+// single conditional branch.  Since it's quite tedious to validate the
+// table by hand, good execution tests are helpful.
+//
+// The last two columns describe the case when the operands are vectors
+// of floating point values.  For most fcmp conditions, there is a clear
+// mapping to a single x86 cmpps instruction variant.  Some fcmp
+// conditions require special-case handling; these are marked in the
+// table with a Cmpps_Invalid predicate.
 const struct TableFcmp_ {
   uint32_t Default;
-  bool SwapOperands;
+  bool SwapScalarOperands;
   InstX8632::BrCond C1, C2;
+  bool SwapVectorOperands;
+  InstX8632Cmpps::CmppsCond Predicate;
 } TableFcmp[] = {
-#define X(val, dflt, swap, C1, C2)                                             \
-  { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 }                             \
+#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
+  {                                                                            \
+    dflt, swapS, InstX8632Br::C1, InstX8632Br::C2, swapV, InstX8632Cmpps::pred \
+  }                                                                            \
   ,
-    FCMPX8632_TABLE
+      FCMPX8632_TABLE
 #undef X
-  };
+};
 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
 
 // The following table summarizes the logic for lowering the icmp instruction
@@ -138,7 +150,7 @@
     // Define a temporary set of enum values based on low-level
     // table entries.
     enum _tmp_enum {
-#define X(val, dflt, swap, C1, C2) _tmp_##val,
+#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
       FCMPX8632_TABLE
 #undef X
           _num
@@ -149,7 +161,7 @@
 #undef X
 // Define a set of constants based on low-level table entries,
 // and ensure the table entry keys are consistent.
-#define X(val, dflt, swap, C1, C2)                                             \
+#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
   static const int _table2_##val = _tmp_##val;                                 \
   STATIC_ASSERT(_table1_##val == _table2_##val);
     FCMPX8632_TABLE;
@@ -2213,6 +2225,68 @@
   Operand *Src0 = Inst->getSrc(0);
   Operand *Src1 = Inst->getSrc(1);
   Variable *Dest = Inst->getDest();
+
+  if (isVectorType(Dest->getType())) {
+    InstFcmp::FCond Condition = Inst->getCondition();
+    size_t Index = static_cast<size_t>(Condition);
+    assert(Index < TableFcmpSize);
+
+    if (TableFcmp[Index].SwapVectorOperands) {
+      Operand *T = Src0;
+      Src0 = Src1;
+      Src1 = T;
+    }
+
+    Variable *T = NULL;
+
+    // ALIGNHACK: Without support for stack alignment, both operands to
+    // cmpps need to be forced into registers.  Once support for stack
+    // alignment is implemented, remove LEGAL_HACK.
+#define LEGAL_HACK(Vect) legalizeToVar((Vect))
+    switch (Condition) {
+    default: {
+      InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate;
+      assert(Predicate != InstX8632Cmpps::Cmpps_Invalid);
+      T = makeReg(Src0->getType());
+      _movp(T, Src0);
+      _cmpps(T, LEGAL_HACK(Src1), Predicate);
+    } break;
+    case InstFcmp::False:
+      T = makeVectorOfZeros(Src0->getType());
+      break;
+    case InstFcmp::One: {
+      // Check both unequal and ordered.
+      T = makeReg(Src0->getType());
+      Variable *T2 = makeReg(Src0->getType());
+      Src1 = LEGAL_HACK(Src1);
+      _movp(T, Src0);
+      _cmpps(T, Src1, InstX8632Cmpps::Cmpps_neq);
+      _movp(T2, Src0);
+      _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_ord);
+      _pand(T, T2);
+    } break;
+    case InstFcmp::Ueq: {
+      // Check either equal or unordered.
+      T = makeReg(Src0->getType());
+      Variable *T2 = makeReg(Src0->getType());
+      Src1 = LEGAL_HACK(Src1);
+      _movp(T, Src0);
+      _cmpps(T, Src1, InstX8632Cmpps::Cmpps_eq);
+      _movp(T2, Src0);
+      _cmpps(T2, Src1, InstX8632Cmpps::Cmpps_unord);
+      _por(T, T2);
+    } break;
+    case InstFcmp::True:
+      T = makeVectorOfMinusOnes(IceType_v4i32);
+      break;
+    }
+#undef LEGAL_HACK
+
+    _movp(Dest, T);
+    eliminateNextVectorSextInstruction(Dest);
+    return;
+  }
+
   // Lowering a = fcmp cond, b, c
   //   ucomiss b, c       /* only if C1 != Br_None */
   //                      /* but swap b,c order if SwapOperands==true */
@@ -2225,7 +2299,7 @@
   InstFcmp::FCond Condition = Inst->getCondition();
   size_t Index = static_cast<size_t>(Condition);
   assert(Index < TableFcmpSize);
-  if (TableFcmp[Index].SwapOperands) {
+  if (TableFcmp[Index].SwapScalarOperands) {
     Operand *Tmp = Src0;
     Src0 = Src1;
     Src1 = Tmp;
@@ -2356,26 +2430,7 @@
 #undef LEGAL_HACK
 
     _movp(Dest, T);
-
-    // The following pattern occurs often in lowered C and C++ code:
-    //
-    //   %cmp     = icmp pred <n x ty> %src0, %src1
-    //   %cmp.ext = sext <n x i1> %cmp to <n x ty>
-    //
-    // We can avoid the sext operation by copying the result from pcmpgt
-    // and pcmpeq, which is already sign extended, to the result of the
-    // sext operation
-    if (InstCast *NextCast =
-            llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
-      if (NextCast->getCastKind() == InstCast::Sext &&
-          NextCast->getSrc(0) == Dest) {
-        _movp(NextCast->getDest(), T);
-        // Skip over the instruction.
-        NextCast->setDeleted();
-        Context.advanceNext();
-      }
-    }
-
+    eliminateNextVectorSextInstruction(Dest);
     return;
   }
 
@@ -3544,6 +3599,28 @@
   _br(Inst->getLabelDefault());
 }
 
+// The following pattern occurs often in lowered C and C++ code:
+//
+//   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
+//   %cmp.ext = sext <n x i1> %cmp to <n x ty>
+//
+// We can eliminate the sext operation by copying the result of pcmpeqd,
+// pcmpgtd, or cmpps (which produce sign extended results) to the result
+// of the sext operation.
+void
+TargetX8632::eliminateNextVectorSextInstruction(Variable *SignExtendedResult) {
+  if (InstCast *NextCast =
+          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
+    if (NextCast->getCastKind() == InstCast::Sext &&
+        NextCast->getSrc(0) == SignExtendedResult) {
+      _movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
+      // Skip over the instruction.
+      NextCast->setDeleted();
+      Context.advanceNext();
+    }
+  }
+}
+
 void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) {
   const SizeT MaxSrcs = 0;
   Variable *Dest = NULL;