Subzero: Use scalar arithmetic when no vector instruction exists.

Implement scalarizeArithmetic() which extracts the components of the
input vectors, performs the operation with scalar instructions, and
builds the output vector component by component.

Fix the lowering of sdiv and srem.  These previously emitted the wrong
sign-extension instruction (cdq) for i8 and i16 inputs, which need cbw
and cwd respectively.

In the test_arith crosstest, mask the inputs to vector shift
operations to ensure that the shifts are in range.  Otherwise the
Subzero output is not identical to the llc output in some (undefined)
cases.

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/443203003
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 8e56a10..26d11b9 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1296,78 +1296,18 @@
         _movp(Dest, T4);
       } else {
         assert(Dest->getType() == IceType_v16i8);
-        // Sz_mul_v16i8
-        const IceString Helper = "Sz_mul_v16i8";
-        const SizeT MaxSrcs = 2;
-        InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-        Call->addArg(Src0);
-        Call->addArg(Src1);
-        lowerCall(Call);
+        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       }
     } break;
-    case InstArithmetic::Shl: {
-      // Sz_shl_v4i32, Sz_shl_v8i16, Sz_shl_v16i8
-      const IceString Helper = "Sz_shl_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Lshr: {
-      // Sz_lshr_v4i32, Sz_lshr_v8i16, Sz_lshr_v16i8
-      const IceString Helper = "Sz_lshr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Ashr: {
-      // Sz_ashr_v4i32, Sz_ashr_v8i16, Sz_ashr_v16i8
-      const IceString Helper = "Sz_ashr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Udiv: {
-      // Sz_udiv_v4i32, Sz_udiv_v8i16, Sz_udiv_v16i8
-      const IceString Helper = "Sz_udiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Sdiv: {
-      // Sz_sdiv_v4i32, Sz_sdiv_v8i16, Sz_sdiv_v16i8
-      const IceString Helper = "Sz_sdiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Urem: {
-      // Sz_urem_v4i32, Sz_urem_v8i16, Sz_urem_v16i8
-      const IceString Helper = "Sz_urem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Srem: {
-      // Sz_srem_v4i32, Sz_srem_v8i16, Sz_srem_v16i8
-      const IceString Helper = "Sz_srem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Srem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
     case InstArithmetic::Fadd: {
       Variable *T = makeReg(Dest->getType());
       _movp(T, Src0);
@@ -1392,13 +1332,9 @@
       _divps(T, LEGAL_HACK(Src1));
       _movp(Dest, T);
     } break;
-    case InstArithmetic::Frem: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall("Sz_frem_v4f32", Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Frem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
     }
 #undef LEGAL_HACK
   } else { // Dest->getType() is non-i64 scalar
@@ -1490,11 +1426,18 @@
       break;
     case InstArithmetic::Sdiv:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      T_edx = makeReg(IceType_i32, Reg_edx);
-      _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
-      _idiv(T, Src1, T_edx);
-      _mov(Dest, T);
+      if (Dest->getType() == IceType_i8) {
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        _idiv(T, Src1, T);
+        _mov(Dest, T);
+      } else {
+        T_edx = makeReg(IceType_i32, Reg_edx);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T_edx, T);
+        _idiv(T, Src1, T_edx);
+        _mov(Dest, T);
+      }
       break;
     case InstArithmetic::Urem:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
@@ -1515,11 +1458,20 @@
       break;
     case InstArithmetic::Srem:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      T_edx = makeReg(IceType_i32, Reg_edx);
-      _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
+      if (Dest->getType() == IceType_i8) {
+        Variable *T_ah = makeReg(IceType_i8, Reg_ah);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        Context.insert(InstFakeDef::create(Func, T_ah));
+        _idiv(T_ah, Src1, T);
+        _mov(Dest, T_ah);
+      } else {
+        T_edx = makeReg(IceType_i32, Reg_edx);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T_edx, T);
+        _idiv(T_edx, Src1, T);
+        _mov(Dest, T_edx);
+      }
       break;
     case InstArithmetic::Fadd:
       _mov(T, Src0);
@@ -3744,6 +3696,39 @@
   _br(Inst->getLabelDefault());
 }
 
+void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
+                                      Variable *Dest, Operand *Src0,
+                                      Operand *Src1) {
+  assert(isVectorType(Dest->getType()));
+  Type Ty = Dest->getType();
+  Type ElementTy = typeElementType(Ty);
+  SizeT NumElements = typeNumElements(Ty);
+
+  Operand *T = Ctx->getConstantUndef(Ty); // Result so far; starts as undef.
+  for (SizeT I = 0; I < NumElements; ++I) {
+    Constant *Index = Ctx->getConstantInt(IceType_i32, I);
+
+    // Extract element I of Src0 and of Src1 into scalar temporaries.
+    Variable *Op0 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
+    Variable *Op1 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
+
+    // Perform the arithmetic as a scalar operation on the two elements.
+    Variable *Res = Func->makeVariable(ElementTy, Context.getNode());
+    lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
+
+    // Insert the scalar result at index I of the vector built so far.
+    Variable *DestT = Func->makeVariable(Ty, Context.getNode());
+    lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
+    T = DestT; // The next iteration inserts into this updated vector.
+    // TODO(stichnot): Use postLower() in -Om1 mode to avoid buildup of
+    // infinite weight temporaries.
+  }
+
+  lowerAssign(InstAssign::create(Func, Dest, T)); // Copy the result to Dest.
+}
+
 // The following pattern occurs often in lowered C and C++ code:
 //
 //   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1