Subzero: Use scalar arithmetic when no vector instruction exists. Implement scalarizeArithmetic() which extracts the components of the input vectors, performs the operation with scalar instructions, and builds the output vector component by component. Fix the lowering of sdiv and srem. These were previously emitting a wrong instruction (cdq) for i8 and i16 inputs (needing cbw, cwd). In the test_arith crosstest, mask the inputs to vector shift operations to ensure that the shifts are in range. Otherwise the Subzero output is not identical to the llc output in some (undefined) cases. BUG=none R=stichnot@chromium.org Review URL: https://codereview.chromium.org/443203003

diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index cf99e844..457f56e 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp

@@ -136,11 +136,8 @@
   addSource(CallTarget);
 }
 
-InstX8632Cdq::InstX8632Cdq(Cfg *Func, Variable *Dest, Operand *Source)
-    : InstX8632(Func, InstX8632::Cdq, 1, Dest) {
-  assert(Dest->getRegNum() == TargetX8632::Reg_edx);
-  assert(llvm::isa<Variable>(Source));
-  assert(llvm::dyn_cast<Variable>(Source)->getRegNum() == TargetX8632::Reg_eax);
+InstX8632Cbwdq::InstX8632Cbwdq(Cfg *Func, Variable *Dest, Operand *Source)
+    : InstX8632(Func, InstX8632::Cbwdq, 1, Dest) {
   addSource(Source);
 }
 
@@ -721,16 +718,35 @@
   dumpSources(Func);
 }
 
-void InstX8632Cdq::emit(const Cfg *Func) const {
+void InstX8632Cbwdq::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
-  Str << "\tcdq\n";
+  Operand *Src0 = getSrc(0);
+  assert(llvm::isa<Variable>(Src0));
+  assert(llvm::cast<Variable>(Src0)->getRegNum() == TargetX8632::Reg_eax);
+  switch (Src0->getType()) {
+  default:
+    llvm_unreachable("unexpected source type!");
+    break;
+  case IceType_i8:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_eax);
+    Str << "\tcbw\n";
+    break;
+  case IceType_i16:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_edx);
+    Str << "\tcwd\n";
+    break;
+  case IceType_i32:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_edx);
+    Str << "\tcdq\n";
+    break;
+  }
 }
 
-void InstX8632Cdq::dump(const Cfg *Func) const {
+void InstX8632Cbwdq::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
-  Str << " = cdq." << getSrc(0)->getType() << " ";
+  Str << " = cbw/cwd/cdq." << getSrc(0)->getType() << " ";
   dumpSources(Func);
 }
 

diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 18f0dde..1aa7909 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h

@@ -144,7 +144,7 @@
     Bsr,
     Bswap,
     Call,
-    Cdq,
+    Cbwdq,
     Cmov,
     Cmpps,
     Cmpxchg,
@@ -689,22 +689,22 @@
   virtual ~InstX8632Shrd() {}
 };
 
-// Cdq instruction - sign-extend eax into edx
-class InstX8632Cdq : public InstX8632 {
+// Cbdwq instruction - wrapper for cbw, cwd, or cdq
+class InstX8632Cbwdq : public InstX8632 {
 public:
-  static InstX8632Cdq *create(Cfg *Func, Variable *Dest, Operand *Source) {
-    return new (Func->allocate<InstX8632Cdq>())
-        InstX8632Cdq(Func, Dest, Source);
+  static InstX8632Cbwdq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX8632Cbwdq>())
+        InstX8632Cbwdq(Func, Dest, Source);
   }
   virtual void emit(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Cdq); }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cbwdq); }
 
 private:
-  InstX8632Cdq(Cfg *Func, Variable *Dest, Operand *Source);
-  InstX8632Cdq(const InstX8632Cdq &) LLVM_DELETED_FUNCTION;
-  InstX8632Cdq &operator=(const InstX8632Cdq &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Cdq() {}
+  InstX8632Cbwdq(Cfg *Func, Variable *Dest, Operand *Source);
+  InstX8632Cbwdq(const InstX8632Cbwdq &) LLVM_DELETED_FUNCTION;
+  InstX8632Cbwdq &operator=(const InstX8632Cbwdq &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cbwdq() {}
 };
 
 // Conditional move instruction.

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 8e56a10..26d11b9 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp

@@ -1296,78 +1296,18 @@
         _movp(Dest, T4);
       } else {
         assert(Dest->getType() == IceType_v16i8);
-        // Sz_mul_v16i8
-        const IceString Helper = "Sz_mul_v16i8";
-        const SizeT MaxSrcs = 2;
-        InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-        Call->addArg(Src0);
-        Call->addArg(Src1);
-        lowerCall(Call);
+        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       }
     } break;
-    case InstArithmetic::Shl: {
-      // Sz_shl_v4i32, Sz_shl_v8i16, Sz_shl_v16i8
-      const IceString Helper = "Sz_shl_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Lshr: {
-      // Sz_lshr_v4i32, Sz_lshr_v8i16, Sz_lshr_v16i8
-      const IceString Helper = "Sz_lshr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Ashr: {
-      // Sz_ashr_v4i32, Sz_ashr_v8i16, Sz_ashr_v16i8
-      const IceString Helper = "Sz_ashr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Udiv: {
-      // Sz_udiv_v4i32, Sz_udiv_v8i16, Sz_udiv_v16i8
-      const IceString Helper = "Sz_udiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Sdiv: {
-      // Sz_sdiv_v4i32, Sz_sdiv_v8i16, Sz_sdiv_v16i8
-      const IceString Helper = "Sz_sdiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Urem: {
-      // Sz_urem_v4i32, Sz_urem_v8i16, Sz_urem_v16i8
-      const IceString Helper = "Sz_urem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Srem: {
-      // Sz_srem_v4i32, Sz_srem_v8i16, Sz_srem_v16i8
-      const IceString Helper = "Sz_srem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Srem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
     case InstArithmetic::Fadd: {
       Variable *T = makeReg(Dest->getType());
       _movp(T, Src0);
@@ -1392,13 +1332,9 @@
       _divps(T, LEGAL_HACK(Src1));
       _movp(Dest, T);
     } break;
-    case InstArithmetic::Frem: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall("Sz_frem_v4f32", Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Frem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
     }
 #undef LEGAL_HACK
   } else { // Dest->getType() is non-i64 scalar
@@ -1490,11 +1426,18 @@
       break;
     case InstArithmetic::Sdiv:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      T_edx = makeReg(IceType_i32, Reg_edx);
-      _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
-      _idiv(T, Src1, T_edx);
-      _mov(Dest, T);
+      if (Dest->getType() == IceType_i8) {
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        _idiv(T, Src1, T);
+        _mov(Dest, T);
+      } else {
+        T_edx = makeReg(IceType_i32, Reg_edx);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T_edx, T);
+        _idiv(T, Src1, T_edx);
+        _mov(Dest, T);
+      }
       break;
     case InstArithmetic::Urem:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
@@ -1515,11 +1458,20 @@
       break;
     case InstArithmetic::Srem:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      T_edx = makeReg(IceType_i32, Reg_edx);
-      _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
+      if (Dest->getType() == IceType_i8) {
+        Variable *T_ah = makeReg(IceType_i8, Reg_ah);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        Context.insert(InstFakeDef::create(Func, T_ah));
+        _idiv(T_ah, Src1, T);
+        _mov(Dest, T_ah);
+      } else {
+        T_edx = makeReg(IceType_i32, Reg_edx);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T_edx, T);
+        _idiv(T_edx, Src1, T);
+        _mov(Dest, T_edx);
+      }
       break;
     case InstArithmetic::Fadd:
       _mov(T, Src0);
@@ -3744,6 +3696,39 @@
   _br(Inst->getLabelDefault());
 }
 
+void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
+                                      Variable *Dest, Operand *Src0,
+                                      Operand *Src1) {
+  assert(isVectorType(Dest->getType()));
+  Type Ty = Dest->getType();
+  Type ElementTy = typeElementType(Ty);
+  SizeT NumElements = typeNumElements(Ty);
+
+  Operand *T = Ctx->getConstantUndef(Ty);
+  for (SizeT I = 0; I < NumElements; ++I) {
+    Constant *Index = Ctx->getConstantInt(IceType_i32, I);
+
+    // Extract the next two inputs.
+    Variable *Op0 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
+    Variable *Op1 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
+
+    // Perform the arithmetic as a scalar operation.
+    Variable *Res = Func->makeVariable(ElementTy, Context.getNode());
+    lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
+
+    // Insert the result into position.
+    Variable *DestT = Func->makeVariable(Ty, Context.getNode());
+    lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
+    T = DestT;
+    // TODO(stichnot): Use postLower() in -Om1 mode to avoid buildup of
+    // infinite weight temporaries.
+  }
+
+  lowerAssign(InstAssign::create(Func, Dest, T));
+}
+
 // The following pattern occurs often in lowered C and C++ code:
 //
 //   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1

diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 0c87bee..2b189ad 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h

@@ -122,6 +122,9 @@
 
   void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
 
+  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
+                           Operand *Src0, Operand *Src1);
+
   // Operand legalization helpers.  To deal with address mode
   // constraints, the helpers will create a new Operand and emit
   // instructions that guarantee that the Operand kind is one of those
@@ -220,8 +223,8 @@
   void _bswap(Variable *SrcDest) {
     Context.insert(InstX8632Bswap::create(Func, SrcDest));
   }
-  void _cdq(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Cdq::create(Func, Dest, Src0));
+  void _cbwdq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
   }
   void _cmov(Variable *Dest, Operand *Src0, InstX8632::BrCond Condition) {
     Context.insert(InstX8632Cmov::create(Func, Dest, Src0, Condition));