Subzero: Use scalar arithmetic when no vector instruction exists.

Implement scalarizeArithmetic() which extracts the components of the
input vectors, performs the operation with scalar instructions, and
builds the output vector component by component.

Fix the lowering of sdiv and srem.  These previously emitted the wrong
sign-extension instruction (cdq) for i8 and i16 inputs, which need cbw
and cwd respectively.

In the test_arith crosstest, mask the inputs to vector shift
operations to ensure that the shifts are in range.  Otherwise the
Subzero output is not identical to the llc output in some (undefined)
cases.

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/443203003
diff --git a/crosstest/test_arith.cpp b/crosstest/test_arith.cpp
index 6f8aec4..446ea04 100644
--- a/crosstest/test_arith.cpp
+++ b/crosstest/test_arith.cpp
@@ -18,7 +18,7 @@
 
 #include "test_arith.h"
 
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   bool test##inst(bool a, bool b) { return a op b; }                           \
   uint8_t test##inst(uint8_t a, uint8_t b) { return a op b; }                  \
   uint16_t test##inst(uint16_t a, uint16_t b) { return a op b; }               \
@@ -30,7 +30,7 @@
 UINTOP_TABLE
 #undef X
 
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   bool test##inst(bool a, bool b) { return a op b; }                           \
   myint8_t test##inst(myint8_t a, myint8_t b) { return a op b; }               \
   int16_t test##inst(int16_t a, int16_t b) { return a op b; }                  \
diff --git a/crosstest/test_arith.def b/crosstest/test_arith.def
index 9ebb4fd..2033805 100644
--- a/crosstest/test_arith.def
+++ b/crosstest/test_arith.def
@@ -17,29 +17,29 @@
 #define XSTR(s) STR(s)
 #define STR(s) #s
 
-#define UINTOP_TABLE \
-  /* inst, operator, div */ \
-  X(Add,   +,        0 )    \
-  X(Sub,   -,        0 )    \
-  X(Mul,   *,        0 )    \
-  X(Udiv,  /,        1 )    \
-  X(Urem,  %,        1 )    \
-  X(Shl,   <<,       0)     \
-  X(Lshr,  >>,       0)     \
-  X(And,   &,        0 )    \
-  X(Or,    |,        0 )    \
-  X(Xor,   ^,        0 )    \
-//#define X(inst, op, isdiv)
+#define UINTOP_TABLE                 \
+  /* inst, operator, div, shift */   \
+  X(Add,   +,        0,   0)         \
+  X(Sub,   -,        0,   0)         \
+  X(Mul,   *,        0,   0)         \
+  X(Udiv,  /,        1,   0)         \
+  X(Urem,  %,        1,   0)         \
+  X(Shl,   <<,       0,   1)         \
+  X(Lshr,  >>,       0,   1)         \
+  X(And,   &,        0,   0)         \
+  X(Or,    |,        0,   0)         \
+  X(Xor,   ^,        0,   0)         \
+//#define X(inst, op, isdiv, isshift)
 
-#define SINTOP_TABLE \
-  /* inst, operator, div */ \
-  X(Sdiv,  /,        1)     \
-  X(Srem,  %,        1)     \
-  X(Ashr,  >>,       0)     \
-//#define X(inst, op, isdiv)
+#define SINTOP_TABLE                 \
+  /* inst, operator, div, shift */   \
+  X(Sdiv,  /,        1,   0)         \
+  X(Srem,  %,        1,   0)         \
+  X(Ashr,  >>,       0,   1)         \
+//#define X(inst, op, isdiv, isshift)
 
 #define COMMA ,
-#define FPOP_TABLE \
+#define FPOP_TABLE           \
   /* inst, infix_op, func */ \
   X(Fadd,  +,              ) \
   X(Fsub,  -,              ) \
diff --git a/crosstest/test_arith.h b/crosstest/test_arith.h
index c9cd965..855c607 100644
--- a/crosstest/test_arith.h
+++ b/crosstest/test_arith.h
@@ -17,7 +17,7 @@
 
 #include "vectors.h"
 
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   bool test##inst(bool a, bool b);                                             \
   uint8_t test##inst(uint8_t a, uint8_t b);                                    \
   uint16_t test##inst(uint16_t a, uint16_t b);                                 \
@@ -29,7 +29,7 @@
 UINTOP_TABLE
 #undef X
 
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   bool test##inst(bool a, bool b);                                             \
   myint8_t test##inst(myint8_t a, myint8_t b);                                 \
   int16_t test##inst(int16_t a, int16_t b);                                    \
diff --git a/crosstest/test_arith_main.cpp b/crosstest/test_arith_main.cpp
index b032a5f..84bc6b7 100644
--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
@@ -61,12 +61,12 @@
     FuncTypeSigned FuncSzSigned;
     bool ExcludeDivExceptions; // for divide related tests
   } Funcs[] = {
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   { STR(inst), test##inst, Subzero_::test##inst, NULL, NULL, isdiv }           \
   ,
       UINTOP_TABLE
 #undef X
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   { STR(inst), NULL, NULL, test##inst, Subzero_::test##inst, isdiv }           \
   ,
       SINTOP_TABLE
@@ -172,17 +172,18 @@
     FuncTypeSigned FuncLlcSigned;
     FuncTypeSigned FuncSzSigned;
     bool ExcludeDivExceptions; // for divide related tests
+    bool MaskShiftOperations;  // for shift related tests
   } Funcs[] = {
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   {                                                                            \
-    STR(inst), test##inst, Subzero_::test##inst, NULL, NULL, isdiv             \
+    STR(inst), test##inst, Subzero_::test##inst, NULL, NULL, isdiv, isshift    \
   }                                                                            \
   ,
         UINTOP_TABLE
 #undef X
-#define X(inst, op, isdiv)                                                     \
+#define X(inst, op, isdiv, isshift)                                            \
   {                                                                            \
-    STR(inst), NULL, NULL, test##inst, Subzero_::test##inst, isdiv             \
+    STR(inst), NULL, NULL, test##inst, Subzero_::test##inst, isdiv, isshift    \
   }                                                                            \
   ,
         SINTOP_TABLE
@@ -201,6 +202,8 @@
         if (Funcs[f].ExcludeDivExceptions &&
             inputsMayTriggerException<ElementTypeSigned>(Element1, Element2))
           continue;
+        if (Funcs[f].MaskShiftOperations)
+          Element2 &= CHAR_BIT * sizeof(ElementTypeUnsigned) - 1;
         Value1[j] = Element1;
         Value2[j] = Element2;
         ++j;
@@ -360,37 +363,3 @@
   return Failures;
 }
 
-extern "C" {
-// Subzero helpers
-  v4si32 Sz_shl_v4i32(v4si32 a, v4si32 b) { return a << b; }
-  v4si32 Sz_ashr_v4i32(v4si32 a, v4si32 b) { return a >> b; }
-  v4ui32 Sz_lshr_v4i32(v4ui32 a, v4ui32 b) { return a >> b; }
-  v4si32 Sz_sdiv_v4i32(v4si32 a, v4si32 b) { return a / b; }
-  v4ui32 Sz_udiv_v4i32(v4ui32 a, v4ui32 b) { return a / b; }
-  v4si32 Sz_srem_v4i32(v4si32 a, v4si32 b) { return a % b; }
-  v4ui32 Sz_urem_v4i32(v4ui32 a, v4ui32 b) { return a % b; }
-
-  v8si16 Sz_shl_v8i16(v8si16 a, v8si16 b) { return a << b; }
-  v8si16 Sz_ashr_v8i16(v8si16 a, v8si16 b) { return a >> b; }
-  v8ui16 Sz_lshr_v8i16(v8ui16 a, v8ui16 b) { return a >> b; }
-  v8si16 Sz_sdiv_v8i16(v8si16 a, v8si16 b) { return a / b; }
-  v8ui16 Sz_udiv_v8i16(v8ui16 a, v8ui16 b) { return a / b; }
-  v8si16 Sz_srem_v8i16(v8si16 a, v8si16 b) { return a % b; }
-  v8ui16 Sz_urem_v8i16(v8ui16 a, v8ui16 b) { return a % b; }
-
-  v16ui8 Sz_mul_v16i8(v16ui8 a, v16ui8 b) { return a * b; }
-  v16si8 Sz_shl_v16i8(v16si8 a, v16si8 b) { return a << b; }
-  v16si8 Sz_ashr_v16i8(v16si8 a, v16si8 b) { return a >> b; }
-  v16ui8 Sz_lshr_v16i8(v16ui8 a, v16ui8 b) { return a >> b; }
-  v16si8 Sz_sdiv_v16i8(v16si8 a, v16si8 b) { return a / b; }
-  v16ui8 Sz_udiv_v16i8(v16ui8 a, v16ui8 b) { return a / b; }
-  v16si8 Sz_srem_v16i8(v16si8 a, v16si8 b) { return a % b; }
-  v16ui8 Sz_urem_v16i8(v16ui8 a, v16ui8 b) { return a % b; }
-
-  v4f32 Sz_frem_v4f32(v4f32 a, v4f32 b) {
-    v4f32 Result;
-    for (int i = 0; i < 4; ++i)
-      Result[i] = fmodf(a[i], b[i]);
-    return Result;
-  }
-}
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index cf99e844..457f56e 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -136,11 +136,8 @@
   addSource(CallTarget);
 }
 
-InstX8632Cdq::InstX8632Cdq(Cfg *Func, Variable *Dest, Operand *Source)
-    : InstX8632(Func, InstX8632::Cdq, 1, Dest) {
-  assert(Dest->getRegNum() == TargetX8632::Reg_edx);
-  assert(llvm::isa<Variable>(Source));
-  assert(llvm::dyn_cast<Variable>(Source)->getRegNum() == TargetX8632::Reg_eax);
+InstX8632Cbwdq::InstX8632Cbwdq(Cfg *Func, Variable *Dest, Operand *Source)
+    : InstX8632(Func, InstX8632::Cbwdq, 1, Dest) {
   addSource(Source);
 }
 
@@ -721,16 +718,35 @@
   dumpSources(Func);
 }
 
-void InstX8632Cdq::emit(const Cfg *Func) const {
+void InstX8632Cbwdq::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
-  Str << "\tcdq\n";
+  Operand *Src0 = getSrc(0);
+  assert(llvm::isa<Variable>(Src0));
+  assert(llvm::cast<Variable>(Src0)->getRegNum() == TargetX8632::Reg_eax);
+  switch (Src0->getType()) {
+  default:
+    llvm_unreachable("unexpected source type!");
+    break;
+  case IceType_i8:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_eax);
+    Str << "\tcbw\n";
+    break;
+  case IceType_i16:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_edx);
+    Str << "\tcwd\n";
+    break;
+  case IceType_i32:
+    assert(getDest()->getRegNum() == TargetX8632::Reg_edx);
+    Str << "\tcdq\n";
+    break;
+  }
 }
 
-void InstX8632Cdq::dump(const Cfg *Func) const {
+void InstX8632Cbwdq::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
-  Str << " = cdq." << getSrc(0)->getType() << " ";
+  Str << " = cbw/cwd/cdq." << getSrc(0)->getType() << " ";
   dumpSources(Func);
 }
 
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 18f0dde..1aa7909 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -144,7 +144,7 @@
     Bsr,
     Bswap,
     Call,
-    Cdq,
+    Cbwdq,
     Cmov,
     Cmpps,
     Cmpxchg,
@@ -689,22 +689,22 @@
   virtual ~InstX8632Shrd() {}
 };
 
-// Cdq instruction - sign-extend eax into edx
-class InstX8632Cdq : public InstX8632 {
+// Cbwdq instruction - wrapper for cbw, cwd, or cdq
+class InstX8632Cbwdq : public InstX8632 {
 public:
-  static InstX8632Cdq *create(Cfg *Func, Variable *Dest, Operand *Source) {
-    return new (Func->allocate<InstX8632Cdq>())
-        InstX8632Cdq(Func, Dest, Source);
+  static InstX8632Cbwdq *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX8632Cbwdq>())
+        InstX8632Cbwdq(Func, Dest, Source);
   }
   virtual void emit(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Cdq); }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cbwdq); }
 
 private:
-  InstX8632Cdq(Cfg *Func, Variable *Dest, Operand *Source);
-  InstX8632Cdq(const InstX8632Cdq &) LLVM_DELETED_FUNCTION;
-  InstX8632Cdq &operator=(const InstX8632Cdq &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Cdq() {}
+  InstX8632Cbwdq(Cfg *Func, Variable *Dest, Operand *Source);
+  InstX8632Cbwdq(const InstX8632Cbwdq &) LLVM_DELETED_FUNCTION;
+  InstX8632Cbwdq &operator=(const InstX8632Cbwdq &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Cbwdq() {}
 };
 
 // Conditional move instruction.
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 8e56a10..26d11b9 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1296,78 +1296,18 @@
         _movp(Dest, T4);
       } else {
         assert(Dest->getType() == IceType_v16i8);
-        // Sz_mul_v16i8
-        const IceString Helper = "Sz_mul_v16i8";
-        const SizeT MaxSrcs = 2;
-        InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-        Call->addArg(Src0);
-        Call->addArg(Src1);
-        lowerCall(Call);
+        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
       }
     } break;
-    case InstArithmetic::Shl: {
-      // Sz_shl_v4i32, Sz_shl_v8i16, Sz_shl_v16i8
-      const IceString Helper = "Sz_shl_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Lshr: {
-      // Sz_lshr_v4i32, Sz_lshr_v8i16, Sz_lshr_v16i8
-      const IceString Helper = "Sz_lshr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Ashr: {
-      // Sz_ashr_v4i32, Sz_ashr_v8i16, Sz_ashr_v16i8
-      const IceString Helper = "Sz_ashr_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Udiv: {
-      // Sz_udiv_v4i32, Sz_udiv_v8i16, Sz_udiv_v16i8
-      const IceString Helper = "Sz_udiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Sdiv: {
-      // Sz_sdiv_v4i32, Sz_sdiv_v8i16, Sz_sdiv_v16i8
-      const IceString Helper = "Sz_sdiv_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Urem: {
-      // Sz_urem_v4i32, Sz_urem_v8i16, Sz_urem_v16i8
-      const IceString Helper = "Sz_urem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
-    case InstArithmetic::Srem: {
-      // Sz_srem_v4i32, Sz_srem_v8i16, Sz_srem_v16i8
-      const IceString Helper = "Sz_srem_" + typeIdentString(Dest->getType());
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Shl:
+    case InstArithmetic::Lshr:
+    case InstArithmetic::Ashr:
+    case InstArithmetic::Udiv:
+    case InstArithmetic::Urem:
+    case InstArithmetic::Sdiv:
+    case InstArithmetic::Srem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
     case InstArithmetic::Fadd: {
       Variable *T = makeReg(Dest->getType());
       _movp(T, Src0);
@@ -1392,13 +1332,9 @@
       _divps(T, LEGAL_HACK(Src1));
       _movp(Dest, T);
     } break;
-    case InstArithmetic::Frem: {
-      const SizeT MaxSrcs = 2;
-      InstCall *Call = makeHelperCall("Sz_frem_v4f32", Dest, MaxSrcs);
-      Call->addArg(Src0);
-      Call->addArg(Src1);
-      lowerCall(Call);
-    } break;
+    case InstArithmetic::Frem:
+      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      break;
     }
 #undef LEGAL_HACK
   } else { // Dest->getType() is non-i64 scalar
@@ -1490,11 +1426,18 @@
       break;
     case InstArithmetic::Sdiv:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      T_edx = makeReg(IceType_i32, Reg_edx);
-      _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
-      _idiv(T, Src1, T_edx);
-      _mov(Dest, T);
+      if (Dest->getType() == IceType_i8) {
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        _idiv(T, Src1, T);
+        _mov(Dest, T);
+      } else {
+        T_edx = makeReg(IceType_i32, Reg_edx);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T_edx, T);
+        _idiv(T, Src1, T_edx);
+        _mov(Dest, T);
+      }
       break;
     case InstArithmetic::Urem:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
@@ -1515,11 +1458,20 @@
       break;
     case InstArithmetic::Srem:
       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
-      T_edx = makeReg(IceType_i32, Reg_edx);
-      _mov(T, Src0, Reg_eax);
-      _cdq(T_edx, T);
-      _idiv(T_edx, Src1, T);
-      _mov(Dest, T_edx);
+      if (Dest->getType() == IceType_i8) {
+        Variable *T_ah = makeReg(IceType_i8, Reg_ah);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T, T);
+        Context.insert(InstFakeDef::create(Func, T_ah));
+        _idiv(T_ah, Src1, T);
+        _mov(Dest, T_ah);
+      } else {
+        T_edx = makeReg(IceType_i32, Reg_edx);
+        _mov(T, Src0, Reg_eax);
+        _cbwdq(T_edx, T);
+        _idiv(T_edx, Src1, T);
+        _mov(Dest, T_edx);
+      }
       break;
     case InstArithmetic::Fadd:
       _mov(T, Src0);
@@ -3744,6 +3696,39 @@
   _br(Inst->getLabelDefault());
 }
 
+void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
+                                      Variable *Dest, Operand *Src0,
+                                      Operand *Src1) {
+  assert(isVectorType(Dest->getType()));
+  Type Ty = Dest->getType();
+  Type ElementTy = typeElementType(Ty);
+  SizeT NumElements = typeNumElements(Ty);
+
+  Operand *T = Ctx->getConstantUndef(Ty);
+  for (SizeT I = 0; I < NumElements; ++I) {
+    Constant *Index = Ctx->getConstantInt(IceType_i32, I);
+
+    // Extract the next two inputs.
+    Variable *Op0 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
+    Variable *Op1 = Func->makeVariable(ElementTy, Context.getNode());
+    lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
+
+    // Perform the arithmetic as a scalar operation.
+    Variable *Res = Func->makeVariable(ElementTy, Context.getNode());
+    lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
+
+    // Insert the result into position.
+    Variable *DestT = Func->makeVariable(Ty, Context.getNode());
+    lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
+    T = DestT;
+    // TODO(stichnot): Use postLower() in -Om1 mode to avoid buildup of
+    // infinite weight temporaries.
+  }
+
+  lowerAssign(InstAssign::create(Func, Dest, T));
+}
+
 // The following pattern occurs often in lowered C and C++ code:
 //
 //   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 0c87bee..2b189ad 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -122,6 +122,9 @@
 
   void eliminateNextVectorSextInstruction(Variable *SignExtendedResult);
 
+  void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
+                           Operand *Src0, Operand *Src1);
+
   // Operand legalization helpers.  To deal with address mode
   // constraints, the helpers will create a new Operand and emit
   // instructions that guarantee that the Operand kind is one of those
@@ -220,8 +223,8 @@
   void _bswap(Variable *SrcDest) {
     Context.insert(InstX8632Bswap::create(Func, SrcDest));
   }
-  void _cdq(Variable *Dest, Operand *Src0) {
-    Context.insert(InstX8632Cdq::create(Func, Dest, Src0));
+  void _cbwdq(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
   }
   void _cmov(Variable *Dest, Operand *Src0, InstX8632::BrCond Condition) {
     Context.insert(InstX8632Cmov::create(Func, Dest, Src0, Condition));
diff --git a/tests_lit/llvm2ice_tests/sdiv.ll b/tests_lit/llvm2ice_tests/sdiv.ll
new file mode 100644
index 0000000..ec43d6a
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/sdiv.ll
@@ -0,0 +1,82 @@
+; This checks the correctness of the lowering code for the small
+; integer variants of sdiv and srem.
+
+; RUN: %llvm2ice --verbose none %s | FileCheck  %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck  %s
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+define i32 @sdiv_i8(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i8
+  %b = trunc i32 %b.i32 to i8
+  %res = sdiv i8 %a, %b
+  %res.i32 = zext i8 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: sdiv_i8:
+; CHECK: cbw
+; CHECK: idiv
+}
+
+define i32 @sdiv_i16(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i16
+  %b = trunc i32 %b.i32 to i16
+  %res = sdiv i16 %a, %b
+  %res.i32 = zext i16 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: sdiv_i16:
+; CHECK: cwd
+; CHECK: idiv
+}
+
+define i32 @sdiv_i32(i32 %a, i32 %b) {
+entry:
+  %res = sdiv i32 %a, %b
+  ret i32 %res
+; CHECK-LABEL: sdiv_i32:
+; CHECK: cdq
+; CHECK: idiv
+}
+
+define i32 @srem_i8(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i8
+  %b = trunc i32 %b.i32 to i8
+  %res = srem i8 %a, %b
+  %res.i32 = zext i8 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: srem_i8:
+; CHECK: cbw
+; CHECK: idiv
+}
+
+define i32 @srem_i16(i32 %a.i32, i32 %b.i32) {
+entry:
+  %a = trunc i32 %a.i32 to i16
+  %b = trunc i32 %b.i32 to i16
+  %res = srem i16 %a, %b
+  %res.i32 = zext i16 %res to i32
+  ret i32 %res.i32
+; CHECK-LABEL: srem_i16:
+; CHECK: cwd
+; CHECK: idiv
+}
+
+define i32 @srem_i32(i32 %a, i32 %b) {
+entry:
+  %res = srem i32 %a, %b
+  ret i32 %res
+; CHECK-LABEL: srem_i32:
+; CHECK: cdq
+; CHECK: idiv
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
diff --git a/tests_lit/llvm2ice_tests/vector-arith.ll b/tests_lit/llvm2ice_tests/vector-arith.ll
index 94acfe0..e093f97 100644
--- a/tests_lit/llvm2ice_tests/vector-arith.ll
+++ b/tests_lit/llvm2ice_tests/vector-arith.ll
@@ -56,7 +56,10 @@
   %res = frem <4 x float> %arg0, %arg1
   ret <4 x float> %res
 ; CHECK-LABEL: test_frem:
-; CHECK: Sz_frem_v4f32
+; CHECK: fmodf
+; CHECK: fmodf
+; CHECK: fmodf
+; CHECK: fmodf
 }
 
 define <16 x i8> @test_add_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -104,7 +107,22 @@
   %res = mul <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_mul_v16i8:
-; CHECK: Sz_mul_v16i8
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
+; CHECK: imul
 }
 
 define <16 x i8> @test_shl_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -112,7 +130,22 @@
   %res = shl <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_shl_v16i8:
-; CHECK: Sz_shl_v16i8
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
 }
 
 define <16 x i8> @test_lshr_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -120,7 +153,22 @@
   %res = lshr <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_lshr_v16i8:
-; CHECK: Sz_lshr_v16i8
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
 }
 
 define <16 x i8> @test_ashr_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -128,7 +176,22 @@
   %res = ashr <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_ashr_v16i8:
-; CHECK: Sz_ashr_v16i8
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
 }
 
 define <16 x i8> @test_udiv_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -136,7 +199,22 @@
   %res = udiv <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_udiv_v16i8:
-; CHECK: Sz_udiv_v16i8
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }
 
 define <16 x i8> @test_sdiv_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -144,7 +222,22 @@
   %res = sdiv <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_sdiv_v16i8:
-; CHECK: Sz_sdiv_v16i8
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }
 
 define <16 x i8> @test_urem_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -152,7 +245,22 @@
   %res = urem <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_urem_v16i8:
-; CHECK: Sz_urem_v16i8
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }
 
 define <16 x i8> @test_srem_v16i8(<16 x i8> %arg0, <16 x i8> %arg1) {
@@ -160,7 +268,22 @@
   %res = srem <16 x i8> %arg0, %arg1
   ret <16 x i8> %res
 ; CHECK-LABEL: test_srem_v16i8:
-; CHECK: Sz_srem_v16i8
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }
 
 define <8 x i16> @test_add_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -216,7 +339,14 @@
   %res = shl <8 x i16> %arg0, %arg1
   ret <8 x i16> %res
 ; CHECK-LABEL: test_shl_v8i16:
-; CHECK: Sz_shl_v8i16
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
 }
 
 define <8 x i16> @test_lshr_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -224,7 +354,14 @@
   %res = lshr <8 x i16> %arg0, %arg1
   ret <8 x i16> %res
 ; CHECK-LABEL: test_lshr_v8i16:
-; CHECK: Sz_lshr_v8i16
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
 }
 
 define <8 x i16> @test_ashr_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -232,7 +369,14 @@
   %res = ashr <8 x i16> %arg0, %arg1
   ret <8 x i16> %res
 ; CHECK-LABEL: test_ashr_v8i16:
-; CHECK: Sz_ashr_v8i16
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
 }
 
 define <8 x i16> @test_udiv_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -240,7 +384,14 @@
   %res = udiv <8 x i16> %arg0, %arg1
   ret <8 x i16> %res
 ; CHECK-LABEL: test_udiv_v8i16:
-; CHECK: Sz_udiv_v8i16
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }
 
 define <8 x i16> @test_sdiv_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -248,7 +399,14 @@
   %res = sdiv <8 x i16> %arg0, %arg1
   ret <8 x i16> %res
 ; CHECK-LABEL: test_sdiv_v8i16:
-; CHECK: Sz_sdiv_v8i16
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }
 
 define <8 x i16> @test_urem_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -256,7 +414,14 @@
   %res = urem <8 x i16> %arg0, %arg1
   ret <8 x i16> %res
 ; CHECK-LABEL: test_urem_v8i16:
-; CHECK: Sz_urem_v8i16
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }
 
 define <8 x i16> @test_srem_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
@@ -264,7 +429,14 @@
   %res = srem <8 x i16> %arg0, %arg1
   ret <8 x i16> %res
 ; CHECK-LABEL: test_srem_v8i16:
-; CHECK: Sz_srem_v8i16
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }
 
 define <4 x i32> @test_add_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -324,7 +496,10 @@
   %res = shl <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
 ; CHECK-LABEL: test_shl_v4i32:
-; CHECK: Sz_shl_v4i32
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
+; CHECK: shl
 
 ; This line is to ensure that pmulld is generated in test_mul_v4i32 above.
 ; SSE41-LABEL: test_shl_v4i32:
@@ -335,7 +510,10 @@
   %res = lshr <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
 ; CHECK-LABEL: test_lshr_v4i32:
-; CHECK: Sz_lshr_v4i32
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
+; CHECK: shr
 }
 
 define <4 x i32> @test_ashr_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -343,7 +521,10 @@
   %res = ashr <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
 ; CHECK-LABEL: test_ashr_v4i32:
-; CHECK: Sz_ashr_v4i32
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
+; CHECK: sar
 }
 
 define <4 x i32> @test_udiv_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -351,7 +532,10 @@
   %res = udiv <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
 ; CHECK-LABEL: test_udiv_v4i32:
-; CHECK: Sz_udiv_v4i32
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }
 
 define <4 x i32> @test_sdiv_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -359,7 +543,10 @@
   %res = sdiv <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
 ; CHECK-LABEL: test_sdiv_v4i32:
-; CHECK: Sz_sdiv_v4i32
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }
 
 define <4 x i32> @test_urem_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -367,7 +554,10 @@
   %res = urem <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
 ; CHECK-LABEL: test_urem_v4i32:
-; CHECK: Sz_urem_v4i32
+; CHECK: div
+; CHECK: div
+; CHECK: div
+; CHECK: div
 }
 
 define <4 x i32> @test_srem_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
@@ -375,7 +565,10 @@
   %res = srem <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
 ; CHECK-LABEL: test_srem_v4i32:
-; CHECK: Sz_srem_v4i32
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
+; CHECK: idiv
 }
 
 ; ERRORS-NOT: ICE translation error