Add x86 vector packing instructions.

BUG=swiftshader:15

Change-Id: I0d40fab6287130143693e8e4752859b7142a503d
Reviewed-on: https://chromium-review.googlesource.com/394007
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index c9baf23..be0ff00 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -437,6 +437,10 @@
               const Immediate &mask);
   void punpckl(Type Ty, XmmRegister Dst, XmmRegister Src);
   void punpckl(Type Ty, XmmRegister Dst, const Address &Src);
+  void packss(Type Ty, XmmRegister Dst, XmmRegister Src);
+  void packss(Type Ty, XmmRegister Dst, const Address &Src);
+  void packus(Type Ty, XmmRegister Dst, XmmRegister Src);
+  void packus(Type Ty, XmmRegister Dst, const Address &Src);
   void shufps(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
   void shufps(Type Ty, XmmRegister dst, const Address &src,
               const Immediate &mask);
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index e5819ce..11da946 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -1628,6 +1628,78 @@
 }
 
 template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::packss(Type Ty, XmmRegister Dst,
+                                          XmmRegister Src) { // xmm, xmm form
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // prefix byte, emitted ahead of REX and the 0x0F escape
+  emitRexRB(RexTypeIrrelevant, Dst, Src);
+  emitUint8(0x0F);
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x6B); // packssdw (NOTE(review): v4f32 PackString is blank)
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x63); // packsswb
+  } else {
+    assert(false && "Unexpected vector pack operand type");
+  }
+  emitXmmRegisterOperand(Dst, Src);
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::packss(Type Ty, XmmRegister Dst,
+                                          const Address &Src) { // xmm, mem form
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // prefix byte, emitted ahead of REX and the 0x0F escape
+  emitAddrSizeOverridePrefix();
+  emitRex(RexTypeIrrelevant, Src, Dst);
+  emitUint8(0x0F);
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x6B); // packssdw (NOTE(review): v4f32 PackString is blank)
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x63); // packsswb
+  } else {
+    assert(false && "Unexpected vector pack operand type");
+  }
+  emitOperand(gprEncoding(Dst), Src);
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::packus(Type Ty, XmmRegister Dst,
+                                          XmmRegister Src) { // xmm, xmm form
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // prefix byte, emitted ahead of REX and the 0x0F escape
+  emitRexRB(RexTypeIrrelevant, Dst, Src);
+  emitUint8(0x0F);
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x38); // packusdw uses the three-byte opcode 0F 38 2B
+    emitUint8(0x2B); // NOTE(review): packusdw is SSE4.1 - confirm gating
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x67); // packuswb
+  } else {
+    assert(false && "Unexpected vector pack operand type");
+  }
+  emitXmmRegisterOperand(Dst, Src);
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::packus(Type Ty, XmmRegister Dst,
+                                          const Address &Src) { // xmm, mem form
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // prefix byte, emitted ahead of REX and the 0x0F escape
+  emitAddrSizeOverridePrefix();
+  emitRex(RexTypeIrrelevant, Src, Dst);
+  emitUint8(0x0F);
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
+    emitUint8(0x38); // packusdw uses the three-byte opcode 0F 38 2B
+    emitUint8(0x2B); // NOTE(review): packusdw is SSE4.1 - confirm gating
+  } else if (Ty == IceType_v8i16) {
+    emitUint8(0x67); // packuswb
+  } else {
+    assert(false && "Unexpected vector pack operand type");
+  }
+  emitOperand(gprEncoding(Dst), Src);
+}
+
+template <typename TraitsType>
 void AssemblerX86Base<TraitsType>::shufps(Type /* Ty */, XmmRegister dst,
                                           XmmRegister src,
                                           const Immediate &imm) {
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index a0ff546..00a455c 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -50,8 +50,8 @@
 
 const TargetX8632Traits::TypeAttributesType
     TargetX8632Traits::TypeAttributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
-  { cvt, sdss, pdps, spsd, pack, unpack, width, fld }                          \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
+  { cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld }                    \
   ,
         ICETYPEX8632_TABLE
 #undef X
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index 173164f..c71d6df 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -212,22 +212,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8632_TABLE                                                     \
-  /* tag,  element type, cvt , sdss, pdps, spsd, pack, unpack, width, fld */   \
-  X(void,  void,         "?",  "",   "",   "",   "",   "",     "",    "")      \
-  X(i1,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
-  X(i8,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
-  X(i16,   void,         "si", "",   "",   "",   "",   "",     "w",   "")      \
-  X(i32,   void,         "si", "",   "",   "",   "",   "",     "l",   "")      \
-  X(i64,   void,         "si", "",   "",   "",   "",   "",     "q",   "")      \
-  X(f32,   void,         "ss", "ss", "ps", "ss", "d",  "",     "",    "s")     \
-  X(f64,   void,         "sd", "sd", "pd", "sd", "q",  "",     "",    "l")     \
-  X(v4i1,  i32,          "?",  "",   "",   "",   "d",  "dq",   "",    "")      \
-  X(v8i1,  i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
-  X(v16i1, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
-  X(v16i8, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
-  X(v8i16, i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
-  X(v4i32, i32,          "dq", "",   "",   "",   "d",  "dq",   "",    "")      \
-  X(v4f32, f32,          "ps", "",   "ps", "ps", "d",  "dq",   "",    "")
-//#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)
+  /* tag,  elty, cvt,  sdss, pdps, spsd, int_, unpack, pack, width, fld */     \
+  X(void,  void, "?",  "",   "",   "",   "",   "",     "",   "",    "")        \
+  X(i1,    void, "si", "",   "",   "",   "",   "",     "",   "b",   "")        \
+  X(i8,    void, "si", "",   "",   "",   "",   "",     "",   "b",   "")        \
+  X(i16,   void, "si", "",   "",   "",   "",   "",     "",   "w",   "")        \
+  X(i32,   void, "si", "",   "",   "",   "",   "",     "",   "l",   "")        \
+  X(i64,   void, "si", "",   "",   "",   "",   "",     "",   "q",   "")        \
+  X(f32,   void, "ss", "ss", "ps", "ss", "d",  "",     "",   "",    "s")       \
+  X(f64,   void, "sd", "sd", "pd", "sd", "q",  "",     "",   "",    "l")       \
+  X(v4i1,  i32,  "?",  "",   "",   "",   "d",  "dq",   "",   "",    "")        \
+  X(v8i1,  i16,  "?",  "",   "",   "",   "w",  "wd",   "",   "",    "")        \
+  X(v16i1, i8,   "?",  "",   "",   "",   "b",  "bw",   "",   "",    "")        \
+  X(v16i8, i8,   "?",  "",   "",   "",   "b",  "bw",   "",   "",    "")        \
+  X(v8i16, i16,  "?",  "",   "",   "",   "w",  "wd",   "wb", "",    "")        \
+  X(v4i32, i32,  "dq", "",   "",   "",   "d",  "dq",   "dw", "",    "")        \
+  X(v4f32, f32,  "ps", "",   "ps", "ps", "d",  "dq",   "",   "",    "")
+//#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
diff --git a/src/IceInstX8664.cpp b/src/IceInstX8664.cpp
index d83c47e..d44f35d 100644
--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -51,8 +51,8 @@
 
 const TargetX8664Traits::TypeAttributesType
     TargetX8664Traits::TypeAttributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
-  { cvt, sdss, pdps, spsd, pack, unpack, width, fld }                          \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
+  { cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld }                    \
   ,
         ICETYPEX8664_TABLE
 #undef X
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index caee1ad..db97dbf 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -295,22 +295,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8664_TABLE                                                     \
-  /* tag,  element type, cvt , sdss, pdps, spsd, pack, unpack, width, fld */   \
-  X(void,  void,         "?",  "",   "",   "",   "",   "",     "",    "")      \
-  X(i1,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
-  X(i8,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
-  X(i16,   void,         "si", "",   "",   "",   "",   "",     "w",   "")      \
-  X(i32,   void,         "si", "",   "",   "",   "",   "",     "l",   "")      \
-  X(i64,   void,         "si", "",   "",   "",   "",   "",     "q",   "")      \
-  X(f32,   void,         "ss", "ss", "ps", "ss", "d",  "",     "",    "s")     \
-  X(f64,   void,         "sd", "sd", "pd", "sd", "q",  "",     "",    "l")     \
-  X(v4i1,  i32,          "?",  "",   "",   "",   "d",  "dq",   "",    "")      \
-  X(v8i1,  i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
-  X(v16i1, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
-  X(v16i8, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
-  X(v8i16, i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
-  X(v4i32, i32,          "dq", "",   "",   "",   "d",  "dq",   "",    "")      \
-  X(v4f32, f32,          "ps", "",   "ps", "ps", "d",  "dq",   "",    "")
-//#define X(tag, elementty, cvt, sdss, pdps, pack, unpack, width, fld)
+  /* tag,  elty, cvt,  sdss, pdps, spsd, int_, unpack, pack, width, fld */     \
+  X(void,  void, "?",  "",   "",   "",   "",   "",     "",   "",    "")        \
+  X(i1,    void, "si", "",   "",   "",   "",   "",     "",   "b",   "")        \
+  X(i8,    void, "si", "",   "",   "",   "",   "",     "",   "b",   "")        \
+  X(i16,   void, "si", "",   "",   "",   "",   "",     "",   "w",   "")        \
+  X(i32,   void, "si", "",   "",   "",   "",   "",     "",   "l",   "")        \
+  X(i64,   void, "si", "",   "",   "",   "",   "",     "",   "q",   "")        \
+  X(f32,   void, "ss", "ss", "ps", "ss", "d",  "",     "",   "",    "s")       \
+  X(f64,   void, "sd", "sd", "pd", "sd", "q",  "",     "",   "",    "l")       \
+  X(v4i1,  i32,  "?",  "",   "",   "",   "d",  "dq",   "",   "",    "")        \
+  X(v8i1,  i16,  "?",  "",   "",   "",   "w",  "wd",   "",   "",    "")        \
+  X(v16i1, i8,   "?",  "",   "",   "",   "b",  "bw",   "",   "",    "")        \
+  X(v16i8, i8,   "?",  "",   "",   "",   "b",  "bw",   "",   "",    "")        \
+  X(v8i16, i16,  "?",  "",   "",   "",   "w",  "wd",   "wb", "",    "")        \
+  X(v4i32, i32,  "dq", "",   "",   "",   "d",  "dq",   "dw", "",    "")        \
+  X(v4f32, f32,  "ps", "",   "ps", "ps", "d",  "dq",   "",   "",    "")
+//#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8664_DEF
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 1ce6d6f..7e96de5 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -145,6 +145,8 @@
       Pshufb,
       Pshufd,
       Punpckl,
+      Packss,
+      Packus,
       Psll,
       Psra,
       Psrl,
@@ -186,7 +188,7 @@
       IacaEnd
     };
 
-    enum SseSuffix { None, Packed, Unpack, Scalar, Integral };
+    enum SseSuffix { None, Packed, Unpack, Scalar, Integral, Pack };
 
     static const char *getWidthString(Type Ty);
     static const char *getFldString(Type Ty);
@@ -878,6 +880,9 @@
         SuffixString = Traits::TypeAttributes[DestTy].SdSsString;
         break;
       case InstX86Base::SseSuffix::Integral:
+        SuffixString = Traits::TypeAttributes[DestTy].IntegralString;
+        break;
+      case InstX86Base::SseSuffix::Pack:
         SuffixString = Traits::TypeAttributes[DestTy].PackString;
         break;
       }
@@ -934,7 +939,7 @@
       // Shift operations are always integral, and hence always need a suffix.
       const Type DestTy = this->getDest()->getType();
       this->emitTwoAddress(Func, this->Opcode,
-                           Traits::TypeAttributes[DestTy].PackString);
+                           Traits::TypeAttributes[DestTy].IntegralString);
     }
     void emitIAS(const Cfg *Func) const override {
       this->validateVectorAddrMode();
@@ -2927,6 +2932,38 @@
                                                               Source) {}
   };
 
+  class InstX86Packss
+      : public InstX86BaseBinopXmm<InstX86Base::Packss, false,
+                                   InstX86Base::SseSuffix::Pack> {
+  public:
+    static InstX86Packss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+      return new (Func->allocate<InstX86Packss>())
+          InstX86Packss(Func, Dest, Source);
+    }
+
+  private:
+    InstX86Packss(Cfg *Func, Variable *Dest, Operand *Source)
+        : InstX86BaseBinopXmm<InstX86Base::Packss, false,
+                              InstX86Base::SseSuffix::Pack>(Func, Dest,
+                                                            Source) {}
+  };
+
+  class InstX86Packus
+      : public InstX86BaseBinopXmm<InstX86Base::Packus, false,
+                                   InstX86Base::SseSuffix::Pack> {
+  public:
+    static InstX86Packus *create(Cfg *Func, Variable *Dest, Operand *Source) {
+      return new (Func->allocate<InstX86Packus>())
+          InstX86Packus(Func, Dest, Source);
+    }
+
+  private:
+    InstX86Packus(Cfg *Func, Variable *Dest, Operand *Source)
+        : InstX86BaseBinopXmm<InstX86Base::Packus, false,
+                              InstX86Base::SseSuffix::Pack>(Func, Dest,
+                                                            Source) {}
+  };
+
 }; // struct InstImpl
 
 /// struct Insts is a template that can be used to instantiate all the X86
@@ -3052,6 +3089,8 @@
 
   using Pshufb = typename InstImpl<TraitsType>::InstX86Pshufb;
   using Punpckl = typename InstImpl<TraitsType>::InstX86Punpckl;
+  using Packss = typename InstImpl<TraitsType>::InstX86Packss;
+  using Packus = typename InstImpl<TraitsType>::InstX86Packus;
 };
 
 /// X86 Instructions have static data (particularly, opcodes and instruction
@@ -3287,6 +3326,12 @@
   template <>                                                                  \
   template <>                                                                  \
   const char *InstImpl<TraitsType>::InstX86Punpckl::Base::Opcode = "punpckl";  \
+  template <>                                                                  \
+  template <>                                                                  \
+  const char *InstImpl<TraitsType>::InstX86Packss::Base::Opcode = "packss";    \
+  template <>                                                                  \
+  template <>                                                                  \
+  const char *InstImpl<TraitsType>::InstX86Packus::Base::Opcode = "packus";    \
   /* Inplace GPR ops */                                                        \
   template <>                                                                  \
   template <>                                                                  \
@@ -3660,6 +3705,18 @@
       InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = {                  \
           &InstImpl<TraitsType>::Assembler::punpckl,                           \
           &InstImpl<TraitsType>::Assembler::punpckl};                          \
+  template <>                                                                  \
+  template <>                                                                  \
+  const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
+      InstImpl<TraitsType>::InstX86Packss::Base::Emitter = {                   \
+          &InstImpl<TraitsType>::Assembler::packss,                            \
+          &InstImpl<TraitsType>::Assembler::packss};                           \
+  template <>                                                                  \
+  template <>                                                                  \
+  const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
+      InstImpl<TraitsType>::InstX86Packus::Base::Emitter = {                   \
+          &InstImpl<TraitsType>::Assembler::packus,                            \
+          &InstImpl<TraitsType>::Assembler::packus};                           \
   }                                                                            \
   }
 
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 1bc2a0c..5c6c005 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -2607,7 +2607,8 @@
   assert(this->getSrcSize() == 2);
   // pextrb and pextrd are SSE4.1 instructions.
   Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getSrc(0)->getType()].PackString << "\t";
+      << Traits::TypeAttributes[this->getSrc(0)->getType()].IntegralString
+      << "\t";
   this->getSrc(1)->emit(Func);
   Str << ", ";
   this->getSrc(0)->emit(Func);
@@ -2646,7 +2647,8 @@
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(this->getSrcSize() == 3);
   Str << "\t" << this->Opcode
-      << Traits::TypeAttributes[this->getDest()->getType()].PackString << "\t";
+      << Traits::TypeAttributes[this->getDest()->getType()].IntegralString
+      << "\t";
   this->getSrc(2)->emit(Func);
   Str << ", ";
   Operand *Src1 = this->getSrc(1);
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index fd1cf75..36e5964 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -101,8 +101,8 @@
 
 const TargetX8632Traits::TableTypeX8632AttributesType
     TargetX8632Traits::TableTypeX8632Attributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
-  { IceType_##elementty }                                                      \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
+  { IceType_##elty }                                                           \
   ,
         ICETYPEX8632_TABLE
 #undef X
@@ -462,7 +462,7 @@
 namespace dummy3 {
 // Define a temporary set of enum values based on low-level table entries.
 enum _tmp_enum {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
   _tmp_##tag,
   ICETYPEX8632_TABLE
 #undef X
@@ -475,7 +475,7 @@
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
 // table entry keys are consistent.
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
   static const int _table2_##tag = _tmp_##tag;                                 \
   static_assert(_table1_##tag == _table2_##tag,                                \
                 "Inconsistency between ICETYPEX8632_TABLE and ICETYPE_TABLE");
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index 334dbdc..8844519 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -946,14 +946,15 @@
   } InstCmppsAttributes[];
 
   static const struct TypeAttributesType {
-    const char *CvtString;    // i (integer), s (single FP), d (double FP)
-    const char *SdSsString;   // ss, sd, or <blank>
-    const char *PdPsString;   // ps, pd, or <blank>
-    const char *SpsdString;   // ss, sd, ps, pd, or <blank>
-    const char *PackString;   // b, w, d, or <blank>
-    const char *UnpackString; // bw, wd, dq, or <blank>
-    const char *WidthString;  // b, w, l, q, or <blank>
-    const char *FldString;    // s, l, or <blank>
+    const char *CvtString;      // si, ss, sd, dq, ps, or ?
+    const char *SdSsString;     // ss, sd, or <blank>
+    const char *PdPsString;     // ps, pd, or <blank>
+    const char *SpsdString;     // ss, sd, ps, pd, or <blank>
+    const char *IntegralString; // b, w, d, q, or <blank>
+    const char *UnpackString;   // bw, wd, dq, or <blank>
+    const char *PackString;     // wb, dw, or <blank>
+    const char *WidthString;    // b, w, l, q, or <blank>
+    const char *FldString;      // s, l, or <blank>
   } TypeAttributes[];
 
   static const char *InstSegmentRegNames[];
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 66a1581..df454b0 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -94,8 +94,8 @@
 
 const TargetX8664Traits::TableTypeX8664AttributesType
     TargetX8664Traits::TableTypeX8664Attributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
-  { IceType_##elementty }                                                      \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
+  { IceType_##elty }                                                           \
   ,
         ICETYPEX8664_TABLE
 #undef X
@@ -801,7 +801,7 @@
 namespace dummy3 {
 // Define a temporary set of enum values based on low-level table entries.
 enum _tmp_enum {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
   _tmp_##tag,
   ICETYPEX8664_TABLE
 #undef X
@@ -814,7 +814,7 @@
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
 // table entry keys are consistent.
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
   static const int _table2_##tag = _tmp_##tag;                                 \
   static_assert(_table1_##tag == _table2_##tag,                                \
                 "Inconsistency between ICETYPEX8664_TABLE and ICETYPE_TABLE");
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 27d8b5f..5e3e3c1 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -1022,14 +1022,15 @@
   } InstCmppsAttributes[];
 
   static const struct TypeAttributesType {
-    const char *CvtString;    // i (integer), s (single FP), d (double FP)
-    const char *SdSsString;   // ss, sd, or <blank>
-    const char *PdPsString;   // ps, pd, or <blank>
-    const char *SpSdString;   // ss, sd, ps, pd, or <blank>
-    const char *PackString;   // b, w, d, or <blank>
-    const char *UnpackString; // bw, wd, dq, or <blank>
-    const char *WidthString;  // b, w, l, q, or <blank>
-    const char *FldString;    // s, l, or <blank>
+    const char *CvtString;      // si, ss, sd, dq, ps, or ?
+    const char *SdSsString;     // ss, sd, or <blank>
+    const char *PdPsString;     // ps, pd, or <blank>
+    const char *SpSdString;     // ss, sd, ps, pd, or <blank>
+    const char *IntegralString; // b, w, d, q, or <blank>
+    const char *UnpackString;   // bw, wd, dq, or <blank>
+    const char *PackString;     // wb, dw, or <blank>
+    const char *WidthString;    // b, w, l, q, or <blank>
+    const char *FldString;      // s, l, or <blank>
   } TypeAttributes[];
 };
 
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index a7c89f9..0f31dd4 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -815,6 +815,14 @@
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Punpckl>(Dest, Src0);
   }
+  void _packss(Variable *Dest, Operand *Src0) {
+    AutoMemorySandboxer<> _(this, &Dest, &Src0);
+    Context.insert<typename Traits::Insts::Packss>(Dest, Src0);
+  }
+  void _packus(Variable *Dest, Operand *Src0) {
+    AutoMemorySandboxer<> _(this, &Dest, &Src0);
+    Context.insert<typename Traits::Insts::Packus>(Dest, Src0);
+  }
   void _pshufb(Variable *Dest, Operand *Src0) {
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Pshufb>(Dest, Src0);
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
index 1571874..1c85e2b 100644
--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -1103,6 +1103,160 @@
 #undef TestImplXmmXmm
 }
 
+TEST_F(AssemblerX8632Test, Packss) {
+  const Dqword V0_v4i32(uint64_t(0x0001000000001234ull),
+                        uint64_t(0x7FFFFFFF80000000ull));
+  const Dqword V1_v4i32(uint64_t(0xFFFFFFFEFFFFFFFFull),
+                        uint64_t(0x0000800100007FFEull));
+  const Dqword Expected_v4i32(uint64_t(0x7FFF80007FFF1234ull),
+                              uint64_t(0x7FFF7FFEFFFEFFFFull));
+
+  const Dqword V0_v8i16(uint64_t(0x0001000000120034ull),
+                        uint64_t(0xFFFEFFFF7FFF8000ull));
+  const Dqword V1_v8i16(uint64_t(0x00057FF80081007Eull),
+                        uint64_t(0x0088007700660055ull));
+  const Dqword Expected_v8i16(uint64_t(0xFEFF7F8001001234ull),
+                              uint64_t(0x7F776655057F7F7Eull));
+
+#define TestImplXmmXmm(Dst, Src, Inst, Ty)                                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Inst ", " #Ty ")";                            \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplXmmAddr(Dst, Inst, Ty)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Inst ", " #Ty ")";                                \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, packss, v4i32);                                   \
+    TestImplXmmAddr(Dst, packss, v4i32);                                       \
+    TestImplXmmXmm(Dst, Src, packss, v8i16);                                   \
+    TestImplXmmAddr(Dst, packss, v8i16);                                       \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
+TEST_F(AssemblerX8632Test, Packus) {
+  const Dqword V0_v4i32(uint64_t(0x0001000000001234ull),
+                        uint64_t(0x7FFFFFFF80000000ull));
+  const Dqword V1_v4i32(uint64_t(0xFFFFFFFEFFFFFFFFull),
+                        uint64_t(0x0000800100007FFEull));
+  const Dqword Expected_v4i32(uint64_t(0xFFFF0000FFFF1234ull),
+                              uint64_t(0x80017FFE00000000ull));
+
+  const Dqword V0_v8i16(uint64_t(0x0001000000120034ull),
+                        uint64_t(0xFFFEFFFF7FFF8000ull));
+  const Dqword V1_v8i16(uint64_t(0x00057FF80081007Eull),
+                        uint64_t(0x0088007700660055ull));
+  const Dqword Expected_v8i16(uint64_t(0x0000FF0001001234ull),
+                              uint64_t(0x8877665505FF817Eull));
+
+#define TestImplXmmXmm(Dst, Src, Inst, Ty)                                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Inst ", " #Ty ")";                            \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplXmmAddr(Dst, Inst, Ty)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Inst ", " #Ty ")";                                \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, packus, v4i32);                                   \
+    TestImplXmmAddr(Dst, packus, v4i32);                                       \
+    TestImplXmmXmm(Dst, Src, packus, v8i16);                                   \
+    TestImplXmmAddr(Dst, packus, v8i16);                                       \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
 TEST_F(AssemblerX8632Test, Pshufb) {
   const Dqword V0(uint64_t(0x1122334455667788ull),
                   uint64_t(0x99aabbccddeeff32ull));
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
index cadb88e..4ceed00 100644
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -1169,6 +1169,160 @@
 #undef TestImplXmmXmm
 }
 
+TEST_F(AssemblerX8664Test, Packss) { // packss: pack with signed saturation.
+  const Dqword V0_v4i32(uint64_t(0x0001000000001234ull),
+                        uint64_t(0x7FFFFFFF80000000ull));
+  const Dqword V1_v4i32(uint64_t(0xFFFFFFFEFFFFFFFFull),
+                        uint64_t(0x0000800100007FFEull));
+  const Dqword Expected_v4i32(uint64_t(0x7FFF80007FFF1234ull),
+                              uint64_t(0x7FFF7FFEFFFEFFFFull));
+  // Expected's first qword is V0 (Dst) packed; its second is V1 (Src) packed.
+  const Dqword V0_v8i16(uint64_t(0x0001000000120034ull),
+                        uint64_t(0xFFFEFFFF7FFF8000ull));
+  const Dqword V1_v8i16(uint64_t(0x00057FF80081007Eull),
+                        uint64_t(0x0088007700660055ull));
+  const Dqword Expected_v8i16(uint64_t(0xFEFF7F8001001234ull),
+                              uint64_t(0x7F776655057F7F7Eull));
+  // Reg/reg form: loads V0 into Dst and V1 into Src, then checks Dst.
+#define TestImplXmmXmm(Dst, Src, Inst, Ty)                                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Inst ", " #Ty ")";                            \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+    /* The dqword slots are filled in only after the code is assembled. */     \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+  // Reg/mem form: Src is read directly from the second dqword slot (T1).
+#define TestImplXmmAddr(Dst, Inst, Ty)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Inst ", " #Ty ")";                                \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+    /* The dqword slots are filled in only after the code is assembled. */     \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+  // Runs both operand forms for each element type on one register pair.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, packss, v4i32);                                   \
+    TestImplXmmAddr(Dst, packss, v4i32);                                       \
+    TestImplXmmXmm(Dst, Src, packss, v8i16);                                   \
+    TestImplXmmAddr(Dst, packss, v8i16);                                       \
+  } while (0)
+  // NOTE(review): only xmm0-xmm7 are exercised here; xmm8-xmm15 are untested.
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
+TEST_F(AssemblerX8664Test, Packus) { // packus: pack with unsigned saturation.
+  const Dqword V0_v4i32(uint64_t(0x0001000000001234ull),
+                        uint64_t(0x7FFFFFFF80000000ull));
+  const Dqword V1_v4i32(uint64_t(0xFFFFFFFEFFFFFFFFull),
+                        uint64_t(0x0000800100007FFEull));
+  const Dqword Expected_v4i32(uint64_t(0xFFFF0000FFFF1234ull),
+                              uint64_t(0x80017FFE00000000ull));
+  // Expected's first qword is V0 (Dst) packed; its second is V1 (Src) packed.
+  const Dqword V0_v8i16(uint64_t(0x0001000000120034ull),
+                        uint64_t(0xFFFEFFFF7FFF8000ull));
+  const Dqword V1_v8i16(uint64_t(0x00057FF80081007Eull),
+                        uint64_t(0x0088007700660055ull));
+  const Dqword Expected_v8i16(uint64_t(0x0000FF0001001234ull),
+                              uint64_t(0x8877665505FF817Eull));
+  // Reg/reg form: loads V0 into Dst and V1 into Src, then checks Dst.
+#define TestImplXmmXmm(Dst, Src, Inst, Ty)                                     \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", " #Src ", " #Inst ", " #Ty ")";                            \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+    /* The dqword slots are filled in only after the code is assembled. */     \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+  // Reg/mem form: Src is read directly from the second dqword slot (T1).
+#define TestImplXmmAddr(Dst, Inst, Ty)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] =                                       \
+        "(" #Dst ", Addr, " #Inst ", " #Ty ")";                                \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+    /* The dqword slots are filled in only after the code is assembled. */     \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0_##Ty);                                             \
+    test.setDqwordTo(T1, V1_##Ty);                                             \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString;                \
+    reset();                                                                   \
+  } while (0)
+  // Runs both operand forms for each element type on one register pair.
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, packus, v4i32);                                   \
+    TestImplXmmAddr(Dst, packus, v4i32);                                       \
+    TestImplXmmXmm(Dst, Src, packus, v8i16);                                   \
+    TestImplXmmAddr(Dst, packus, v8i16);                                       \
+  } while (0)
+  // NOTE(review): only xmm0-xmm7 are exercised here; xmm8-xmm15 are untested.
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
 TEST_F(AssemblerX8664Test, Pshufb) {
   const Dqword V0(uint64_t(0x1122334455667788ull),
                   uint64_t(0x99aabbccddeeff32ull));