Use movss to implement insertelement when elements = 4 and index = 0.

This avoids the pair of shufps instructions that the previous lowering
emitted. Instead, we use movss to copy the element to be inserted
into the lower 32 bits of the destination.

Define InstX8632Movss as a Binop, the class to which it properly
belongs.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/412353005
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 1698dfe..7d930c2 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -462,7 +462,6 @@
 template <> const char *InstX8632Bsr::Opcode = "bsr";
 template <> const char *InstX8632Lea::Opcode = "lea";
 template <> const char *InstX8632Movd::Opcode = "movd";
-template <> const char *InstX8632Movss::Opcode = "movss";
 template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
 // Binary ops
 template <> const char *InstX8632Add::Opcode = "add";
@@ -499,6 +498,7 @@
 template <> const char *InstX8632Psra::Opcode = "psra";
 template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
+template <> const char *InstX8632Movss::Opcode = "movss";
 // Ternary ops
 template <> const char *InstX8632Shufps::Opcode = "shufps";
 template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";