Use movss to implement insertelement when elements = 4 and index = 0. This avoids using a pair of shufps instructions as the previous lowering was doing. Instead, we use movss to copy the element to be inserted into the lower 32 bits of the destination. Define InstX8632Movss as a Binop, the class to which it properly belongs. BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/412353005

commit: cfe5146fc08cd992aed6aefcee5f3d8642b4c2d8 [log] [tgz]
author: Matt Wala <wala@chromium.org> Fri Jul 25 15:57:56 2014 -0700
committer: Matt Wala <wala@chromium.org> Fri Jul 25 15:57:56 2014 -0700
tree: 5fff1cce08ed94d87d776740d680fdcfecd86a29
parent: ce0ca8f8ea1a859cba5f295f1f942eaba9dfe703 [diff]
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 1698dfe..7d930c2 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp

@@ -462,7 +462,6 @@
 template <> const char *InstX8632Bsr::Opcode = "bsr";
 template <> const char *InstX8632Lea::Opcode = "lea";
 template <> const char *InstX8632Movd::Opcode = "movd";
-template <> const char *InstX8632Movss::Opcode = "movss";
 template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
 // Binary ops
 template <> const char *InstX8632Add::Opcode = "add";
@@ -499,6 +498,7 @@
 template <> const char *InstX8632Psra::Opcode = "psra";
 template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
+template <> const char *InstX8632Movss::Opcode = "movss";
 // Ternary ops
 template <> const char *InstX8632Shufps::Opcode = "shufps";
 template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";

diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 6760057..ddea6b5 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h

@@ -552,7 +552,6 @@
 typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
 typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
 typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
-typedef InstX8632Unaryop<InstX8632::Movss> InstX8632Movss;
 typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
 typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
@@ -586,6 +585,13 @@
 typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
 typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
 typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
+// TODO: movss is only a binary operation when the source and dest
+// operands are both registers.  In other cases, it behaves like a copy
+// (mov-like) operation.  Eventually, InstX8632Movss should assert that
+// both its source and dest operands are registers, and the lowering
+// code should use _mov instead of _movss in cases where a copy
+// operation is intended.
+typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
 typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 4a719d4..83dc5bd 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp

@@ -2165,24 +2165,26 @@
     // require aligned memory operands until support for stack alignment
     // is implemented.
 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
-    Operand *T = NULL;
+    Variable *T = NULL;
     if (Index) {
       // The shuffle only needs to occur if the element to be extracted
       // is not at the lowest index.
       Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
       T = makeReg(Ty);
-      _pshufd(llvm::cast<Variable>(T), ALIGN_HACK(SourceVectOperand), Mask);
+      _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);
     } else {
-      // TODO(wala): If SourceVectOperand is in memory, express it as
-      // mem32 so that the call to legalizeToVar() is made unnecessary.
-      // _movd and _movss only take mem32 memory operands.
       T = legalizeToVar(SourceVectOperand);
     }
 
     if (InVectorElementTy == IceType_i32) {
       _movd(ExtractedElement, T);
-    } else { // InVectorElementTy == IceType_f32
-      // TODO: _mov should be able to be used here.
+    } else { // InVectorElementTy == IceType_f32
+      // TODO(wala): _movss is only used here because _mov does not
+      // allow a vector source and a scalar destination.  _mov should be
+      // able to be used here.
+      // _movss is a binary instruction, so the FakeDef is needed to
+      // keep the live range analysis consistent.
+      Context.insert(InstFakeDef::create(Func, ExtractedElement));
       _movss(ExtractedElement, T);
     }
 #undef ALIGN_HACK
@@ -2521,6 +2523,7 @@
   // Only constant indices are allowed in PNaCl IR.
   assert(ElementIndex);
   unsigned Index = ElementIndex->getValue();
+  assert(Index < typeNumElements(SourceVectOperand->getType()));
 
   Type Ty = SourceVectOperand->getType();
   Type ElementTy = typeElementType(Ty);
@@ -2538,7 +2541,8 @@
   }
 
   if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Lower insertelement with 32-bit wide elements using shufps.
+    // Lower insertelement with 32-bit wide elements using shufps or
+    // movss.
     // TODO(wala): SSE4.1 has pinsrd and insertps.
     Variable *Element = NULL;
     if (InVectorElementTy == IceType_f32) {
@@ -2551,6 +2555,14 @@
       _movd(Element, T);
     }
 
+    if (Index == 0) {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectOperand);
+      _movss(T, Element);
+      _movp(Inst->getDest(), T);
+      return;
+    }
+
     // shufps treats the source and destination operands as vectors of
     // four doublewords.  The destination's two high doublewords are
     // selected from the source operand and the two low doublewords are
@@ -2560,10 +2572,6 @@
     // Element[0] is being inserted into SourceVectOperand.  Indices are
     // ordered from left to right.
     //
-    // insertelement into index 0 (result is stored in Element):
-    //   Element := Element[0, 0] SourceVectOperand[0, 1]
-    //   Element := Element[0, 3] SourceVectOperand[2, 3]
-    //
     // insertelement into index 1 (result is stored in Element):
     //   Element := Element[0, 0] SourceVectOperand[0, 0]
     //   Element := Element[3, 0] SourceVectOperand[2, 3]
@@ -2577,17 +2585,17 @@
     //   T := SourceVectOperand
     //   Element := Element[0, 0] T[0, 2]
     //   T := T[0, 1] Element[3, 0]
-    const unsigned char Mask1[4] = {64, 0, 192, 128};
-    const unsigned char Mask2[4] = {236, 227, 196, 52};
+    const unsigned char Mask1[3] = {0, 192, 128};
+    const unsigned char Mask2[3] = {227, 196, 52};
 
-    Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index]);
-    Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index]);
+    Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]);
+    Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]);
 
     // ALIGNHACK: Force vector operands to registers in instructions that
     // require aligned memory operands until support for stack alignment
     // is implemented.
 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
-    if (Index < 2) {
+    if (Index == 1) {
       SourceVectOperand = ALIGN_HACK(SourceVectOperand);
       _shufps(Element, SourceVectOperand, Mask1Constant);
       _shufps(Element, SourceVectOperand, Mask2Constant);

diff --git a/tests_lit/llvm2ice_tests/vector-ops.ll b/tests_lit/llvm2ice_tests/vector-ops.ll
index 868438d..86647db 100644
--- a/tests_lit/llvm2ice_tests/vector-ops.ll
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll

@@ -12,20 +12,37 @@
 
 ; insertelement operations
 
-define <4 x float> @insertelement_v4f32(<4 x float> %vec, float %elt) {
+define <4 x float> @insertelement_v4f32_0(<4 x float> %vec, float %elt) {
+entry:
+  %res = insertelement <4 x float> %vec, float %elt, i32 0
+  ret <4 x float> %res
+; CHECK-LABEL: insertelement_v4f32_0:
+; CHECK: movss
+}
+
+define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) {
+entry:
+  %res = insertelement <4 x i32> %vec, i32 %elt, i32 0
+  ret <4 x i32> %res
+; CHECK-LABEL: insertelement_v4i32_0:
+; CHECK: movss
+}
+
+
+define <4 x float> @insertelement_v4f32_1(<4 x float> %vec, float %elt) {
 entry:
   %res = insertelement <4 x float> %vec, float %elt, i32 1
   ret <4 x float> %res
-; CHECK-LABEL: insertelement_v4f32:
+; CHECK-LABEL: insertelement_v4f32_1:
 ; CHECK: shufps
 ; CHECK: shufps
 }
 
-define <4 x i32> @insertelement_v4i32(<4 x i32> %vec, i32 %elt) {
+define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) {
 entry:
   %res = insertelement <4 x i32> %vec, i32 %elt, i32 1
   ret <4 x i32> %res
-; CHECK-LABEL: insertelement_v4i32:
+; CHECK-LABEL: insertelement_v4i32_1:
 ; CHECK: shufps
 ; CHECK: shufps
 }
@@ -50,12 +67,21 @@
 ; CHECK: mov
 }
 
-define <4 x i1> @insertelement_v4i1(<4 x i1> %vec, i32 %elt.arg) {
+define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <4 x i1> %vec, i1 %elt, i32 0
+  ret <4 x i1> %res
+; CHECK-LABEL: insertelement_v4i1_0:
+; CHECK: movss
+}
+
+define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) {
 entry:
   %elt = trunc i32 %elt.arg to i1
   %res = insertelement <4 x i1> %vec, i1 %elt, i32 1
   ret <4 x i1> %res
-; CHECK-LABEL: insertelement_v4i1:
+; CHECK-LABEL: insertelement_v4i1_1:
 ; CHECK: shufps
 ; CHECK: shufps
 }
commit	cfe5146fc08cd992aed6aefcee5f3d8642b4c2d8	[log] [tgz]
author	Matt Wala <wala@chromium.org>	Fri Jul 25 15:57:56 2014 -0700
committer	Matt Wala <wala@chromium.org>	Fri Jul 25 15:57:56 2014 -0700
tree	5fff1cce08ed94d87d776740d680fdcfecd86a29
parent	ce0ca8f8ea1a859cba5f295f1f942eaba9dfe703 [diff]