Use movss to implement insertelement when elements = 4 and index = 0.
This avoids using a pair of shufps instructions as the previous lowering
was doing. Instead, we use movss to copy the element to be inserted
into the lower 32 bits of the destination.
Define InstX8632Movss as a Binop, the class to which it properly
belongs.
BUG=none
R=jvoung@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/412353005
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 1698dfe..7d930c2 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -462,7 +462,6 @@
template <> const char *InstX8632Bsr::Opcode = "bsr";
template <> const char *InstX8632Lea::Opcode = "lea";
template <> const char *InstX8632Movd::Opcode = "movd";
-template <> const char *InstX8632Movss::Opcode = "movss";
template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
// Binary ops
template <> const char *InstX8632Add::Opcode = "add";
@@ -499,6 +498,7 @@
template <> const char *InstX8632Psra::Opcode = "psra";
template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
+template <> const char *InstX8632Movss::Opcode = "movss";
// Ternary ops
template <> const char *InstX8632Shufps::Opcode = "shufps";
template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 6760057..ddea6b5 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -552,7 +552,6 @@
typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
-typedef InstX8632Unaryop<InstX8632::Movss> InstX8632Movss;
typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
@@ -586,6 +585,13 @@
typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
+// TODO: movss is only a binary operation when the source and dest
+// operands are both registers. In other cases, it behaves like a copy
+// (mov-like) operation. Eventually, InstX8632Movss should assert that
+// both its source and dest operands are registers, and the lowering
+// code should use _mov instead of _movss in cases where a copy
+// operation is intended.
+typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 4a719d4..83dc5bd 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -2165,24 +2165,26 @@
// require aligned memory operands until support for stack alignment
// is implemented.
#define ALIGN_HACK(Vect) legalizeToVar((Vect))
- Operand *T = NULL;
+ Variable *T = NULL;
if (Index) {
// The shuffle only needs to occur if the element to be extracted
// is not at the lowest index.
Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
T = makeReg(Ty);
- _pshufd(llvm::cast<Variable>(T), ALIGN_HACK(SourceVectOperand), Mask);
+ _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);
} else {
- // TODO(wala): If SourceVectOperand is in memory, express it as
- // mem32 so that the call to legalizeToVar() is made unnecessary.
- // _movd and _movss only take mem32 memory operands.
T = legalizeToVar(SourceVectOperand);
}
if (InVectorElementTy == IceType_i32) {
_movd(ExtractedElement, T);
- } else { // InVectorElementTy == IceType_f32
- // TODO: _mov should be able to be used here.
+ } else { // InVectorElementTy == IceType_f32
+ // TODO(wala): _movss is only used here because _mov does not
+ // allow a vector source and a scalar destination. _mov should be
+ // able to be used here.
+ // _movss is a binary instruction, so the FakeDef is needed to
+ // keep the live range analysis consistent.
+ Context.insert(InstFakeDef::create(Func, ExtractedElement));
_movss(ExtractedElement, T);
}
#undef ALIGN_HACK
@@ -2521,6 +2523,7 @@
// Only constant indices are allowed in PNaCl IR.
assert(ElementIndex);
unsigned Index = ElementIndex->getValue();
+ assert(Index < typeNumElements(SourceVectOperand->getType()));
Type Ty = SourceVectOperand->getType();
Type ElementTy = typeElementType(Ty);
@@ -2538,7 +2541,8 @@
}
if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
- // Lower insertelement with 32-bit wide elements using shufps.
+ // Lower insertelement with 32-bit wide elements using shufps or
+ // movss.
// TODO(wala): SSE4.1 has pinsrd and insertps.
Variable *Element = NULL;
if (InVectorElementTy == IceType_f32) {
@@ -2551,6 +2555,14 @@
_movd(Element, T);
}
+ if (Index == 0) {
+ Variable *T = makeReg(Ty);
+ _movp(T, SourceVectOperand);
+ _movss(T, Element);
+ _movp(Inst->getDest(), T);
+ return;
+ }
+
// shufps treats the source and desination operands as vectors of
// four doublewords. The destination's two high doublewords are
// selected from the source operand and the two low doublewords are
@@ -2560,10 +2572,6 @@
// Element[0] is being inserted into SourceVectOperand. Indices are
// ordered from left to right.
//
- // insertelement into index 0 (result is stored in Element):
- // Element := Element[0, 0] SourceVectOperand[0, 1]
- // Element := Element[0, 3] SourceVectOperand[2, 3]
- //
// insertelement into index 1 (result is stored in Element):
// Element := Element[0, 0] SourceVectOperand[0, 0]
// Element := Element[3, 0] SourceVectOperand[2, 3]
@@ -2577,17 +2585,17 @@
// T := SourceVectOperand
// Element := Element[0, 0] T[0, 2]
// T := T[0, 1] Element[3, 0]
- const unsigned char Mask1[4] = {64, 0, 192, 128};
- const unsigned char Mask2[4] = {236, 227, 196, 52};
+ const unsigned char Mask1[3] = {0, 192, 128};
+ const unsigned char Mask2[3] = {227, 196, 52};
- Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index]);
- Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index]);
+ Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]);
+ Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]);
// ALIGNHACK: Force vector operands to registers in instructions that
// require aligned memory operands until support for stack alignment
// is implemented.
#define ALIGN_HACK(Vect) legalizeToVar((Vect))
- if (Index < 2) {
+ if (Index == 1) {
SourceVectOperand = ALIGN_HACK(SourceVectOperand);
_shufps(Element, SourceVectOperand, Mask1Constant);
_shufps(Element, SourceVectOperand, Mask2Constant);
diff --git a/tests_lit/llvm2ice_tests/vector-ops.ll b/tests_lit/llvm2ice_tests/vector-ops.ll
index 868438d..86647db 100644
--- a/tests_lit/llvm2ice_tests/vector-ops.ll
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll
@@ -12,20 +12,37 @@
; insertelement operations
-define <4 x float> @insertelement_v4f32(<4 x float> %vec, float %elt) {
+define <4 x float> @insertelement_v4f32_0(<4 x float> %vec, float %elt) {
+entry:
+ %res = insertelement <4 x float> %vec, float %elt, i32 0
+ ret <4 x float> %res
+; CHECK-LABEL: insertelement_v4f32_0:
+; CHECK: movss
+}
+
+define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) {
+entry:
+ %res = insertelement <4 x i32> %vec, i32 %elt, i32 0
+ ret <4 x i32> %res
+; CHECK-LABEL: insertelement_v4i32_0:
+; CHECK: movss
+}
+
+
+define <4 x float> @insertelement_v4f32_1(<4 x float> %vec, float %elt) {
entry:
%res = insertelement <4 x float> %vec, float %elt, i32 1
ret <4 x float> %res
-; CHECK-LABEL: insertelement_v4f32:
+; CHECK-LABEL: insertelement_v4f32_1:
; CHECK: shufps
; CHECK: shufps
}
-define <4 x i32> @insertelement_v4i32(<4 x i32> %vec, i32 %elt) {
+define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) {
entry:
%res = insertelement <4 x i32> %vec, i32 %elt, i32 1
ret <4 x i32> %res
-; CHECK-LABEL: insertelement_v4i32:
+; CHECK-LABEL: insertelement_v4i32_1:
; CHECK: shufps
; CHECK: shufps
}
@@ -50,12 +67,21 @@
; CHECK: mov
}
-define <4 x i1> @insertelement_v4i1(<4 x i1> %vec, i32 %elt.arg) {
+define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) {
+entry:
+ %elt = trunc i32 %elt.arg to i1
+ %res = insertelement <4 x i1> %vec, i1 %elt, i32 0
+ ret <4 x i1> %res
+; CHECK-LABEL: insertelement_v4i1_0:
+; CHECK: movss
+}
+
+define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) {
entry:
%elt = trunc i32 %elt.arg to i1
%res = insertelement <4 x i1> %vec, i1 %elt, i32 1
ret <4 x i1> %res
-; CHECK-LABEL: insertelement_v4i1:
+; CHECK-LABEL: insertelement_v4i1_1:
; CHECK: shufps
; CHECK: shufps
}