Subzero: Don't use key SSE instructions on potentially unaligned loads.
The non-mov-like SSE instructions generally require 16-byte aligned memory operands. The PNaCl bitcode ABI only guarantees at most 4-byte alignment for vector loads and stores. Subzero maintains stack alignment, so stack memory operands are fine.
We handle this by legalizing such memory operands into registers wherever there is doubt about their alignment.
This bug was first discovered on the vector_align scons test.
BUG= https://code.google.com/p/nativeclient/issues/detail?id=4083
BUG= https://code.google.com/p/nativeclient/issues/detail?id=4133
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/1024253003
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 6dad487..cebcc55 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -273,6 +273,29 @@
static bool isClassof(const Inst *Inst, InstKindX8632 MyKind) {
return Inst->getKind() == static_cast<InstKind>(MyKind);
}
+ // Most instructions that operate on vector arguments require vector
+ // memory operands to be fully aligned (16-byte alignment for PNaCl
+ // vector types). The stack frame layout and call ABI ensure proper
+ // alignment for stack operands, but memory operands (originating
+ // from load/store bitcode instructions) only have element-size
+ // alignment guarantees. This function validates that none of the
+ // operands is a memory operand of vector type, calling
+ // report_fatal_error() if one is found. This function should be
+ // called during emission, and maybe also in the ctor (as long as
+ // that fits the lowering style).
+ void validateVectorAddrMode() const {
+ if (getDest())
+ validateVectorAddrModeOpnd(getDest());
+ for (SizeT i = 0; i < getSrcSize(); ++i) {
+ validateVectorAddrModeOpnd(getSrc(i));
+ }
+ }
+private:
+ static void validateVectorAddrModeOpnd(const Operand *Opnd) {
+ if (llvm::isa<OperandX8632Mem>(Opnd) && isVectorType(Opnd->getType())) {
+ llvm::report_fatal_error("Possible misaligned vector memory operation");
+ }
+ }
};
// InstX8632Label represents an intra-block label that is the target
@@ -752,10 +775,12 @@
void emit(const Cfg *Func) const override {
if (!ALLOW_DUMP)
return;
+ validateVectorAddrMode();
const bool ShiftHack = false;
emitTwoAddress(Opcode, this, Func, ShiftHack);
}
void emitIAS(const Cfg *Func) const override {
+ validateVectorAddrMode();
Type Ty = getDest()->getType();
if (NeedsElementType)
Ty = typeElementType(Ty);
@@ -803,10 +828,12 @@
void emit(const Cfg *Func) const override {
if (!ALLOW_DUMP)
return;
+ validateVectorAddrMode();
const bool ShiftHack = false;
emitTwoAddress(Opcode, this, Func, ShiftHack);
}
void emitIAS(const Cfg *Func) const override {
+ validateVectorAddrMode();
Type Ty = getDest()->getType();
assert(AllowAllTypes || isVectorType(Ty));
Type ElementTy = typeElementType(Ty);
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index e99516b..662ac54 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1375,6 +1375,8 @@
} else if (isVectorType(Dest->getType())) {
// TODO: Trap on integer divide and integer modulo by zero.
// See: https://code.google.com/p/nativeclient/issues/detail?id=3899
+ if (llvm::isa<OperandX8632Mem>(Src1))
+ Src1 = legalizeToVar(Src1);
switch (Inst->getOp()) {
case InstArithmetic::_num:
llvm_unreachable("Unknown arithmetic operator");
@@ -2090,6 +2092,8 @@
assert(Dest->getType() == IceType_v4i32 &&
Inst->getSrc(0)->getType() == IceType_v4f32);
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+ if (llvm::isa<OperandX8632Mem>(Src0RM))
+ Src0RM = legalizeToVar(Src0RM);
Variable *T = makeReg(Dest->getType());
_cvt(T, Src0RM, InstX8632Cvt::Tps2dq);
_movp(Dest, T);
@@ -2165,6 +2169,8 @@
assert(Dest->getType() == IceType_v4f32 &&
Inst->getSrc(0)->getType() == IceType_v4i32);
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+ if (llvm::isa<OperandX8632Mem>(Src0RM))
+ Src0RM = legalizeToVar(Src0RM);
Variable *T = makeReg(Dest->getType());
_cvt(T, Src0RM, InstX8632Cvt::Dq2ps);
_movp(Dest, T);
@@ -2472,6 +2478,8 @@
} else {
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+ if (llvm::isa<OperandX8632Mem>(Src1RM))
+ Src1RM = legalizeToVar(Src1RM);
switch (Condition) {
default: {
@@ -2609,10 +2617,14 @@
llvm_unreachable("unexpected condition");
break;
case InstIcmp::Eq: {
+ if (llvm::isa<OperandX8632Mem>(Src1RM))
+ Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpeq(T, Src1RM);
} break;
case InstIcmp::Ne: {
+ if (llvm::isa<OperandX8632Mem>(Src1RM))
+ Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpeq(T, Src1RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
@@ -2620,12 +2632,16 @@
} break;
case InstIcmp::Ugt:
case InstIcmp::Sgt: {
+ if (llvm::isa<OperandX8632Mem>(Src1RM))
+ Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpgt(T, Src1RM);
} break;
case InstIcmp::Uge:
case InstIcmp::Sge: {
// !(Src1RM > Src0RM)
+ if (llvm::isa<OperandX8632Mem>(Src0RM))
+ Src0RM = legalizeToVar(Src0RM);
_movp(T, Src1RM);
_pcmpgt(T, Src0RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
@@ -2633,12 +2649,16 @@
} break;
case InstIcmp::Ult:
case InstIcmp::Slt: {
+ if (llvm::isa<OperandX8632Mem>(Src0RM))
+ Src0RM = legalizeToVar(Src0RM);
_movp(T, Src1RM);
_pcmpgt(T, Src0RM);
} break;
case InstIcmp::Ule:
case InstIcmp::Sle: {
// !(Src0RM > Src1RM)
+ if (llvm::isa<OperandX8632Mem>(Src1RM))
+ Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpgt(T, Src1RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
@@ -3092,8 +3112,12 @@
Variable *T = makeVectorOfFabsMask(Ty);
// The pand instruction operates on an m128 memory operand, so if
// Src is an f32 or f64, we need to make sure it's in a register.
- if (!isVectorType(Ty))
+ if (isVectorType(Ty)) {
+ if (llvm::isa<OperandX8632Mem>(Src))
+ Src = legalizeToVar(Src);
+ } else {
Src = legalizeToVar(Src);
+ }
_pand(T, Src);
if (isVectorType(Ty))
_movp(Dest, T);
diff --git a/tests_lit/llvm2ice_tests/address-mode-opt.ll b/tests_lit/llvm2ice_tests/address-mode-opt.ll
index c8c2802..6f196c9 100644
--- a/tests_lit/llvm2ice_tests/address-mode-opt.ll
+++ b/tests_lit/llvm2ice_tests/address-mode-opt.ll
@@ -56,8 +56,9 @@
%arg1 = load <8 x i16>* %addr_ptr, align 2
%res_vec = mul <8 x i16> %arg0, %arg1
ret <8 x i16> %res_vec
+; Address mode optimization is generally unsafe for SSE vector instructions.
; CHECK-LABEL: load_mul_v8i16_mem
-; CHECK: pmullw xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40]
+; CHECK-NOT: pmullw xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
}
define <4 x i32> @load_mul_v4i32_mem(<4 x i32> %arg0, i32 %arg1_iptr) {
@@ -67,12 +68,13 @@
%arg1 = load <4 x i32>* %addr_ptr, align 4
%res = mul <4 x i32> %arg0, %arg1
ret <4 x i32> %res
+; Address mode optimization is generally unsafe for SSE vector instructions.
; CHECK-LABEL: load_mul_v4i32_mem
-; CHECK: pmuludq xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40]
+; CHECK-NOT: pmuludq xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
; CHECK: pmuludq
;
; SSE41-LABEL: load_mul_v4i32_mem
-; SSE41: pmulld xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40]
+; SSE41-NOT: pmulld xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
}
define float @address_mode_opt_chaining(float* %arg) {
diff --git a/tests_lit/llvm2ice_tests/vector-align.ll b/tests_lit/llvm2ice_tests/vector-align.ll
new file mode 100644
index 0000000..4964f6c
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-align.ll
@@ -0,0 +1,85 @@
+; This test checks that when SSE instructions access memory and require full
+; alignment, memory operands are limited to properly aligned stack operands.
+; This would only happen when we fuse a load instruction with another
+; instruction, which currently only happens with non-scalarized Arithmetic
+; instructions.
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
+; RUN: %p2i -i %s --filetype=obj --disassemble --args -Om1 | FileCheck %s
+
+define <4 x i32> @test_add(i32 %addr_i, <4 x i32> %addend) {
+entry:
+ %addr = inttoptr i32 %addr_i to <4 x i32>*
+ %loaded = load <4 x i32>* %addr, align 4
+ %result = add <4 x i32> %addend, %loaded
+ ret <4 x i32> %result
+}
+; CHECK-LABEL: test_add
+; CHECK-NOT: paddd xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: paddd xmm{{.}},
+
+define <4 x i32> @test_and(i32 %addr_i, <4 x i32> %addend) {
+entry:
+ %addr = inttoptr i32 %addr_i to <4 x i32>*
+ %loaded = load <4 x i32>* %addr, align 4
+ %result = and <4 x i32> %addend, %loaded
+ ret <4 x i32> %result
+}
+; CHECK-LABEL: test_and
+; CHECK-NOT: pand xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: pand xmm{{.}},
+
+define <4 x i32> @test_or(i32 %addr_i, <4 x i32> %addend) {
+entry:
+ %addr = inttoptr i32 %addr_i to <4 x i32>*
+ %loaded = load <4 x i32>* %addr, align 4
+ %result = or <4 x i32> %addend, %loaded
+ ret <4 x i32> %result
+}
+; CHECK-LABEL: test_or
+; CHECK-NOT: por xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: por xmm{{.}},
+
+define <4 x i32> @test_xor(i32 %addr_i, <4 x i32> %addend) {
+entry:
+ %addr = inttoptr i32 %addr_i to <4 x i32>*
+ %loaded = load <4 x i32>* %addr, align 4
+ %result = xor <4 x i32> %addend, %loaded
+ ret <4 x i32> %result
+}
+; CHECK-LABEL: test_xor
+; CHECK-NOT: pxor xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: pxor xmm{{.}},
+
+define <4 x i32> @test_sub(i32 %addr_i, <4 x i32> %addend) {
+entry:
+ %addr = inttoptr i32 %addr_i to <4 x i32>*
+ %loaded = load <4 x i32>* %addr, align 4
+ %result = sub <4 x i32> %addend, %loaded
+ ret <4 x i32> %result
+}
+; CHECK-LABEL: test_sub
+; CHECK-NOT: psubd xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: psubd xmm{{.}},
+
+define <4 x float> @test_fadd(i32 %addr_i, <4 x float> %addend) {
+entry:
+ %addr = inttoptr i32 %addr_i to <4 x float>*
+ %loaded = load <4 x float>* %addr, align 4
+ %result = fadd <4 x float> %addend, %loaded
+ ret <4 x float> %result
+}
+; CHECK-LABEL: test_fadd
+; CHECK-NOT: addps xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: addps xmm{{.}},
+
+define <4 x float> @test_fsub(i32 %addr_i, <4 x float> %addend) {
+entry:
+ %addr = inttoptr i32 %addr_i to <4 x float>*
+ %loaded = load <4 x float>* %addr, align 4
+ %result = fsub <4 x float> %addend, %loaded
+ ret <4 x float> %result
+}
+; CHECK-LABEL: test_fsub
+; CHECK-NOT: subps xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: subps xmm{{.}},