Subzero: Don't use key SSE instructions on potentially unaligned loads.

The non-mov-like SSE instructions generally require 16-byte aligned memory operands.  The PNaCl bitcode ABI only guarantees element-size alignment (4 bytes or less) on vector loads and stores.  Subzero maintains stack alignment, so stack memory operands are fine.

We handle this by legalizing such memory operands into a register wherever there is any doubt about their alignment.
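
As an illustration, the fix takes the following shape at the affected lowering sites (a minimal sketch of the pattern used in the diff below; the exact operand names vary per call site):

  // A vector operand legalized to Legal_Reg | Legal_Mem may still be a
  // memory operand with only element-size alignment, which a non-mov SSE
  // instruction cannot safely consume, so force it into an xmm register.
  if (llvm::isa<OperandX8632Mem>(Src1))
    Src1 = legalizeToVar(Src1);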

This bug was first discovered on the vector_align scons test.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4083
BUG= https://code.google.com/p/nativeclient/issues/detail?id=4133
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/1024253003
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 6dad487..cebcc55 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -273,6 +273,29 @@
   static bool isClassof(const Inst *Inst, InstKindX8632 MyKind) {
     return Inst->getKind() == static_cast<InstKind>(MyKind);
   }
+  // Most instructions that operate on vector arguments require vector
+  // memory operands to be fully aligned (16-byte alignment for PNaCl
+  // vector types).  The stack frame layout and call ABI ensure proper
+  // alignment for stack operands, but memory operands (originating
+  // from load/store bitcode instructions) only have element-size
+  // alignment guarantees.  This function validates that none of the
+  // operands is a memory operand of vector type, calling
+  // report_fatal_error() if one is found.  This function should be
+  // called during emission, and maybe also in the ctor (as long as
+  // that fits the lowering style).
+  void validateVectorAddrMode() const {
+    if (getDest())
+      validateVectorAddrModeOpnd(getDest());
+    for (SizeT i = 0; i < getSrcSize(); ++i) {
+      validateVectorAddrModeOpnd(getSrc(i));
+    }
+  }
+private:
+  static void validateVectorAddrModeOpnd(const Operand *Opnd) {
+    if (llvm::isa<OperandX8632Mem>(Opnd) && isVectorType(Opnd->getType())) {
+      llvm::report_fatal_error("Possible misaligned vector memory operation");
+    }
+  }
 };
 
 // InstX8632Label represents an intra-block label that is the target
@@ -752,10 +775,12 @@
   void emit(const Cfg *Func) const override {
     if (!ALLOW_DUMP)
       return;
+    validateVectorAddrMode();
     const bool ShiftHack = false;
     emitTwoAddress(Opcode, this, Func, ShiftHack);
   }
   void emitIAS(const Cfg *Func) const override {
+    validateVectorAddrMode();
     Type Ty = getDest()->getType();
     if (NeedsElementType)
       Ty = typeElementType(Ty);
@@ -803,10 +828,12 @@
   void emit(const Cfg *Func) const override {
     if (!ALLOW_DUMP)
       return;
+    validateVectorAddrMode();
     const bool ShiftHack = false;
     emitTwoAddress(Opcode, this, Func, ShiftHack);
   }
   void emitIAS(const Cfg *Func) const override {
+    validateVectorAddrMode();
     Type Ty = getDest()->getType();
     assert(AllowAllTypes || isVectorType(Ty));
     Type ElementTy = typeElementType(Ty);
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index e99516b..662ac54 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1375,6 +1375,8 @@
   } else if (isVectorType(Dest->getType())) {
     // TODO: Trap on integer divide and integer modulo by zero.
     // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
+    if (llvm::isa<OperandX8632Mem>(Src1))
+      Src1 = legalizeToVar(Src1);
     switch (Inst->getOp()) {
     case InstArithmetic::_num:
       llvm_unreachable("Unknown arithmetic operator");
@@ -2090,6 +2092,8 @@
       assert(Dest->getType() == IceType_v4i32 &&
              Inst->getSrc(0)->getType() == IceType_v4f32);
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
       Variable *T = makeReg(Dest->getType());
       _cvt(T, Src0RM, InstX8632Cvt::Tps2dq);
       _movp(Dest, T);
@@ -2165,6 +2169,8 @@
       assert(Dest->getType() == IceType_v4f32 &&
              Inst->getSrc(0)->getType() == IceType_v4i32);
       Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
       Variable *T = makeReg(Dest->getType());
       _cvt(T, Src0RM, InstX8632Cvt::Dq2ps);
       _movp(Dest, T);
@@ -2472,6 +2478,8 @@
     } else {
       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
       Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
 
       switch (Condition) {
       default: {
@@ -2609,10 +2617,14 @@
       llvm_unreachable("unexpected condition");
       break;
     case InstIcmp::Eq: {
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
       _movp(T, Src0RM);
       _pcmpeq(T, Src1RM);
     } break;
     case InstIcmp::Ne: {
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
       _movp(T, Src0RM);
       _pcmpeq(T, Src1RM);
       Variable *MinusOne = makeVectorOfMinusOnes(Ty);
@@ -2620,12 +2632,16 @@
     } break;
     case InstIcmp::Ugt:
     case InstIcmp::Sgt: {
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
       _movp(T, Src0RM);
       _pcmpgt(T, Src1RM);
     } break;
     case InstIcmp::Uge:
     case InstIcmp::Sge: {
       // !(Src1RM > Src0RM)
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
       _movp(T, Src1RM);
       _pcmpgt(T, Src0RM);
       Variable *MinusOne = makeVectorOfMinusOnes(Ty);
@@ -2633,12 +2649,16 @@
     } break;
     case InstIcmp::Ult:
     case InstIcmp::Slt: {
+      if (llvm::isa<OperandX8632Mem>(Src0RM))
+        Src0RM = legalizeToVar(Src0RM);
       _movp(T, Src1RM);
       _pcmpgt(T, Src0RM);
     } break;
     case InstIcmp::Ule:
     case InstIcmp::Sle: {
       // !(Src0RM > Src1RM)
+      if (llvm::isa<OperandX8632Mem>(Src1RM))
+        Src1RM = legalizeToVar(Src1RM);
       _movp(T, Src0RM);
       _pcmpgt(T, Src1RM);
       Variable *MinusOne = makeVectorOfMinusOnes(Ty);
@@ -3092,8 +3112,12 @@
     Variable *T = makeVectorOfFabsMask(Ty);
     // The pand instruction operates on an m128 memory operand, so if
     // Src is an f32 or f64, we need to make sure it's in a register.
-    if (!isVectorType(Ty))
+    if (isVectorType(Ty)) {
+      if (llvm::isa<OperandX8632Mem>(Src))
+        Src = legalizeToVar(Src);
+    } else {
       Src = legalizeToVar(Src);
+    }
     _pand(T, Src);
     if (isVectorType(Ty))
       _movp(Dest, T);
diff --git a/tests_lit/llvm2ice_tests/address-mode-opt.ll b/tests_lit/llvm2ice_tests/address-mode-opt.ll
index c8c2802..6f196c9 100644
--- a/tests_lit/llvm2ice_tests/address-mode-opt.ll
+++ b/tests_lit/llvm2ice_tests/address-mode-opt.ll
@@ -56,8 +56,9 @@
   %arg1 = load <8 x i16>* %addr_ptr, align 2
   %res_vec = mul <8 x i16> %arg0, %arg1
   ret <8 x i16> %res_vec
+; Address mode optimization is generally unsafe for SSE vector instructions.
 ; CHECK-LABEL: load_mul_v8i16_mem
-; CHECK: pmullw xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40]
+; CHECK-NOT: pmullw xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
 }
 
 define <4 x i32> @load_mul_v4i32_mem(<4 x i32> %arg0, i32 %arg1_iptr) {
@@ -67,12 +68,13 @@
   %arg1 = load <4 x i32>* %addr_ptr, align 4
   %res = mul <4 x i32> %arg0, %arg1
   ret <4 x i32> %res
+; Address mode optimization is generally unsafe for SSE vector instructions.
 ; CHECK-LABEL: load_mul_v4i32_mem
-; CHECK: pmuludq xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40]
+; CHECK-NOT: pmuludq xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
 ; CHECK: pmuludq
 ;
 ; SSE41-LABEL: load_mul_v4i32_mem
-; SSE41: pmulld xmm{{.*}},XMMWORD PTR [e{{.*}}-0x30d40]
+; SSE41-NOT: pmulld xmm{{.*}},XMMWORD PTR [e{{..}}-0x30d40]
 }
 
 define float @address_mode_opt_chaining(float* %arg) {
diff --git a/tests_lit/llvm2ice_tests/vector-align.ll b/tests_lit/llvm2ice_tests/vector-align.ll
new file mode 100644
index 0000000..4964f6c
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-align.ll
@@ -0,0 +1,85 @@
+; This test checks that when SSE instructions access memory and require full
+; alignment, their memory operands are limited to properly aligned stack
+; operands.  A folded memory operand only arises when we fuse a load
+; instruction into another instruction, which currently only happens for
+; non-scalarized Arithmetic instructions.
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2  | FileCheck %s
+; RUN: %p2i -i %s --filetype=obj --disassemble --args -Om1 | FileCheck %s
+
+define <4 x i32> @test_add(i32 %addr_i, <4 x i32> %addend) {
+entry:
+  %addr = inttoptr i32 %addr_i to <4 x i32>*
+  %loaded = load <4 x i32>* %addr, align 4
+  %result = add <4 x i32> %addend, %loaded
+  ret <4 x i32> %result
+}
+; CHECK-LABEL: test_add
+; CHECK-NOT: paddd xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: paddd xmm{{.}},
+
+define <4 x i32> @test_and(i32 %addr_i, <4 x i32> %addend) {
+entry:
+  %addr = inttoptr i32 %addr_i to <4 x i32>*
+  %loaded = load <4 x i32>* %addr, align 4
+  %result = and <4 x i32> %addend, %loaded
+  ret <4 x i32> %result
+}
+; CHECK-LABEL: test_and
+; CHECK-NOT: pand xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: pand xmm{{.}},
+
+define <4 x i32> @test_or(i32 %addr_i, <4 x i32> %addend) {
+entry:
+  %addr = inttoptr i32 %addr_i to <4 x i32>*
+  %loaded = load <4 x i32>* %addr, align 4
+  %result = or <4 x i32> %addend, %loaded
+  ret <4 x i32> %result
+}
+; CHECK-LABEL: test_or
+; CHECK-NOT: por xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: por xmm{{.}},
+
+define <4 x i32> @test_xor(i32 %addr_i, <4 x i32> %addend) {
+entry:
+  %addr = inttoptr i32 %addr_i to <4 x i32>*
+  %loaded = load <4 x i32>* %addr, align 4
+  %result = xor <4 x i32> %addend, %loaded
+  ret <4 x i32> %result
+}
+; CHECK-LABEL: test_xor
+; CHECK-NOT: pxor xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: pxor xmm{{.}},
+
+define <4 x i32> @test_sub(i32 %addr_i, <4 x i32> %addend) {
+entry:
+  %addr = inttoptr i32 %addr_i to <4 x i32>*
+  %loaded = load <4 x i32>* %addr, align 4
+  %result = sub <4 x i32> %addend, %loaded
+  ret <4 x i32> %result
+}
+; CHECK-LABEL: test_sub
+; CHECK-NOT: psubd xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: psubd xmm{{.}},
+
+define <4 x float> @test_fadd(i32 %addr_i, <4 x float> %addend) {
+entry:
+  %addr = inttoptr i32 %addr_i to <4 x float>*
+  %loaded = load <4 x float>* %addr, align 4
+  %result = fadd <4 x float> %addend, %loaded
+  ret <4 x float> %result
+}
+; CHECK-LABEL: test_fadd
+; CHECK-NOT: addps xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: addps xmm{{.}},
+
+define <4 x float> @test_fsub(i32 %addr_i, <4 x float> %addend) {
+entry:
+  %addr = inttoptr i32 %addr_i to <4 x float>*
+  %loaded = load <4 x float>* %addr, align 4
+  %result = fsub <4 x float> %addend, %loaded
+  ret <4 x float> %result
+}
+; CHECK-LABEL: test_fsub
+; CHECK-NOT: subps xmm{{.}},XMMWORD PTR [e{{ax|cx|dx|di|si|bx|bp}}
+; CHECK: subps xmm{{.}},