Handle GPR and vector shift ops. Handle pmull also.

For the integer shift ops, since the Src1 operand is forced
to be an immediate or a register (cl), it should be legal for
Dest+Src0 to be either a register or a memory operand. However,
we currently only use the register form. Shifts with Dest+Src0
in memory might be less optimized on some micro-architectures,
since the instruction has to load, shift, and store all in one
operation, but I'm not sure.

BUG=none
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/622113002
diff --git a/tests_lit/llvm2ice_tests/address-mode-opt.ll b/tests_lit/llvm2ice_tests/address-mode-opt.ll
index ba42d65..68538e0 100644
--- a/tests_lit/llvm2ice_tests/address-mode-opt.ll
+++ b/tests_lit/llvm2ice_tests/address-mode-opt.ll
@@ -3,6 +3,10 @@
 ; RUN: %p2i -i %s --args -O2 --verbose none \
 ; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
 ; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args -O2 -mattr=sse4.1 --verbose none \
+; RUN:   | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN:   | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
+; RUN:   | FileCheck --check-prefix=SSE41 %s
 ; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
 
 define float @load_arg_plus_200000(float* %arg) {
@@ -49,6 +53,32 @@
 ; CHECK: movss xmm0, dword ptr [e{{..}}]
 }
 
+define <8 x i16> @load_mul_v8i16_mem(<8 x i16> %arg0, i32 %arg1_iptr) {
+entry:  ; multiplies %arg0 by a <8 x i16> loaded from address (%arg1_iptr - 200000)
+  %addr_sub = sub i32 %arg1_iptr, 200000  ; constant offset that the expected pmullw operand carries as "- 200000"
+  %addr_ptr = inttoptr i32 %addr_sub to <8 x i16>*  ; reinterpret the integer as a vector pointer
+  %arg1 = load <8 x i16>* %addr_ptr, align 2  ; the pattern below expects this load as pmullw's memory operand
+  %res_vec = mul <8 x i16> %arg0, %arg1
+  ret <8 x i16> %res_vec
+; CHECK-LABEL: load_mul_v8i16_mem:
+; CHECK: pmullw xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+}
+
+define <4 x i32> @load_mul_v4i32_mem(<4 x i32> %arg0, i32 %arg1_iptr) {
+entry:  ; multiplies %arg0 by a <4 x i32> loaded from address (%arg1_iptr - 200000)
+  %addr_sub = sub i32 %arg1_iptr, 200000  ; constant offset that the expected memory operands carry as "- 200000"
+  %addr_ptr = inttoptr i32 %addr_sub to <4 x i32>*  ; reinterpret the integer as a vector pointer
+  %arg1 = load <4 x i32>* %addr_ptr, align 4  ; the patterns below expect this load as a multiply's memory operand
+  %res = mul <4 x i32> %arg0, %arg1
+  ret <4 x i32> %res
+; CHECK-LABEL: load_mul_v4i32_mem:
+; CHECK: pmuludq xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+; CHECK: pmuludq
+; Default run above: two pmuludq, the first with the memory operand. The -mattr=sse4.1 run below expects pmulld with that operand.
+; SSE41-LABEL: load_mul_v4i32_mem:
+; SSE41: pmulld xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+}
+
 define float @address_mode_opt_chaining(float* %arg) {
 entry:
   %arg.int = ptrtoint float* %arg to i32