Handle GPR and vector shift ops. Handle pmull also.
For the integer shift ops, since the Src1 operand is forced
to be an immediate or a register (cl), it should be legal to
have Dest+Src0 be either a register or memory. However, we
currently only use the register form. Shifts with Dest+Src0
in memory might be less optimized on some micro-architectures,
since they have to load, shift, and store all in one
operation, but I'm not sure.
BUG=none
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/622113002
diff --git a/tests_lit/assembler/x86/immediate_encodings.ll b/tests_lit/assembler/x86/immediate_encodings.ll
index ff23e55..d3c1ac2 100644
--- a/tests_lit/assembler/x86/immediate_encodings.ll
+++ b/tests_lit/assembler/x86/immediate_encodings.ll
@@ -254,5 +254,27 @@
; CHECK-LABEL: testMul32Imm16Neg
; CHECK: 69 c0 00 ff ff ff imul eax, eax, 4294967040
+; The GPR shift instructions either take an 8-bit immediate shift count or
+; use a special, shorter encoding when the shift count is 1.
+define internal i32 @testShl16Imm8(i32 %arg) {
+entry:
+ %arg_i16 = trunc i32 %arg to i16
+ %tmp = shl i16 %arg_i16, 13 ; count 13 is expected as the imm8 byte 0d
+ %result = zext i16 %tmp to i32
+ ret i32 %result
+}
+; CHECK-LABEL: testShl16Imm8
+; CHECK: 66 c1 e0 0d shl ax, 13
+
+define internal i32 @testShl16Imm1(i32 %arg) {
+entry:
+ %arg_i16 = trunc i32 %arg to i16
+ %tmp = shl i16 %arg_i16, 1 ; count 1: expected encoding has no immediate byte
+ %result = zext i16 %tmp to i32
+ ret i32 %result
+}
+; CHECK-LABEL: testShl16Imm1
+; CHECK: 66 d1 e0 shl ax
+
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
diff --git a/tests_lit/assembler/x86/opcode_register_encodings.ll b/tests_lit/assembler/x86/opcode_register_encodings.ll
new file mode 100644
index 0000000..3d2b266
--- /dev/null
+++ b/tests_lit/assembler/x86/opcode_register_encodings.ll
@@ -0,0 +1,87 @@
+; Tests various aspects of x86 opcode encodings. E.g., the opcodes for the
+; pmull family (pmullw vs. pmulld) differ more across operand sizes than the
+; usual pattern for size variants would suggest.
+
+; RUN: %p2i -i %s --args -O2 -mattr=sse4.1 --verbose none \
+; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
+; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
+
+define <8 x i16> @test_mul_v8i16(<8 x i16> %arg0, <8 x i16> %arg1) {
+entry:
+ %res = mul <8 x i16> %arg0, %arg1 ; expected to become a single pmullw
+ ret <8 x i16> %res
+; CHECK-LABEL: test_mul_v8i16
+; CHECK: 66 0f d5 c1 pmullw xmm0, xmm1
+}
+
+; Test register and address mode encoding (xmm2-xmm7 and [esp + disp] operands).
+define <8 x i16> @test_mul_v8i16_more_regs(<8 x i1> %cond, <8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2, <8 x i16> %arg3, <8 x i16> %arg4, <8 x i16> %arg5, <8 x i16> %arg6, <8 x i16> %arg7, <8 x i16> %arg8) {
+entry: ; %arg0 is multiplied by each of %arg1..%arg8
+ %res1 = mul <8 x i16> %arg0, %arg1
+ %res2 = mul <8 x i16> %arg0, %arg2
+ %res3 = mul <8 x i16> %arg0, %arg3
+ %res4 = mul <8 x i16> %arg0, %arg4
+ %res5 = mul <8 x i16> %arg0, %arg5
+ %res6 = mul <8 x i16> %arg0, %arg6
+ %res7 = mul <8 x i16> %arg0, %arg7
+ %res8 = mul <8 x i16> %arg0, %arg8
+ %res_acc1 = select <8 x i1> %cond, <8 x i16> %res1, <8 x i16> %res2 ; the select tree consumes every product
+ %res_acc2 = select <8 x i1> %cond, <8 x i16> %res3, <8 x i16> %res4
+ %res_acc3 = select <8 x i1> %cond, <8 x i16> %res5, <8 x i16> %res6
+ %res_acc4 = select <8 x i1> %cond, <8 x i16> %res7, <8 x i16> %res8
+ %res_acc1_3 = select <8 x i1> %cond, <8 x i16> %res_acc1, <8 x i16> %res_acc3
+ %res_acc2_4 = select <8 x i1> %cond, <8 x i16> %res_acc2, <8 x i16> %res_acc4
+ %res = select <8 x i1> %cond, <8 x i16> %res_acc1_3, <8 x i16> %res_acc2_4
+ ret <8 x i16> %res
+; CHECK-LABEL: test_mul_v8i16_more_regs
+; CHECK-DAG: 66 0f d5 c2 pmullw xmm0, xmm2
+; CHECK-DAG: 66 0f d5 c3 pmullw xmm0, xmm3
+; CHECK-DAG: 66 0f d5 c4 pmullw xmm0, xmm4
+; CHECK-DAG: 66 0f d5 c5 pmullw xmm0, xmm5
+; CHECK-DAG: 66 0f d5 c6 pmullw xmm0, xmm6
+; CHECK-DAG: 66 0f d5 c7 pmullw xmm0, xmm7
+; CHECK-DAG: 66 0f d5 44 24 70 pmullw xmm0, xmmword ptr [esp + 112]
+; CHECK-DAG: 66 0f d5 8c 24 80 00 00 00 pmullw xmm1, xmmword ptr [esp + 128]
+}
+
+define <4 x i32> @test_mul_v4i32(<4 x i32> %arg0, <4 x i32> %arg1) {
+entry:
+ %res = mul <4 x i32> %arg0, %arg1 ; expected to become a single pmulld
+ ret <4 x i32> %res
+; CHECK-LABEL: test_mul_v4i32
+; CHECK: 66 0f 38 40 c1 pmulld xmm0, xmm1
+}
+
+define <4 x i32> @test_mul_v4i32_more_regs(<4 x i1> %cond, <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, <4 x i32> %arg4, <4 x i32> %arg5, <4 x i32> %arg6, <4 x i32> %arg7, <4 x i32> %arg8) {
+entry: ; %arg0 is multiplied by each of %arg1..%arg8
+ %res1 = mul <4 x i32> %arg0, %arg1
+ %res2 = mul <4 x i32> %arg0, %arg2
+ %res3 = mul <4 x i32> %arg0, %arg3
+ %res4 = mul <4 x i32> %arg0, %arg4
+ %res5 = mul <4 x i32> %arg0, %arg5
+ %res6 = mul <4 x i32> %arg0, %arg6
+ %res7 = mul <4 x i32> %arg0, %arg7
+ %res8 = mul <4 x i32> %arg0, %arg8
+ %res_acc1 = select <4 x i1> %cond, <4 x i32> %res1, <4 x i32> %res2 ; the select tree consumes every product
+ %res_acc2 = select <4 x i1> %cond, <4 x i32> %res3, <4 x i32> %res4
+ %res_acc3 = select <4 x i1> %cond, <4 x i32> %res5, <4 x i32> %res6
+ %res_acc4 = select <4 x i1> %cond, <4 x i32> %res7, <4 x i32> %res8
+ %res_acc1_3 = select <4 x i1> %cond, <4 x i32> %res_acc1, <4 x i32> %res_acc3
+ %res_acc2_4 = select <4 x i1> %cond, <4 x i32> %res_acc2, <4 x i32> %res_acc4
+ %res = select <4 x i1> %cond, <4 x i32> %res_acc1_3, <4 x i32> %res_acc2_4
+ ret <4 x i32> %res
+; CHECK-LABEL: test_mul_v4i32_more_regs
+; CHECK-DAG: 66 0f 38 40 c2 pmulld xmm0, xmm2
+; CHECK-DAG: 66 0f 38 40 c3 pmulld xmm0, xmm3
+; CHECK-DAG: 66 0f 38 40 c4 pmulld xmm0, xmm4
+; CHECK-DAG: 66 0f 38 40 c5 pmulld xmm0, xmm5
+; CHECK-DAG: 66 0f 38 40 c6 pmulld xmm0, xmm6
+; CHECK-DAG: 66 0f 38 40 c7 pmulld xmm0, xmm7
+; CHECK-DAG: 66 0f 38 40 44 24 70 pmulld xmm0, xmmword ptr [esp + 112]
+; CHECK-DAG: 66 0f 38 40 8c 24 80 00 00 00 pmulld xmm1, xmmword ptr [esp + 128]
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
diff --git a/tests_lit/llvm2ice_tests/8bit.pnacl.ll b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
index 2307712..0f0e35b 100644
--- a/tests_lit/llvm2ice_tests/8bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -159,6 +159,69 @@
; CHECK-LABEL: srem8BitConst
; CHECK: idiv {{[abcd]l|byte ptr}}
+define internal i32 @shl8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %shl = shl i8 %b_8, %a_8 ; variable count: the check expects it in cl
+ %ret = zext i8 %shl to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: shl8Bit
+; CHECK: shl {{[abd]l|byte ptr}}, cl
+
+define internal i32 @shl8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %shl = shl i8 %a_8, 6 ; constant count: expected as an immediate operand
+ %ret = zext i8 %shl to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: shl8BitConst
+; CHECK: shl {{[abcd]l|byte ptr}}, 6
+
+define internal i32 @lshr8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %lshr = lshr i8 %b_8, %a_8 ; logical right shift, variable count expected in cl
+ %ret = zext i8 %lshr to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: lshr8Bit
+; CHECK: shr {{[abd]l|byte ptr}}, cl
+
+define internal i32 @lshr8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %lshr = lshr i8 %a_8, 6 ; logical right shift, count expected as an immediate
+ %ret = zext i8 %lshr to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: lshr8BitConst
+; CHECK: shr {{[abcd]l|byte ptr}}, 6
+
+define internal i32 @ashr8Bit(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %b_8 = trunc i32 %b to i8
+ %ashr = ashr i8 %b_8, %a_8 ; arithmetic right shift, variable count expected in cl
+ %ret = zext i8 %ashr to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: ashr8Bit
+; CHECK: sar {{[abd]l|byte ptr}}, cl
+
+define internal i32 @ashr8BitConst(i32 %a, i32 %b) {
+entry:
+ %a_8 = trunc i32 %a to i8
+ %ashr = ashr i8 %a_8, 6 ; arithmetic right shift, count expected as an immediate
+ %ret = zext i8 %ashr to i32
+ ret i32 %ret
+}
+; CHECK-LABEL: ashr8BitConst
+; CHECK: sar {{[abcd]l|byte ptr}}, 6
+
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
diff --git a/tests_lit/llvm2ice_tests/address-mode-opt.ll b/tests_lit/llvm2ice_tests/address-mode-opt.ll
index ba42d65..68538e0 100644
--- a/tests_lit/llvm2ice_tests/address-mode-opt.ll
+++ b/tests_lit/llvm2ice_tests/address-mode-opt.ll
@@ -3,6 +3,10 @@
; RUN: %p2i -i %s --args -O2 --verbose none \
; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - | FileCheck %s
+; RUN: %p2i -i %s --args -O2 -mattr=sse4.1 --verbose none \
+; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
+; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
+; RUN: | FileCheck --check-prefix=SSE41 %s
; RUN: %p2i -i %s --args --verbose none | FileCheck --check-prefix=ERRORS %s
define float @load_arg_plus_200000(float* %arg) {
@@ -49,6 +53,32 @@
; CHECK: movss xmm0, dword ptr [e{{..}}]
}
+define <8 x i16> @load_mul_v8i16_mem(<8 x i16> %arg0, i32 %arg1_iptr) {
+entry:
+ %addr_sub = sub i32 %arg1_iptr, 200000 ; expected to fold into the -200000 displacement
+ %addr_ptr = inttoptr i32 %addr_sub to <8 x i16>*
+ %arg1 = load <8 x i16>* %addr_ptr, align 2 ; expected to fold into pmullw's memory operand
+ %res_vec = mul <8 x i16> %arg0, %arg1
+ ret <8 x i16> %res_vec
+; CHECK-LABEL: load_mul_v8i16_mem:
+; CHECK: pmullw xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+}
+
+define <4 x i32> @load_mul_v4i32_mem(<4 x i32> %arg0, i32 %arg1_iptr) {
+entry:
+ %addr_sub = sub i32 %arg1_iptr, 200000 ; expected to fold into the -200000 displacement
+ %addr_ptr = inttoptr i32 %addr_sub to <4 x i32>*
+ %arg1 = load <4 x i32>* %addr_ptr, align 4 ; expected to fold into the multiply's memory operand
+ %res = mul <4 x i32> %arg0, %arg1 ; pmuludq pair under CHECK, pmulld under SSE41
+ ret <4 x i32> %res
+; CHECK-LABEL: load_mul_v4i32_mem:
+; CHECK: pmuludq xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+; CHECK: pmuludq
+;
+; SSE41-LABEL: load_mul_v4i32_mem:
+; SSE41: pmulld xmm{{.*}}, xmmword ptr [e{{.*}} - 200000]
+}
+
define float @address_mode_opt_chaining(float* %arg) {
entry:
%arg.int = ptrtoint float* %arg to i32
diff --git a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
index e8bbb30..f1fc459 100644
--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -326,7 +326,9 @@
ret i32 %r_zext
}
; CHECK-LABEL: test_bswap_16
-; CHECK: rol {{.*}}, 8
+; Make sure the rotate uses a 16-bit operand, so that bits rotated out of the
+; most significant end wrap around to the least significant end at 16 bits.
+; CHECK: rol {{[abcd]x|si|di|bp|word ptr}}, 8
define i32 @test_bswap_32(i32 %x) {
entry:
diff --git a/tests_lit/llvm2ice_tests/vector-arith.ll b/tests_lit/llvm2ice_tests/vector-arith.ll
index 29bb57f..bae62cd 100644
--- a/tests_lit/llvm2ice_tests/vector-arith.ll
+++ b/tests_lit/llvm2ice_tests/vector-arith.ll
@@ -21,7 +21,6 @@
; RUN: | llvm-objdump -d --symbolize -x86-asm-syntax=intel - \
; RUN: | FileCheck --check-prefix=SSE41 %s
; RUN: %p2i -i %s -a --verbose none | FileCheck --check-prefix=ERRORS %s
-; RUN: %p2i -i %s --insts | %szdiff %s | FileCheck --check-prefix=DUMP %s
define <4 x float> @test_fadd(<4 x float> %arg0, <4 x float> %arg1) {
entry: