; third_party/llvm-7.0/llvm/test/CodeGen/X86/merge-store-constants.ll - SwiftShader.git - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=avx | FileCheck %s --check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64

 ; Four adjacent i32 constant stores (1, 2, 3, 4) covering 16 contiguous bytes
 ; starting at %a. The assertions below expect both targets to materialize the
 ; constants as one 128-bit vector and write them with a single vector store.
 define void @big_nonzero_16_bytes(i32* nocapture %a) {
 ; X32-LABEL: big_nonzero_16_bytes:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
 ; X32-NEXT:    vmovups %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: big_nonzero_16_bytes:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
 ; X64-NEXT:    vmovups %xmm0, (%rdi)
 ; X64-NEXT:    retq
   ; Pointers to elements 1..3; element 0 is addressed through %a directly.
   %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
   %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
   %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3

   ; Every store carries only 4-byte alignment; the expected merged store above
   ; is the unaligned form (vmovups).
   store i32 1, i32* %a, align 4
   store i32 2, i32* %arrayidx1, align 4
   store i32 3, i32* %arrayidx2, align 4
   store i32 4, i32* %arrayidx3, align 4
   ret void
 }

 ; TODO: We assumed that two 64-bit stores were better than 1 vector load and 1 vector store.
 ; But if the 64-bit constants can't be represented as sign-extended 32-bit constants, then
 ; it takes extra instructions to do this in scalar.

 ; Two adjacent i64 constant stores covering 16 contiguous bytes at %a. Both
 ; constants (0x100000001 and 0x300000001) need more than 32 bits.
 ; Expected output: the i686 run merges them into one 128-bit vector store of
 ; [1,1,1,3]; the x86_64 run keeps two scalar stores, each preceded by a
 ; movabsq that loads the 64-bit immediate into %rax.
 define void @big_nonzero_16_bytes_big64bit_constants(i64* nocapture %a) {
 ; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,3]
 ; X32-NEXT:    vmovups %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movabsq $4294967297, %rax # imm = 0x100000001
 ; X64-NEXT:    movq %rax, (%rdi)
 ; X64-NEXT:    movabsq $12884901889, %rax # imm = 0x300000001
 ; X64-NEXT:    movq %rax, 8(%rdi)
 ; X64-NEXT:    retq
   ; Pointer to the second i64 element (byte offset 8 in the checks above).
   %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1

   ; No explicit alignment is given on these stores.
   store i64 4294967297, i64* %a
   store i64 12884901889, i64* %arrayidx1
   ret void
 }

 ; Splats may be an opportunity to use a broadcast op.

 ; Eight adjacent i32 stores of the same constant (42) covering 32 contiguous
 ; bytes at %a. The assertions below expect both targets to load a 256-bit
 ; vector of eight 42s, write it with one vector store, and issue vzeroupper
 ; before returning.
 define void @big_nonzero_32_bytes_splat(i32* nocapture %a) {
 ; X32-LABEL: big_nonzero_32_bytes_splat:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
 ; X32-NEXT:    vmovups %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: big_nonzero_32_bytes_splat:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
 ; X64-NEXT:    vmovups %ymm0, (%rdi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   ; Pointers to elements 1..7; element 0 is addressed through %a directly.
   %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
   %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
   %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3
   %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 4
   %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 5
   %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 6
   %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 7

   ; Every element receives the same value, each store with 4-byte alignment.
   store i32 42, i32* %a, align 4
   store i32 42, i32* %arrayidx1, align 4
   store i32 42, i32* %arrayidx2, align 4
   store i32 42, i32* %arrayidx3, align 4
   store i32 42, i32* %arrayidx4, align 4
   store i32 42, i32* %arrayidx5, align 4
   store i32 42, i32* %arrayidx6, align 4
   store i32 42, i32* %arrayidx7, align 4
   ret void
 }

 ; Verify that we choose the best-sized store(s) for each chunk.

 ; Writes 63 contiguous bytes of constants through %a using mixed store widths:
 ; seven i64 values (1..7) at byte offsets 0..48, an i32 (8) at offset 56,
 ; an i16 (9) at offset 60 and an i8 (10) at offset 62.
 ; Expected output on x86_64: one 256-bit vector store for the first four i64
 ; values, then scalar movq/movl/movw/movb stores for the rest.
 ; Expected output on i686: a 256-bit and a 128-bit vector store for the first
 ; six i64 values, with the seventh written as two 32-bit halves.
 define void @big_nonzero_63_bytes(i8* nocapture %a) {
 ; X32-LABEL: big_nonzero_63_bytes:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
 ; X32-NEXT:    vmovups %ymm0, (%eax)
 ; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
 ; X32-NEXT:    vmovups %xmm0, 32(%eax)
 ; X32-NEXT:    movl $0, 52(%eax)
 ; X32-NEXT:    movl $7, 48(%eax)
 ; X32-NEXT:    movl $8, 56(%eax)
 ; X32-NEXT:    movw $9, 60(%eax)
 ; X32-NEXT:    movb $10, 62(%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: big_nonzero_63_bytes:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,3,4]
 ; X64-NEXT:    vmovups %ymm0, (%rdi)
 ; X64-NEXT:    movq $5, 32(%rdi)
 ; X64-NEXT:    movq $6, 40(%rdi)
 ; X64-NEXT:    movq $7, 48(%rdi)
 ; X64-NEXT:    movl $8, 56(%rdi)
 ; X64-NEXT:    movw $9, 60(%rdi)
 ; X64-NEXT:    movb $10, 62(%rdi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   ; View the buffer as i64 and form pointers to elements 1..6; the value names
   ; carry the byte offset of each element (8, 16, ..., 48).
   %a8 = bitcast i8* %a to i64*
   %arrayidx8 = getelementptr inbounds i64, i64* %a8, i64 1
   %arrayidx16 = getelementptr inbounds i64, i64* %a8, i64 2
   %arrayidx24 = getelementptr inbounds i64, i64* %a8, i64 3
   %arrayidx32 = getelementptr inbounds i64, i64* %a8, i64 4
   %arrayidx40 = getelementptr inbounds i64, i64* %a8, i64 5
   %arrayidx48 = getelementptr inbounds i64, i64* %a8, i64 6
   ; i32 element 14 is byte offset 56.
   %a4 = bitcast i8* %a to i32*
   %arrayidx56 = getelementptr inbounds i32, i32* %a4, i64 14
   ; i16 element 30 is byte offset 60.
   %a2 = bitcast i8* %a to i16*
   %arrayidx60 = getelementptr inbounds i16, i16* %a2, i64 30
   ; Byte offset 62 is the last byte written; byte 63 is left untouched.
   %arrayidx62 = getelementptr inbounds i8, i8* %a, i64 62

   ; Stores are issued in ascending address order with no explicit alignment.
   store i64 1, i64* %a8
   store i64 2, i64* %arrayidx8
   store i64 3, i64* %arrayidx16
   store i64 4, i64* %arrayidx24
   store i64 5, i64* %arrayidx32
   store i64 6, i64* %arrayidx40
   store i64 7, i64* %arrayidx48
   store i32 8, i32* %arrayidx56
   store i16 9, i16* %arrayidx60
   store i8 10, i8* %arrayidx62
   ret void
 }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X32
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64

	; Four adjacent i32 constant stores (1, 2, 3, 4) covering 16 contiguous bytes
	; starting at %a. The assertions below expect both targets to materialize the
	; constants as one 128-bit vector and write them with a single vector store.
	; NOTE(review): a function with this same name and body is already defined
	; earlier in this file. This looks like a duplicated paste; confirm and keep
	; only one copy, since LLVM IR rejects a second definition of the same name.
	define void @big_nonzero_16_bytes(i32* nocapture %a) {
	; X32-LABEL: big_nonzero_16_bytes:
	; X32: # %bb.0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4]
	; X32-NEXT: vmovups %xmm0, (%eax)
	; X32-NEXT: retl
	;
	; X64-LABEL: big_nonzero_16_bytes:
	; X64: # %bb.0:
	; X64-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4]
	; X64-NEXT: vmovups %xmm0, (%rdi)
	; X64-NEXT: retq
	; Pointers to elements 1..3; element 0 is addressed through %a directly.
	%arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
	%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
	%arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3

	; Every store carries only 4-byte alignment; the expected merged store above
	; is the unaligned form (vmovups).
	store i32 1, i32* %a, align 4
	store i32 2, i32* %arrayidx1, align 4
	store i32 3, i32* %arrayidx2, align 4
	store i32 4, i32* %arrayidx3, align 4
	ret void
	}

	; TODO: We assumed that two 64-bit stores were better than 1 vector load and 1 vector store.
	; But if the 64-bit constants can't be represented as sign-extended 32-bit constants, then
	; it takes extra instructions to do this in scalar.

	; Two adjacent i64 constant stores covering 16 contiguous bytes at %a. Both
	; constants (0x100000001 and 0x300000001) need more than 32 bits.
	; Expected output: the i686 run merges them into one 128-bit vector store of
	; [1,1,1,3]; the x86_64 run keeps two scalar stores, each preceded by a
	; movabsq that loads the 64-bit immediate into %rax.
	; NOTE(review): a function with this same name and body is already defined
	; earlier in this file. This looks like a duplicated paste; confirm and keep
	; only one copy, since LLVM IR rejects a second definition of the same name.
	define void @big_nonzero_16_bytes_big64bit_constants(i64* nocapture %a) {
	; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
	; X32: # %bb.0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,3]
	; X32-NEXT: vmovups %xmm0, (%eax)
	; X32-NEXT: retl
	;
	; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
	; X64: # %bb.0:
	; X64-NEXT: movabsq $4294967297, %rax # imm = 0x100000001
	; X64-NEXT: movq %rax, (%rdi)
	; X64-NEXT: movabsq $12884901889, %rax # imm = 0x300000001
	; X64-NEXT: movq %rax, 8(%rdi)
	; X64-NEXT: retq
	; Pointer to the second i64 element (byte offset 8 in the checks above).
	%arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1

	; No explicit alignment is given on these stores.
	store i64 4294967297, i64* %a
	store i64 12884901889, i64* %arrayidx1
	ret void
	}

	; Splats may be an opportunity to use a broadcast op.

	; Eight adjacent i32 stores of the same constant (42) covering 32 contiguous
	; bytes at %a. The assertions below expect both targets to load a 256-bit
	; vector of eight 42s, write it with one vector store, and issue vzeroupper
	; before returning.
	; NOTE(review): a function with this same name and body is already defined
	; earlier in this file. This looks like a duplicated paste; confirm and keep
	; only one copy, since LLVM IR rejects a second definition of the same name.
	define void @big_nonzero_32_bytes_splat(i32* nocapture %a) {
	; X32-LABEL: big_nonzero_32_bytes_splat:
	; X32: # %bb.0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
	; X32-NEXT: vmovups %ymm0, (%eax)
	; X32-NEXT: vzeroupper
	; X32-NEXT: retl
	;
	; X64-LABEL: big_nonzero_32_bytes_splat:
	; X64: # %bb.0:
	; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
	; X64-NEXT: vmovups %ymm0, (%rdi)
	; X64-NEXT: vzeroupper
	; X64-NEXT: retq
	; Pointers to elements 1..7; element 0 is addressed through %a directly.
	%arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
	%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
	%arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3
	%arrayidx4 = getelementptr inbounds i32, i32* %a, i64 4
	%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 5
	%arrayidx6 = getelementptr inbounds i32, i32* %a, i64 6
	%arrayidx7 = getelementptr inbounds i32, i32* %a, i64 7

	; Every element receives the same value, each store with 4-byte alignment.
	store i32 42, i32* %a, align 4
	store i32 42, i32* %arrayidx1, align 4
	store i32 42, i32* %arrayidx2, align 4
	store i32 42, i32* %arrayidx3, align 4
	store i32 42, i32* %arrayidx4, align 4
	store i32 42, i32* %arrayidx5, align 4
	store i32 42, i32* %arrayidx6, align 4
	store i32 42, i32* %arrayidx7, align 4
	ret void
	}

	; Verify that we choose the best-sized store(s) for each chunk.

	; Writes 63 contiguous bytes of constants through %a using mixed store widths:
	; seven i64 values (1..7) at byte offsets 0..48, an i32 (8) at offset 56,
	; an i16 (9) at offset 60 and an i8 (10) at offset 62.
	; Expected output on x86_64: one 256-bit vector store for the first four i64
	; values, then scalar movq/movl/movw/movb stores for the rest.
	; Expected output on i686: a 256-bit and a 128-bit vector store for the first
	; six i64 values, with the seventh written as two 32-bit halves.
	; NOTE(review): a function with this same name and body is already defined
	; earlier in this file. This looks like a duplicated paste; confirm and keep
	; only one copy, since LLVM IR rejects a second definition of the same name.
	define void @big_nonzero_63_bytes(i8* nocapture %a) {
	; X32-LABEL: big_nonzero_63_bytes:
	; X32: # %bb.0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
	; X32-NEXT: vmovups %ymm0, (%eax)
	; X32-NEXT: vmovaps {{.*#+}} xmm0 = [5,0,6,0]
	; X32-NEXT: vmovups %xmm0, 32(%eax)
	; X32-NEXT: movl $0, 52(%eax)
	; X32-NEXT: movl $7, 48(%eax)
	; X32-NEXT: movl $8, 56(%eax)
	; X32-NEXT: movw $9, 60(%eax)
	; X32-NEXT: movb $10, 62(%eax)
	; X32-NEXT: vzeroupper
	; X32-NEXT: retl
	;
	; X64-LABEL: big_nonzero_63_bytes:
	; X64: # %bb.0:
	; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,3,4]
	; X64-NEXT: vmovups %ymm0, (%rdi)
	; X64-NEXT: movq $5, 32(%rdi)
	; X64-NEXT: movq $6, 40(%rdi)
	; X64-NEXT: movq $7, 48(%rdi)
	; X64-NEXT: movl $8, 56(%rdi)
	; X64-NEXT: movw $9, 60(%rdi)
	; X64-NEXT: movb $10, 62(%rdi)
	; X64-NEXT: vzeroupper
	; X64-NEXT: retq
	; View the buffer as i64 and form pointers to elements 1..6; the value names
	; carry the byte offset of each element (8, 16, ..., 48).
	%a8 = bitcast i8* %a to i64*
	%arrayidx8 = getelementptr inbounds i64, i64* %a8, i64 1
	%arrayidx16 = getelementptr inbounds i64, i64* %a8, i64 2
	%arrayidx24 = getelementptr inbounds i64, i64* %a8, i64 3
	%arrayidx32 = getelementptr inbounds i64, i64* %a8, i64 4
	%arrayidx40 = getelementptr inbounds i64, i64* %a8, i64 5
	%arrayidx48 = getelementptr inbounds i64, i64* %a8, i64 6
	; i32 element 14 is byte offset 56.
	%a4 = bitcast i8* %a to i32*
	%arrayidx56 = getelementptr inbounds i32, i32* %a4, i64 14
	; i16 element 30 is byte offset 60.
	%a2 = bitcast i8* %a to i16*
	%arrayidx60 = getelementptr inbounds i16, i16* %a2, i64 30
	; Byte offset 62 is the last byte written; byte 63 is left untouched.
	%arrayidx62 = getelementptr inbounds i8, i8* %a, i64 62

	; Stores are issued in ascending address order with no explicit alignment.
	store i64 1, i64* %a8
	store i64 2, i64* %arrayidx8
	store i64 3, i64* %arrayidx16
	store i64 4, i64* %arrayidx24
	store i64 5, i64* %arrayidx32
	store i64 6, i64* %arrayidx40
	store i64 7, i64* %arrayidx48
	store i32 8, i32* %arrayidx56
	store i16 9, i16* %arrayidx60
	store i8 10, i8* %arrayidx62
	ret void
	}