| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s |
| |
| ; Source file looks something like this: |
| ; |
| ; typedef int AAA[100][100]; |
| ; |
; void testCombineMultiplies(AAA a, int lll)
| ; { |
| ; int LOC = lll + 5; |
| ; |
| ; a[LOC][LOC] = 11; |
| ; |
| ; a[LOC][20] = 22; |
| ; a[LOC+20][20] = 33; |
| ; } |
| ; |
; We want to make sure we don't generate two multiply instructions,
; one for a[LOC][] and one for a[LOC+20][]. visitMUL in DAGCombiner.cpp
; should combine the instructions so as to avoid the extra
; multiply.
| ; |
| ; Output looks roughly like this: |
| ; |
| ; movl 8(%esp), %eax |
| ; movl 12(%esp), %ecx |
| ; imull $400, %ecx, %edx # imm = 0x190 |
| ; leal (%edx,%eax), %esi |
| ; movl $11, 2020(%esi,%ecx,4) |
| ; movl $22, 2080(%edx,%eax) |
| ; movl $33, 10080(%edx,%eax) |
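;
; Concretely (a rough sketch, not autogenerated; the row stride of AAA is
; sizeof(int[100]) = 400 bytes), the three store addresses are:
;
;   &a[LOC][LOC]   = a + (lll + 5)*400 + (lll + 5)*4 = a + lll*400 + lll*4 + 2020
;   &a[LOC][20]    = a + (lll + 5)*400 + 80          = a + lll*400 + 2080
;   &a[LOC+20][20] = a + (lll + 25)*400 + 80         = a + lll*400 + 10080
;
; so a single 'imull $400, %ecx' computes lll*400 and the remaining constants
; fold into the addressing-mode displacements shown above.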
| |
| ; Function Attrs: nounwind |
| define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind { |
| ; CHECK-LABEL: testCombineMultiplies: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: pushl %esi |
| ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx |
| ; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190 |
| ; CHECK-NEXT: leal (%eax,%edx), %esi |
| ; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4) |
| ; CHECK-NEXT: movl $22, 2080(%eax,%edx) |
| ; CHECK-NEXT: movl $33, 10080(%eax,%edx) |
| ; CHECK-NEXT: popl %esi |
| ; CHECK-NEXT: retl |
| entry: |
| %add = add nsw i32 %lll, 5 |
| %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add |
| store i32 11, i32* %arrayidx1, align 4 |
| %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20 |
| store i32 22, i32* %arrayidx3, align 4 |
| %add4 = add nsw i32 %lll, 25 |
| %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20 |
| store i32 33, i32* %arrayidx6, align 4 |
| ret void |
| } |
| |
| |
| ; Test for the same optimization on vector multiplies. |
| ; |
| ; Source looks something like this: |
| ; |
| ; typedef int v4int __attribute__((__vector_size__(16))); |
| ; |
| ; v4int x; |
| ; v4int v2, v3; |
| ; void testCombineMultiplies_splat(v4int v1) { |
| ; v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22}; |
| ; v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22}; |
| ; x = (v1 + (v4int){ 11, 11, 11, 11 }); |
| ; } |
| ; |
| ; Output looks something like this: |
| ; |
| ; testCombineMultiplies_splat: # @testCombineMultiplies_splat |
| ; # %bb.0: # %entry |
| ; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11] |
| ; paddd %xmm0, %xmm1 |
| ; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22] |
| ; pshufd $245, %xmm0, %xmm3 # xmm3 = xmm0[1,1,3,3] |
| ; pmuludq %xmm2, %xmm0 |
| ; pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3] |
| ; pmuludq %xmm2, %xmm3 |
| ; pshufd $232, %xmm3, %xmm2 # xmm2 = xmm3[0,2,2,3] |
| ; punpckldq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] |
| ; movdqa .LCPI1_2, %xmm2 # xmm2 = [242,242,242,242] |
| ; paddd %xmm0, %xmm2 |
| ; paddd .LCPI1_3, %xmm0 |
| ; movdqa %xmm2, v2 |
| ; movdqa %xmm0, v3 |
| ; movdqa %xmm1, x |
| ; retl |
| ; |
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do two adds followed by two multiplies (i.e., four pmuludq instructions).
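;
; In other words, the combine rewrites (a rough sketch, not autogenerated):
;
;   (v1 + {11,11,11,11}) * {22,22,22,22}  ==>  v1 * {22,22,22,22} + {242,242,242,242}
;   (v1 + {33,33,33,33}) * {22,22,22,22}  ==>  v1 * {22,22,22,22} + {726,726,726,726}
;
; which is why the expected asm has one (two-pmuludq) multiply, a paddd of the
; splat constant 242, and a paddd of a constant-pool entry that should hold the
; splat 726 (i.e. 33 * 22).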
| |
| @v2 = common global <4 x i32> zeroinitializer, align 16 |
| @v3 = common global <4 x i32> zeroinitializer, align 16 |
| @x = common global <4 x i32> zeroinitializer, align 16 |
| |
| ; Function Attrs: nounwind |
| define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind { |
| ; CHECK-LABEL: testCombineMultiplies_splat: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11] |
| ; CHECK-NEXT: paddd %xmm0, %xmm1 |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22] |
| ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] |
| ; CHECK-NEXT: pmuludq %xmm2, %xmm0 |
| ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] |
| ; CHECK-NEXT: pmuludq %xmm2, %xmm3 |
| ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] |
| ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242] |
| ; CHECK-NEXT: paddd %xmm0, %xmm2 |
| ; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm2, v2 |
| ; CHECK-NEXT: movdqa %xmm0, v3 |
| ; CHECK-NEXT: movdqa %xmm1, x |
| ; CHECK-NEXT: retl |
| entry: |
| %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11> |
| %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22> |
| %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33> |
| %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22> |
| store <4 x i32> %mul1, <4 x i32>* @v2, align 16 |
| store <4 x i32> %mul2, <4 x i32>* @v3, align 16 |
| store <4 x i32> %add1, <4 x i32>* @x, align 16 |
| ret void |
| } |
| |
; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except that the add and multiply constants
; are non-splat vectors.
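;
; The same rewrite applies lane-wise (a rough sketch; the second add constant
; ends up in a constant-pool entry that is not spelled out in the CHECK lines):
;
;   (v1 + {11,22,33,44}) * {22,33,44,55}  ==>  v1 * {22,33,44,55} + {242,726,1452,2420}
;   (v1 + {33,44,55,66}) * {22,33,44,55}  ==>  v1 * {22,33,44,55} + {726,1452,2420,3630}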
| |
| ; Function Attrs: nounwind |
| define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind { |
| ; CHECK-LABEL: testCombineMultiplies_non_splat: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44] |
| ; CHECK-NEXT: paddd %xmm0, %xmm1 |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,33,44,55] |
| ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] |
| ; CHECK-NEXT: pmuludq %xmm2, %xmm0 |
| ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] |
| ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] |
| ; CHECK-NEXT: pmuludq %xmm3, %xmm2 |
| ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] |
| ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] |
| ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] |
| ; CHECK-NEXT: paddd %xmm0, %xmm2 |
| ; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0 |
| ; CHECK-NEXT: movdqa %xmm2, v2 |
| ; CHECK-NEXT: movdqa %xmm0, v3 |
| ; CHECK-NEXT: movdqa %xmm1, x |
| ; CHECK-NEXT: retl |
| entry: |
| %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44> |
| %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55> |
| %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66> |
| %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55> |
| store <4 x i32> %mul1, <4 x i32>* @v2, align 16 |
| store <4 x i32> %mul2, <4 x i32>* @v3, align 16 |
| store <4 x i32> %add1, <4 x i32>* @x, align 16 |
| ret void |
| } |