; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a, int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate two multiply instructions,
; one for a[LOC][] and one for a[LOC+20][]. visitMUL in DAGCombiner.cpp
; should combine the two multiplies so that only one is needed.
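;
; Concretely (using the constants from the expected output below), with
; LOC = lll + 5 the three store addresses expand to:
;   a[LOC][LOC]   -> a + lll*400 + lll*4 + 2020
;   a[LOC][20]    -> a + lll*400 + 2080
;   a[LOC+20][20] -> a + lll*400 + 10080
; so a single imull (lll * 400) can feed all three stores, with the
; constant parts folded into the addressing modes.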
;
; Output looks roughly like this:
;
;	movl	8(%esp), %eax
;	movl	12(%esp), %ecx
;	imull	$400, %ecx, %edx        # imm = 0x190
;	leal	(%edx,%eax), %esi
;	movl	$11, 2020(%esi,%ecx,4)
;	movl	$22, 2080(%edx,%eax)
;	movl	$33, 10080(%edx,%eax)

; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
; CHECK-LABEL: testCombineMultiplies:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
; CHECK-NEXT:    leal (%eax,%edx), %esi
; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
; CHECK-NEXT:    movl $22, 2080(%eax,%edx)
; CHECK-NEXT:    movl $33, 10080(%eax,%edx)
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    retl
entry:
  %add = add nsw i32 %lll, 5
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int) {22, 22, 22, 22};
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int) {22, 22, 22, 22};
;   x = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:                              # @testCombineMultiplies_splat
; # %bb.0:                                 # %entry
; 	movdqa	.LCPI1_0, %xmm1         # xmm1 = [11,11,11,11]
; 	paddd	%xmm0, %xmm1
; 	movdqa	.LCPI1_1, %xmm2         # xmm2 = [22,22,22,22]
; 	pshufd	$245, %xmm0, %xmm3      # xmm3 = xmm0[1,1,3,3]
; 	pmuludq	%xmm2, %xmm0
; 	pshufd	$232, %xmm0, %xmm0      # xmm0 = xmm0[0,2,2,3]
; 	pmuludq	%xmm2, %xmm3
; 	pshufd	$232, %xmm3, %xmm2      # xmm2 = xmm3[0,2,2,3]
; 	punpckldq	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; 	movdqa	.LCPI1_2, %xmm2         # xmm2 = [242,242,242,242]
;	paddd	%xmm0, %xmm2
;	paddd	.LCPI1_3, %xmm0
;	movdqa	%xmm2, v2
;	movdqa	%xmm0, v3
;	movdqa	%xmm1, x
;	retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
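;
; Put differently, DAGCombiner rewrites each store as
;   (v1 + C1) * C2  ==>  v1 * C2 + (C1 * C2)
; so the splat case folds 11 * 22 into the [242,242,242,242] constant
; (and 33 * 22 = 726 into the other paddd constant) seen above, leaving a
; single vector multiply of v1 by 22.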

@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm3
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except for the vector values.
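;
; Here the folded constant is the elementwise product
;   [11,22,33,44] * [22,33,44,55] = [242,726,1452,2420],
; which is exactly the constant loaded into xmm2 in the checks below.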

; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_non_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,33,44,55]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm3, %xmm2
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}