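Optimize Int2 construction.

When CPUID::supportsMMX2() is true, Int2(RValue<Int> lo, RValue<Int> hi)
now builds the vector with a single UnpackLow of the two values
(movd/movd/punpckldq); the previous shuffle-vector sequence is kept as
the fallback. The FORMAT_R5G6B5 case in PixelRoutine.cpp now uses this
constructor to combine its two Int loads instead of two Insert
operations.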

Change-Id: Ibab854164a45c998976e65b8bfec80a8a688461b
Reviewed-on: https://swiftshader-review.googlesource.com/4511
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
Tested-by: Nicolas Capens <capn@google.com>
diff --git a/src/Reactor/Nucleus.cpp b/src/Reactor/Nucleus.cpp
index 712dc7b..4305895 100644
--- a/src/Reactor/Nucleus.cpp
+++ b/src/Reactor/Nucleus.cpp
@@ -4623,13 +4623,23 @@
 
 	Int2::Int2(RValue<Int> lo, RValue<Int> hi)
 	{
-		Constant *shuffle[2];
-		shuffle[0] = Nucleus::createConstantInt(0);
-		shuffle[1] = Nucleus::createConstantInt(1);
-
-		Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, VectorType::get(Int::getType(), 1)), Nucleus::createBitCast(hi.value, VectorType::get(Int::getType(), 1)), Nucleus::createConstantVector(shuffle, 2));
-
-		storeValue(Nucleus::createBitCast(packed, Int2::getType()));
+		if(CPUID::supportsMMX2())
+		{
+			// movd mm0, lo
+			// movd mm1, hi
+			// punpckldq mm0, mm1
+			storeValue(As<Int2>(UnpackLow(As<Int2>(Long1(RValue<UInt>(lo))), As<Int2>(Long1(RValue<UInt>(hi))))).value);
+		}
+		else
+		{
+			Constant *shuffle[2];
+			shuffle[0] = Nucleus::createConstantInt(0);
+			shuffle[1] = Nucleus::createConstantInt(1);
+	
+			Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, VectorType::get(Int::getType(), 1)), Nucleus::createBitCast(hi.value, VectorType::get(Int::getType(), 1)), Nucleus::createConstantVector(shuffle, 2));
+	
+			storeValue(Nucleus::createBitCast(packed, Int2::getType()));
+		}
 	}
 
 	RValue<Int2> Int2::operator=(RValue<Int2> rhs) const
diff --git a/src/Shader/PixelRoutine.cpp b/src/Shader/PixelRoutine.cpp
index a0f876c..c313eb4 100644
--- a/src/Shader/PixelRoutine.cpp
+++ b/src/Shader/PixelRoutine.cpp
@@ -1013,14 +1013,14 @@
 		Short4 c01;
 		Short4 c23;
 		Pointer<Byte> buffer;
+		Pointer<Byte> buffer2;
 
 		switch(state.targetFormat[index])
 		{
 		case FORMAT_R5G6B5:
 			buffer = cBuffer + 2 * x;
-			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
-			buffer += *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
-			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
+			buffer2 = buffer + *Pointer<Int>(r.data + OFFSET(DrawData, colorPitchB[index]));
+			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
 
 			pixel.x = c01 & Short4(0xF800u);
 			pixel.y = (c01 & Short4(0x07E0u)) << 5;