Use unsigned indices for array accesses in texture sampling.

Array accesses with unsigned indices can be faster on x86-64 because
32-bit operations implicitly zero-extend their result to 64 bits, so an
unsigned 32-bit index can be used directly in pointer arithmetic,
whereas a signed index needs an explicit sign-extension first.

Change-Id: I17d531d9ad05c2d2994f007d5444b2a514a591b8
Reviewed-on: https://swiftshader-review.googlesource.com/8571
Reviewed-by: Nicolas Capens <capn@google.com>
Tested-by: Nicolas Capens <capn@google.com>
diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index 18feac3..2a6766a 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -1674,7 +1674,7 @@
 		return As<Short4>(UShort4(tmp));
 	}
 
-	void SamplerCore::computeIndices(Int index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function)
+	void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function)
 	{
 		bool texelFetch = (function == Fetch);
 		bool hasOffset = (function.option == Offset);
@@ -1731,8 +1731,8 @@
 			{
 				size *= Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, depth)));
 			}
-			Int min = Int(0);
-			Int max = size - Int(1);
+			UInt min = 0;
+			UInt max = size - 1;
 
 			for(int i = 0; i < 4; i++)
 			{
@@ -1743,7 +1743,7 @@
 
 	void SamplerCore::sampleTexel(Vector4s &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
 	{
-		Int index[4];
+		UInt index[4];
 
 		computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, function);
 
@@ -1754,10 +1754,10 @@
 
 		if(has16bitTextureFormat())
 		{
-			c.x = Insert(c.x, *Pointer<Short>(buffer[f0] + 2 * index[0]), 0);
-			c.x = Insert(c.x, *Pointer<Short>(buffer[f1] + 2 * index[1]), 1);
-			c.x = Insert(c.x, *Pointer<Short>(buffer[f2] + 2 * index[2]), 2);
-			c.x = Insert(c.x, *Pointer<Short>(buffer[f3] + 2 * index[3]), 3);
+			c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);
+			c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+			c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+			c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
 
 			switch(state.textureFormat)
 			{
@@ -1776,10 +1776,10 @@
 			{
 			case 4:
 				{
-					Byte4 c0 = *Pointer<Byte4>(buffer[f0] + 4 * index[0]);
-					Byte4 c1 = *Pointer<Byte4>(buffer[f1] + 4 * index[1]);
-					Byte4 c2 = *Pointer<Byte4>(buffer[f2] + 4 * index[2]);
-					Byte4 c3 = *Pointer<Byte4>(buffer[f3] + 4 * index[3]);
+					Byte4 c0 = Pointer<Byte4>(buffer[f0])[index[0]];
+					Byte4 c1 = Pointer<Byte4>(buffer[f1])[index[1]];
+					Byte4 c2 = Pointer<Byte4>(buffer[f2])[index[2]];
+					Byte4 c3 = Pointer<Byte4>(buffer[f3])[index[3]];
 					c.x = Unpack(c0, c1);
 					c.y = Unpack(c2, c3);
 
@@ -1819,10 +1819,10 @@
 				break;
 			case 3:
 				{
-					Byte4 c0 = *Pointer<Byte4>(buffer[f0] + 4 * index[0]);
-					Byte4 c1 = *Pointer<Byte4>(buffer[f1] + 4 * index[1]);
-					Byte4 c2 = *Pointer<Byte4>(buffer[f2] + 4 * index[2]);
-					Byte4 c3 = *Pointer<Byte4>(buffer[f3] + 4 * index[3]);
+					Byte4 c0 = Pointer<Byte4>(buffer[f0])[index[0]];
+					Byte4 c1 = Pointer<Byte4>(buffer[f1])[index[1]];
+					Byte4 c2 = Pointer<Byte4>(buffer[f2])[index[2]];
+					Byte4 c3 = Pointer<Byte4>(buffer[f3])[index[3]];
 					c.x = Unpack(c0, c1);
 					c.y = Unpack(c2, c3);
 
@@ -1857,10 +1857,10 @@
 				}
 				break;
 			case 2:
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f0] + 2 * index[0]), 0);
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f1] + 2 * index[1]), 1);
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f2] + 2 * index[2]), 2);
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f3] + 2 * index[3]), 3);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
 
 				switch(state.textureFormat)
 				{
@@ -1896,10 +1896,10 @@
 			switch(textureComponentCount())
 			{
 			case 4:
-				c.x = *Pointer<Short4>(buffer[f0] + 8 * index[0]);
-				c.y = *Pointer<Short4>(buffer[f1] + 8 * index[1]);
-				c.z = *Pointer<Short4>(buffer[f2] + 8 * index[2]);
-				c.w = *Pointer<Short4>(buffer[f3] + 8 * index[3]);
+				c.x = Pointer<Short4>(buffer[f0])[index[0]];
+				c.y = Pointer<Short4>(buffer[f1])[index[1]];
+				c.z = Pointer<Short4>(buffer[f2])[index[2]];
+				c.w = Pointer<Short4>(buffer[f3])[index[3]];
 				transpose4x4(c.x, c.y, c.z, c.w);
 				break;
 			case 2:
@@ -1912,10 +1912,10 @@
 				c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
 				break;
 			case 1:
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f0] + 2 * index[0]), 0);
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f1] + 2 * index[1]), 1);
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f2] + 2 * index[2]), 2);
-				c.x = Insert(c.x, *Pointer<Short>(buffer[f3] + 2 * index[3]), 3);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
 				break;
 			default:
 				ASSERT(false);
@@ -1974,25 +1974,25 @@
 			const float G0 = (studioSwing * -16 * Yy - 128 * Gu - 128 * Gv) / 255;
 			const float B0 = (studioSwing * -16 * Yy - 128 * Bu) / 255;
 
-			Int c0 = Int(*Pointer<Byte>(buffer[0] + index[0]));
-			Int c1 = Int(*Pointer<Byte>(buffer[0] + index[1]));
-			Int c2 = Int(*Pointer<Byte>(buffer[0] + index[2]));
-			Int c3 = Int(*Pointer<Byte>(buffer[0] + index[3]));
+			Int c0 = Int(buffer[0][index[0]]);
+			Int c1 = Int(buffer[0][index[1]]);
+			Int c2 = Int(buffer[0][index[2]]);
+			Int c3 = Int(buffer[0][index[3]]);
 			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
 			UShort4 Y = As<UShort4>(Unpack(As<Byte4>(c0)));
 
 			computeIndices(index, uuuu, vvvv, wwww, offset, mipmap + sizeof(Mipmap), function);
-			c0 = Int(*Pointer<Byte>(buffer[1] + index[0]));
-			c1 = Int(*Pointer<Byte>(buffer[1] + index[1]));
-			c2 = Int(*Pointer<Byte>(buffer[1] + index[2]));
-			c3 = Int(*Pointer<Byte>(buffer[1] + index[3]));
+			c0 = Int(buffer[1][index[0]]);
+			c1 = Int(buffer[1][index[1]]);
+			c2 = Int(buffer[1][index[2]]);
+			c3 = Int(buffer[1][index[3]]);
 			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
 			UShort4 V = As<UShort4>(Unpack(As<Byte4>(c0)));
 
-			c0 = Int(*Pointer<Byte>(buffer[2] + index[0]));
-			c1 = Int(*Pointer<Byte>(buffer[2] + index[1]));
-			c2 = Int(*Pointer<Byte>(buffer[2] + index[2]));
-			c3 = Int(*Pointer<Byte>(buffer[2] + index[3]));
+			c0 = Int(buffer[2][index[0]]);
+			c1 = Int(buffer[2][index[1]]);
+			c2 = Int(buffer[2][index[2]]);
+			c3 = Int(buffer[2][index[3]]);
 			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
 			UShort4 U = As<UShort4>(Unpack(As<Byte4>(c0)));
 
@@ -2020,7 +2020,7 @@
 
 	void SamplerCore::sampleTexel(Vector4f &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
 	{
-		Int index[4];
+		UInt index[4];
 
 		computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, function);