Support 3-component integer formats natively.

Change-Id: Id48bc7a232c50b753da64cb914e75b5d590ae47d
Reviewed-on: https://swiftshader-review.googlesource.com/14369
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Renderer/Surface.cpp b/src/Renderer/Surface.cpp
index 6bcc657..3147177 100644
--- a/src/Renderer/Surface.cpp
+++ b/src/Renderer/Surface.cpp
@@ -3571,9 +3571,11 @@
 		case FORMAT_R32UI:
 			return FORMAT_R32UI;
 		case FORMAT_X16B16G16R16I:
+			return FORMAT_X16B16G16R16I;
 		case FORMAT_A16B16G16R16I:
 			return FORMAT_A16B16G16R16I;
 		case FORMAT_X16B16G16R16UI:
+			return FORMAT_X16B16G16R16UI;
 		case FORMAT_A16B16G16R16UI:
 			return FORMAT_A16B16G16R16UI;
 		case FORMAT_A2R10G10B10:
@@ -3581,9 +3583,11 @@
 		case FORMAT_A16B16G16R16:
 			return FORMAT_A16B16G16R16;
 		case FORMAT_X32B32G32R32I:
+			return FORMAT_X32B32G32R32I;
 		case FORMAT_A32B32G32R32I:
 			return FORMAT_A32B32G32R32I;
 		case FORMAT_X32B32G32R32UI:
+			return FORMAT_X32B32G32R32UI;
 		case FORMAT_A32B32G32R32UI:
 			return FORMAT_A32B32G32R32UI;
 		case FORMAT_G8R8I:
diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index c51dbf9..0c86815 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -2001,6 +2001,13 @@
 				c.w = Pointer<Short4>(buffer[f3])[index[3]];
 				transpose4x4(c.x, c.y, c.z, c.w);
 				break;
+			case 3:
+				c.x = Pointer<Short4>(buffer[f0])[index[0]];
+				c.y = Pointer<Short4>(buffer[f1])[index[1]];
+				c.z = Pointer<Short4>(buffer[f2])[index[2]];
+				c.w = Pointer<Short4>(buffer[f3])[index[3]];
+				transpose4x3(c.x, c.y, c.z, c.w);
+				break;
 			case 2:
 				c.x = *Pointer<Short4>(buffer[f0] + 4 * index[0]);
 				c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer[f1] + 4 * index[1])));
@@ -2159,13 +2166,11 @@
 				transpose4x4(c.x, c.y, c.z, c.w);
 				break;
 			case 3:
-				ASSERT(state.textureFormat == FORMAT_X32B32G32R32F);
 				c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
 				c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
 				c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
 				c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
 				transpose4x3(c.x, c.y, c.z, c.w);
-				c.w = Float4(1.0f);
 				break;
 			case 2:
 				// FIXME: Optimal shuffling?
diff --git a/src/Shader/ShaderCore.cpp b/src/Shader/ShaderCore.cpp
index ec159fd..5b2c1ae 100644
--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -490,6 +490,18 @@
 		row3 = UnpackHigh(tmp0, tmp1);
 	}
 
+	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)   // Short4 overload: reads all four rows, writes only row0..row2
+	{
+		Int2 tmp0 = UnpackHigh(row0, row1);   // every input row is consumed into a temporary before any row is overwritten below
+		Int2 tmp1 = UnpackHigh(row2, row3);
+		Int2 tmp2 = UnpackLow(row0, row1);
+		Int2 tmp3 = UnpackLow(row2, row3);
+
+		row0 = UnpackLow(tmp2, tmp3);
+		row1 = UnpackHigh(tmp2, tmp3);
+		row2 = UnpackLow(tmp0, tmp1);   // row3 is never assigned here, so it keeps whatever value the caller passed in
+	}
+
 	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
 	{
 		Float4 tmp0 = UnpackLow(row0, row1);
diff --git a/src/Shader/ShaderCore.hpp b/src/Shader/ShaderCore.hpp
index e998bcc..b0ad3a0 100644
--- a/src/Shader/ShaderCore.hpp
+++ b/src/Shader/ShaderCore.hpp
@@ -82,6 +82,7 @@
 	Float4 dot4(const Vector4f &v0, const Vector4f &v1);
 
 	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
 	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);