Compute cube LOD from 3D cube intersection distance.

We used to compute the cube map LOD by projecting all sampling rays onto the
same face (arbitrarily chosen). This changes it to intersect the rays with the
cube and use the 3D distance between them.

Change-Id: If90b29cb4c13e6128ce6642eb28bb92602e783df
Reviewed-on: https://swiftshader-review.googlesource.com/5160
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
Tested-by: Nicolas Capens <capn@google.com>
diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index a6f0715..1dc78bf 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -89,12 +89,13 @@
 		else
 		{
 			Int face[4];
-			Float4 lodU;
-			Float4 lodV;
+			Float4 lodX;
+			Float4 lodY;
+			Float4 lodZ;
 
 			if(state.textureType == TEXTURE_CUBE)
 			{
-				cubeFace(face, uuuu, vvvv, lodU, lodV, u, v, w);
+				cubeFace(face, uuuu, vvvv, lodX, lodY, lodZ, u, v, w);
 			}
 
 			Float lod;
@@ -110,7 +111,7 @@
 				}
 				else
 				{
-					computeLod(texture, lod, anisotropy, uDelta, vDelta, lodU, lodV, q.x, dsx, dsy, method);
+					computeLodCube(texture, lod, lodX, lodY, lodZ, q.x, dsx, dsy, method);
 				}
 			}
 			else
@@ -118,12 +119,6 @@
 				computeLod3D(texture, lod, uuuu, vvvv, wwww, q.x, dsx, dsy, method);
 			}
 
-			if(state.textureType == TEXTURE_CUBE)
-			{
-				uuuu += Float4(0.5f);
-				vvvv += Float4(0.5f);
-			}
-
 			if(!hasFloatTexture())
 			{
 				sampleFilter(texture, c, uuuu, vvvv, wwww, lod, anisotropy, uDelta, vDelta, face, method);
@@ -320,12 +315,13 @@
 				Float4 wwww = w;
 
 				Int face[4];
-				Float4 lodU;
-				Float4 lodV;
+				Float4 lodX;
+				Float4 lodY;
+				Float4 lodZ;
 
 				if(state.textureType == TEXTURE_CUBE)
 				{
-					cubeFace(face, uuuu, vvvv, lodU, lodV, u, v, w);
+					cubeFace(face, uuuu, vvvv, lodX, lodY, lodZ, u, v, w);
 				}
 
 				Float lod;
@@ -341,7 +337,7 @@
 					}
 					else
 					{
-						computeLod(texture, lod, anisotropy, uDelta, vDelta, lodU, lodV, q.x, dsx, dsy, method);
+						computeLodCube(texture, lod, lodX, lodY, lodZ, q.x, dsx, dsy, method);
 					}
 				}
 				else
@@ -349,12 +345,6 @@
 					computeLod3D(texture, lod, uuuu, vvvv, wwww, q.x, dsx, dsy, method);
 				}
 
-				if(state.textureType == TEXTURE_CUBE)
-				{
-					uuuu += Float4(0.5f);
-					vvvv += Float4(0.5f);
-				}
-
 				sampleFloatFilter(texture, c, uuuu, vvvv, wwww, lod, anisotropy, uDelta, vDelta, face, method);
 			}
 			else
@@ -1451,11 +1441,64 @@
 			{
 				lod += lodBias;
 			}
+		}
+		else
+		{
+			lod = lodBias + *Pointer<Float>(texture + OFFSET(Texture,LOD));
+		}
 
-			// FIXME: Hack to satisfy WHQL
-			if(state.textureType == TEXTURE_CUBE)
+		lod = Max(lod, 0.0f);
+		lod = Min(lod, Float(MIPMAP_LEVELS - 2));   // Trilinear accesses lod+1
+	}
+
+	void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &s, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerMethod method)
+	{
+		if(method != Lod)
+		{
+			if(method != Grad)
 			{
-				lod += Float(-0.15f);
+				Float4 dudxy = u.ywyw - u;
+				Float4 dvdxy = v.ywyw - v;
+				Float4 dsdxy = s.ywyw - s;
+
+				// Scale by texture dimensions and LOD
+				dudxy *= *Pointer<Float4>(texture + OFFSET(Texture,widthLOD));
+				dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture,widthLOD));
+				dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture,widthLOD));
+
+				dudxy *= dudxy;
+				dvdxy *= dvdxy;
+				dsdxy *= dsdxy;
+
+				dudxy += dvdxy;
+				dudxy += dsdxy;
+
+				lod = Max(Float(dudxy.x), Float(dudxy.y));   // FIXME: Max(dudxy.x, dudxy.y);
+			}
+			else
+			{
+				Float4 dudxy = Float4(dsx.x.xx, dsy.x.xx);
+				Float4 dvdxy = Float4(dsx.y.xx, dsy.y.xx);
+
+				Float4 duvdxy = Float4(dudxy.xz, dvdxy.xz);
+
+				// Scale by texture dimensions and LOD
+				Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture,widthLOD));
+
+				Float4 dUV2dxy = dUVdxy * dUVdxy;
+				Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw;
+
+				lod = Max(Float(dUV2.x), Float(dUV2.y));   // Square length of major axis
+			}
+
+			// log2(sqrt(lod))
+			lod = Float(As<Int>(lod));
+			lod -= Float(0x3F800000);
+			lod *= As<Float>(Int(0x33800000));
+
+			if(method == Bias)
+			{
+				lod += lodBias;
 			}
 		}
 		else
@@ -1535,22 +1578,22 @@
 		}
 	}
 
-	void SamplerCore::cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &lodU, Float4 &lodV, Float4 &x, Float4 &y, Float4 &z)
+	void SamplerCore::cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &lodX, Float4 &lodY, Float4 &lodZ, Float4 &x, Float4 &y, Float4 &z)
 	{
-		Int4 xp = CmpNLE(x, Float4(0.0f));   // x > 0
-		Int4 yp = CmpNLE(y, Float4(0.0f));   // y > 0
-		Int4 zp = CmpNLE(z, Float4(0.0f));   // z > 0
+		Int4 xn = CmpLT(x, Float4(0.0f));   // x < 0
+		Int4 yn = CmpLT(y, Float4(0.0f));   // y < 0
+		Int4 zn = CmpLT(z, Float4(0.0f));   // z < 0
 
 		Float4 absX = Abs(x);
 		Float4 absY = Abs(y);
 		Float4 absZ = Abs(z);
+
 		Int4 xy = CmpNLE(absX, absY);   // abs(x) > abs(y)
 		Int4 yz = CmpNLE(absY, absZ);   // abs(y) > abs(z)
 		Int4 zx = CmpNLE(absZ, absX);   // abs(z) > abs(x)
-
-		Int4 xyz = ~zx & xy;   // abs(x) > abs(y) && abs(x) > abs(z)
-		Int4 yzx = ~xy & yz;   // abs(y) > abs(z) && abs(y) > abs(x)
-		Int4 zxy = ~yz & zx;   // abs(z) > abs(x) && abs(z) > abs(y)
+		Int4 xMajor = xy & ~zx;   // abs(x) > abs(y) && abs(x) > abs(z)
+		Int4 yMajor = yz & ~xy;   // abs(y) > abs(z) && abs(y) > abs(x)
+		Int4 zMajor = zx & ~yz;   // abs(z) > abs(x) && abs(z) > abs(y)
 
 		// FACE_POSITIVE_X = 000b
 		// FACE_NEGATIVE_X = 001b
@@ -1559,10 +1602,10 @@
 		// FACE_POSITIVE_Z = 100b
 		// FACE_NEGATIVE_Z = 101b
 
-		Int yAxis = SignMask(yzx);
-		Int zAxis = SignMask(zxy);
+		Int yAxis = SignMask(yMajor);
+		Int zAxis = SignMask(zMajor);
 
-		Int4 n = (~xp & xyz) | (~yp & yzx) | (~zp & zxy);
+		Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000);
 		Int negative = SignMask(n);
 
 		face[0] = *Pointer<Int>(constants + OFFSET(Constants,transposeBit0) + negative * 4);
@@ -1573,40 +1616,21 @@
 		face[3] = (face[0] >> 12) & 0x7;
 		face[0] &= 0x7;
 
-		// U = xyz * -z + ~xyz * (yzx * ~yp * -x + (yzx * ~yp) * x)
-		U = As<Float4>((xyz & As<Int4>(-z)) | (~xyz & (((yzx & ~yp) & Int4(0x80000000, 0x80000000, 0x80000000, 0x80000000)) ^ As<Int4>(x))));
+		Float4 M = Max(Max(absX, absY), absZ);
 
-		// V = yzx * z + ~yzx * (~neg * -y + neg * y)
-		V = As<Float4>((~yzx & ((~n & Int4(0x80000000, 0x80000000, 0x80000000, 0x80000000)) ^ As<Int4>(y))) | (yzx & As<Int4>(z)));
+		// U = xMajor ? (neg ^ -z) : (zMajor & neg) ^ x)
+		U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));
 
-		// M = xyz * x + yzx * y + zxy * z
-		Float4 M = As<Float4>((xyz & As<Int4>(x)) | (yzx & As<Int4>(y)) | (zxy & As<Int4>(z)));
+		// V = !yMajor ? -y : (n ^ z)
+		V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
 
-		M = reciprocal(M);
-		U *= M * Float4(0.5f);
-		V *= M * Float4(0.5f);
+		M = reciprocal(M) * Float4(0.5f);
+		U = U * M + Float4(0.5f);
+		V = V * M + Float4(0.5f);
 
-		// Project coordinates onto one face for consistent LOD calculation
-		{
-			yp = Swizzle(yp, 0);
-			n = Swizzle(n, 0);
-			xyz = Swizzle(xyz, 0);
-			yzx = Swizzle(yzx, 0);
-			zxy = Swizzle(zxy, 0);
-
-			// U = xyz * -z + ~xyz * (yzx * ~yp * -x + (yzx * ~yp) * x)
-			lodU = As<Float4>((xyz & As<Int4>(-z)) | (~xyz & (((yzx & ~yp) & Int4(0x80000000, 0x80000000, 0x80000000, 0x80000000)) ^ As<Int4>(x))));
-
-			// V = yzx * z + ~yzx * (~neg * -y + neg * y)
-			lodV = As<Float4>((~yzx & ((~n & Int4(0x80000000, 0x80000000, 0x80000000, 0x80000000)) ^ As<Int4>(y))) | (yzx & As<Int4>(z)));
-
-			// M = xyz * x + yzx * y + zxy * z
-			Float4 M = As<Float4>((xyz & As<Int4>(x)) | (yzx & As<Int4>(y)) | (zxy & As<Int4>(z)));
-
-			M = Rcp_pp(M);
-			lodU *= M * Float4(0.5f);
-			lodV *= M * Float4(0.5f);
-		}
+		lodX = x * M;
+		lodY = y * M;
+		lodZ = z * M;
 	}
 
 	void SamplerCore::computeIndices(Int index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Pointer<Byte> &mipmap)