Fix image sampling with divergent LOD

Currently our SamplerCore code performs sampling for four SIMD lanes
simultaneously. With implicit LOD calculation for fragment shaders, all
four pixels in a quad share the same LOD and thus sample from the same
mipmap level. But for the vertex shader the LOD is always explicitly
provided, and can vary significantly between completely unrelated
vertices. Previously we only used the LOD of the first lane in each
group of four, so the other three lanes could sample from the wrong
mipmap level.

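As a workaround, process explicit-LOD and explicit-gradient sampling
instructions lane by lane, sampling once per SIMD lane with that lane's
own LOD or gradients. Divergent LODs for Bias and Fetch are not handled
yet.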

Bug: b/133868964
Tests: dEQP-VK.glsl.texture_functions.*
Change-Id: If4e0d3c04d29529300111d73801124080cb4b544
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/32488
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Device/Sampler.hpp b/src/Device/Sampler.hpp
index 73afd67..c5e8f2a 100644
--- a/src/Device/Sampler.hpp
+++ b/src/Device/Sampler.hpp
@@ -65,7 +65,7 @@
 		TEXTURE_2D,
 		TEXTURE_3D,
 		TEXTURE_CUBE,
-		TEXTURE_1D_ARRAY,   // Treated as 2D texture with second coordinate 0.
+		TEXTURE_1D_ARRAY,   // Treated as 2D texture with second coordinate 0. TODO(b/134669567)
 		TEXTURE_2D_ARRAY,
 		TEXTURE_CUBE_ARRAY,
 
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index db62f62..864a640 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp
@@ -85,24 +85,6 @@
 
 		state.shaderID = context->vertexShader->getSerialID();
 
-		switch(context->topology)
-		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-			state.verticesPerPrimitive = 1;
-			break;
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-			state.verticesPerPrimitive = 2;
-			break;
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-			state.verticesPerPrimitive = 3;
-			break;
-		default:
-			UNIMPLEMENTED("topology %d", int(context->topology));
-		}
-
 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
 		{
 			state.input[i].type = context->input[i].type;
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index b5d7353..025c165 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp
@@ -50,9 +50,6 @@
 
 			uint64_t shaderID;
 
-			bool textureSampling           : 1;   // TODO: Eliminate by querying shader.
-			unsigned char verticesPerPrimitive                : 2; // 1 (points), 2 (lines) or 3 (triangles)
-
 			struct Input
 			{
 				operator bool() const   // Returns true if stream contains data
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 9b89b34..6e9b18f 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -53,7 +53,7 @@
 	{
 	}
 
-	Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &lodOrBias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+	Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4f &offset, SamplerFunction function)
 	{
 		Vector4f c;
 
@@ -103,17 +103,17 @@
 
 			if(function == Bias)
 			{
-				lod += lodOrBias.x;
+				lod += lodOrBias;
 			}
 		}
 		else if(function == Lod)
 		{
-			lod = lodOrBias.x;
+			lod = lodOrBias;
 		}
 		else if(function == Fetch)
 		{
 			// TODO: Eliminate int-float-int conversion.
-			lod = Float(As<Int>(Float(lodOrBias.x)));
+			lod = Float(As<Int>(lodOrBias));
 		}
 		else if(function == Base || function == Gather)
 		{
@@ -1031,7 +1031,7 @@
 		return lod;
 	}
 
-	void SamplerCore::computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Vector4f &dsx, Vector4f &dsy, SamplerFunction function)
+	void SamplerCore::computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float4 &dsx, Float4 &dsy, SamplerFunction function)
 	{
 		Float4 duvdxy;
 
@@ -1041,8 +1041,8 @@
 		}
 		else
 		{
-			Float4 dudxy = Float4(dsx.x.xx, dsy.x.xx);
-			Float4 dvdxy = Float4(dsx.y.xx, dsy.y.xx);
+			Float4 dudxy = Float4(dsx.xx, dsy.xx);
+			Float4 dvdxy = Float4(dsx.yy, dsy.yy);
 
 			duvdxy = Float4(dudxy.xz, dvdxy.xz);
 		}
@@ -1077,7 +1077,7 @@
 		lod = log2sqrt(lod);   // log2(sqrt(lod))
 	}
 
-	void SamplerCore::computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Vector4f &dsx, Vector4f &dsy, Float4 &M, SamplerFunction function)
+	void SamplerCore::computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M, SamplerFunction function)
 	{
 		Float4 dudxy, dvdxy, dsdxy;
 
@@ -1093,9 +1093,9 @@
 		}
 		else
 		{
-			dudxy = Float4(dsx.x.xx, dsy.x.xx);
-			dvdxy = Float4(dsx.y.xx, dsy.y.xx);
-			dsdxy = Float4(dsx.z.xx, dsy.z.xx);
+			dudxy = Float4(dsx.xx, dsy.xx);
+			dvdxy = Float4(dsx.yy, dsy.yy);
+			dsdxy = Float4(dsx.zz, dsy.zz);
 
 			dudxy = Abs(dudxy * Float4(M.x));
 			dvdxy = Abs(dvdxy * Float4(M.x));
@@ -1118,7 +1118,7 @@
 		lod = log2(lod);
 	}
 
-	void SamplerCore::computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Vector4f &dsx, Vector4f &dsy, SamplerFunction function)
+	void SamplerCore::computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float4 &dsx, Float4 &dsy, SamplerFunction function)
 	{
 		Float4 dudxy, dvdxy, dsdxy;
 
@@ -1130,9 +1130,9 @@
 		}
 		else
 		{
-			dudxy = Float4(dsx.x.xx, dsy.x.xx);
-			dvdxy = Float4(dsx.y.xx, dsy.y.xx);
-			dsdxy = Float4(dsx.z.xx, dsy.z.xx);
+			dudxy = Float4(dsx.xx, dsy.xx);
+			dvdxy = Float4(dsx.yy, dsy.yy);
+			dsdxy = Float4(dsx.zz, dsy.zz);
 		}
 
 		// Scale by texture dimensions.
@@ -2344,7 +2344,7 @@
 	{
 		return (state.textureType == TEXTURE_3D) ||
 		       (state.textureType == TEXTURE_2D_ARRAY) ||
-		       (state.textureType == TEXTURE_1D_ARRAY);  // Treated as 2D texture with second coordinate 0.
+		       (state.textureType == TEXTURE_1D_ARRAY);  // Treated as 2D texture with second coordinate 0. TODO(b/134669567)
 	}
 
 	bool SamplerCore::has16bitTextureFormat() const
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
index 17aaab5..a516a5c 100644
--- a/src/Pipeline/SamplerCore.hpp
+++ b/src/Pipeline/SamplerCore.hpp
@@ -62,7 +62,7 @@
 	public:
 		SamplerCore(Pointer<Byte> &constants, const Sampler &state);
 
-		Vector4f sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &lodOrBias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
+		Vector4f sampleTexture(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4f &offset, SamplerFunction function);
 
 	private:
 		Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod);
@@ -78,9 +78,9 @@
 		Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function);
 		Float log2sqrt(Float lod);
 		Float log2(Float lod);
-		void computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
-		void computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Vector4f &dsx, Vector4f &dsy, Float4 &M, SamplerFunction function);
-		void computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
+		void computeLod(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Float4 &dsx, Float4 &dsy, SamplerFunction function);
+		void computeLodCube(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M, SamplerFunction function);
+		void computeLod3D(Pointer<Byte> &texture, Pointer<Byte> &sampler, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, SamplerFunction function);
 		Int4 cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M);
 		Short4 applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode);
 		void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function);
diff --git a/src/Pipeline/SpirvShaderSampling.cpp b/src/Pipeline/SpirvShaderSampling.cpp
index 72f3d77..c80228b 100644
--- a/src/Pipeline/SpirvShaderSampling.cpp
+++ b/src/Pipeline/SpirvShaderSampling.cpp
@@ -145,7 +145,7 @@
 			i++;
 		}
 
-		// TODO(b/129523279): Currently 1D textures are treated as 2D by setting the second coordinate to 0.
+		// TODO(b/134669567): Currently 1D textures are treated as 2D by setting the second coordinate to 0.
 		// Implement optimized 1D sampling.
 		if(samplerState.textureType == TEXTURE_1D)
 		{
@@ -184,13 +184,52 @@
 		}
 
 		SamplerCore s(constants, samplerState);
-		Vector4f sample = s.sampleTexture(texture, sampler, uvw[0], uvw[1], uvw[2], q, lodOrBias, dsx, dsy, offset, samplerFunction);
 
-		Pointer<SIMD::Float> rgba = out;
-		rgba[0] = sample.x;
-		rgba[1] = sample.y;
-		rgba[2] = sample.z;
-		rgba[3] = sample.w;
+		// For explicit-lod instructions the LOD can be different per SIMD lane. SamplerCore currently assumes
+		// a single LOD per four elements, so we sample the image again for each LOD separately.
+		if(samplerFunction.method == Lod || samplerFunction.method == Grad)  // TODO(b/133868964): Also handle divergent Bias and Fetch with Lod.
+		{
+			auto lod = Pointer<Float>(&lodOrBias);
+
+			For(Int i = 0, i < SIMD::Width, i++)
+			{
+				SIMD::Float dPdx;
+				SIMD::Float dPdy;
+
+				dPdx.x = Pointer<Float>(&dsx.x)[i];
+				dPdx.y = Pointer<Float>(&dsx.y)[i];
+				dPdx.z = Pointer<Float>(&dsx.z)[i];
+
+				dPdy.x = Pointer<Float>(&dsy.x)[i];
+				dPdy.y = Pointer<Float>(&dsy.y)[i];
+				dPdy.z = Pointer<Float>(&dsy.z)[i];
+
+				// 1D textures are treated as 2D texture with second coordinate 0, so we also need to zero out the second grad component. TODO(b/134669567)
+				if(samplerState.textureType == TEXTURE_1D || samplerState.textureType == TEXTURE_1D_ARRAY)
+				{
+					dPdx.y = Float(0.0f);
+					dPdy.y = Float(0.0f);
+				}
+
+				Vector4f sample = s.sampleTexture(texture, sampler, uvw[0], uvw[1], uvw[2], q, lod[i], dPdx, dPdy, offset, samplerFunction);
+
+				Pointer<Float> rgba = out;
+				rgba[0 * SIMD::Width + i] = Pointer<Float>(&sample.x)[i];
+				rgba[1 * SIMD::Width + i] = Pointer<Float>(&sample.y)[i];
+				rgba[2 * SIMD::Width + i] = Pointer<Float>(&sample.z)[i];
+				rgba[3 * SIMD::Width + i] = Pointer<Float>(&sample.w)[i];
+			}
+		}
+		else
+		{
+			Vector4f sample = s.sampleTexture(texture, sampler, uvw[0], uvw[1], uvw[2], q, lodOrBias.x, (dsx.x), (dsy.x), offset, samplerFunction);
+
+			Pointer<SIMD::Float> rgba = out;
+			rgba[0] = sample.x;
+			rgba[1] = sample.y;
+			rgba[2] = sample.z;
+			rgba[3] = sample.w;
+		}
 	}
 
 	return (ImageSampler*)function("sampler")->getEntry();
@@ -292,7 +331,7 @@
 		}
 		break;
 
-	case VK_IMAGE_VIEW_TYPE_1D:  // Treated as 2D texture with second coordinate 0.
+	case VK_IMAGE_VIEW_TYPE_1D:  // Treated as 2D texture with second coordinate 0. TODO(b/134669567)
 		if(coordinateIndex == 1)
 		{
 			return ADDRESSING_WRAP;
@@ -310,7 +349,7 @@
 		}
 		break;
 
-	case VK_IMAGE_VIEW_TYPE_1D_ARRAY:  // Treated as 2D texture with second coordinate 0.
+	case VK_IMAGE_VIEW_TYPE_1D_ARRAY:  // Treated as 2D texture with second coordinate 0. TODO(b/134669567)
 		if(coordinateIndex == 1)
 		{
 			return ADDRESSING_WRAP;
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 75ac6be..baabb2d 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -41,8 +41,6 @@
 
 	void VertexRoutine::generate()
 	{
-		const bool textureSampling = state.textureSampling;
-
 		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
 		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
 		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
@@ -55,7 +53,7 @@
 		{
 			UInt index = *Pointer<UInt>(batch);
 			UInt tagIndex = index & 0x0000003C;
-			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
+			UInt indexQ = index & 0xFFFFFFFC;
 
 			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
 			{
@@ -139,14 +137,12 @@
 
 	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
 	{
-		const bool textureSampling = state.textureSampling;
-
 		Vector4f v;
 
 		Pointer<Byte> source0 = buffer + index * stride;
-		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);
-		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
-		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
+		Pointer<Byte> source1 = source0 + stride;
+		Pointer<Byte> source2 = source1 + stride;
+		Pointer<Byte> source3 = source2 + stride;
 
 		bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;