Vulkan: Optimize shaderClipDistance and shaderCullDistance

Only process clip/cull distances if they're actually output by the vertex shader.

There's still the overhead of storing these distances in each vertex / primitive, but the structure sizes should be optimized as part of a larger set of changes.

Bug: b/139207336
Tests: dEQP-VK.clipping.*
Change-Id: I8f04b1c3ea823bb1a8cf62f18c987e01cd0c979a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35032
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Device/PixelProcessor.cpp b/src/Device/PixelProcessor.cpp
index e38991f..26731a2 100644
--- a/src/Device/PixelProcessor.cpp
+++ b/src/Device/PixelProcessor.cpp
@@ -154,6 +154,9 @@
 	{
 		State state;
 
+		state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
+		state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
+
 		if(context->pixelShader)
 		{
 			state.shaderID = context->pixelShader->getSerialID();
diff --git a/src/Device/PixelProcessor.hpp b/src/Device/PixelProcessor.hpp
index d6d8736..f657a59 100644
--- a/src/Device/PixelProcessor.hpp
+++ b/src/Device/PixelProcessor.hpp
@@ -63,6 +63,9 @@
 
 			uint64_t shaderID;
 
+			unsigned int numClipDistances;
+			unsigned int numCullDistances;
+
 			VkCompareOp depthCompareMode;
 			bool depthWriteEnable;
 
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index ca4dc2d..a3494d8 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp
@@ -158,13 +158,13 @@
 						}
 					}
 
-					for (int i = 0; i < MAX_CLIP_DISTANCES; i++)
+					for (unsigned int i = 0; i < state.numClipDistances; i++)
 					{
 						DclipDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].C), 16) +
 									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].B), 16);
 					}
 
-					for (int i = 0; i < MAX_CULL_DISTANCES; i++)
+					for (unsigned int i = 0; i < state.numCullDistances; i++)
 					{
 						DcullDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].C), 16) +
 									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].B), 16);
diff --git a/src/Device/SetupProcessor.cpp b/src/Device/SetupProcessor.cpp
index 61634af..69371ab 100644
--- a/src/Device/SetupProcessor.cpp
+++ b/src/Device/SetupProcessor.cpp
@@ -81,6 +81,9 @@
 		state.multiSample = context->sampleCount;
 		state.rasterizerDiscard = context->rasterizerDiscard;
 
+		state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
+		state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
+
 		if (context->pixelShader)
 		{
 			for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
diff --git a/src/Device/SetupProcessor.hpp b/src/Device/SetupProcessor.hpp
index 335a6b2..683c93c 100644
--- a/src/Device/SetupProcessor.hpp
+++ b/src/Device/SetupProcessor.hpp
@@ -51,6 +51,8 @@
 			VkCullModeFlags cullMode       : BITS(VK_CULL_MODE_FLAG_BITS_MAX_ENUM);
 			unsigned int multiSample       : 3;   // 1, 2 or 4
 			bool rasterizerDiscard         : 1;
+			unsigned int numClipDistances  : 4; // [0 - 8]
+			unsigned int numCullDistances  : 4; // [0 - 8]
 
 			SpirvShader::InterfaceComponent gradient[MAX_INTERFACE_COMPONENTS];
 		};
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index eb60628..999afe8 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -163,7 +163,7 @@
 
 				setBuiltins(x, y, z, w, cMask);
 
-				for (uint32_t i = 0; i < MAX_CLIP_DISTANCES; i++)
+				for (uint32_t i = 0; i < state.numClipDistances; i++)
 				{
 					auto distance = interpolate(xxxx, DclipDistance[i], rhw,
 												primitive + OFFSET(Primitive, clipDistance[i]),
@@ -197,12 +197,15 @@
 					auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
 					if(it != spirvShader->inputBuiltins.end())
 					{
-						for (uint32_t i = 0; i < it->second.SizeInComponents; i++)
+						for (uint32_t i = 0; i < state.numCullDistances; i++)
 						{
-							routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
-									interpolate(xxxx, DcullDistance[i], rhw,
-												primitive + OFFSET(Primitive, cullDistance[i]),
-												false, true, false);
+							if (i < it->second.SizeInComponents)
+							{
+								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
+										interpolate(xxxx, DcullDistance[i], rhw,
+													primitive + OFFSET(Primitive, cullDistance[i]),
+													false, true, false);
+							}
 						}
 					}
 				}
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index 273f02c..dbe9feb 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -456,7 +456,7 @@
 				}
 			}
 
-			for (int i = 0; i < MAX_CLIP_DISTANCES; i++)
+			for (unsigned int i = 0; i < state.numClipDistances; i++)
 			{
 				setupGradient(primitive, tri, w012, M, v0, v1, v2,
 						OFFSET(Vertex, clipDistance[i]),
@@ -464,7 +464,7 @@
 						false, true);
 			}
 
-			for (int i = 0; i < MAX_CULL_DISTANCES; i++)
+			for (unsigned int i = 0; i < state.numCullDistances; i++)
 			{
 				setupGradient(primitive, tri, w012, M, v0, v1, v2,
 						OFFSET(Vertex, cullDistance[i]),
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index a99fc6f..ad0f740 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -508,6 +508,36 @@
 			return capabilities;
 		}
 
+		// getNumOutputClipDistances() returns SizeInComponents of the ClipDistance entry in
+		// outputBuiltins, or 0 if the ClipDistance capability is unused or no entry exists.
+		unsigned int getNumOutputClipDistances() const
+		{
+			if (getUsedCapabilities().ClipDistance)
+			{
+				auto it = outputBuiltins.find(spv::BuiltInClipDistance);
+				if(it != outputBuiltins.end())
+				{
+					return it->second.SizeInComponents;
+				}
+			}
+			return 0;
+		}
+
+		// getNumOutputCullDistances() returns SizeInComponents of the CullDistance entry in
+		// outputBuiltins, or 0 if the CullDistance capability is unused or no entry exists.
+		unsigned int getNumOutputCullDistances() const
+		{
+			if (getUsedCapabilities().CullDistance)
+			{
+				auto it = outputBuiltins.find(spv::BuiltInCullDistance);
+				if(it != outputBuiltins.end())
+				{
+					return it->second.SizeInComponents;
+				}
+			}
+			return 0;
+		}
+
 		enum AttribType : unsigned char
 		{
 			ATTRIBTYPE_FLOAT,
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 317c4c6..2fccb08 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -148,18 +148,15 @@
 	{
 		cullMask = Int(15);
 
-		if (spirvShader->getUsedCapabilities().CullDistance)
+		auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
+		if (it != spirvShader->outputBuiltins.end())
 		{
-			auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
-			if (it != spirvShader->outputBuiltins.end())
+			auto count = spirvShader->getNumOutputCullDistances();
+			for (uint32_t i = 0; i < count; i++)
 			{
-				auto &var = routine.getVariable(it->second.Id);
-				for (uint32_t i = 0; i < it->second.SizeInComponents; i++)
-				{
-					auto const &distance = var[it->second.FirstComponent + i];
-					auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
-					cullMask &= mask;
-				}
+				auto const &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
+				auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
+				cullMask &= mask;
 			}
 		}
 	}
@@ -594,55 +591,33 @@
 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0);
 		}
 
-		uint32_t clipIndex = 0;
-		if (spirvShader->getUsedCapabilities().ClipDistance)
+		it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
+		if(it != spirvShader->outputBuiltins.end())
 		{
-			it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
-			if(it != spirvShader->outputBuiltins.end())
+			auto count = spirvShader->getNumOutputClipDistances();
+			for(unsigned int i = 0; i < count; i++)
 			{
-				ASSERT(it->second.SizeInComponents <= MAX_CLIP_DISTANCES);
-				for(; clipIndex < it->second.SizeInComponents; clipIndex++)
-				{
-					auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + clipIndex];
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 3);
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 2);
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 1);
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 0);
-				}
+				auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 3);
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 2);
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 1);
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipDistance[i])) = Extract(dist, 0);
 			}
 		}
-		for(; clipIndex < MAX_CLIP_DISTANCES; clipIndex++)
-		{
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
-		}
 
-		uint32_t cullIndex = 0;
-		if (spirvShader->getUsedCapabilities().CullDistance)
+		it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
+		if(it != spirvShader->outputBuiltins.end())
 		{
-			it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
-			if(it != spirvShader->outputBuiltins.end())
+			auto count = spirvShader->getNumOutputCullDistances();
+			for(unsigned int i = 0; i < count; i++)
 			{
-				ASSERT(it->second.SizeInComponents <= MAX_CULL_DISTANCES);
-				for(; cullIndex < it->second.SizeInComponents; cullIndex++)
-				{
-					auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + cullIndex];
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 3);
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 2);
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 1);
-					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 0);
-				}
+				auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 3);
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 2);
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 1);
+				*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullDistance[i])) = Extract(dist, 0);
 			}
 		}
-		for(; cullIndex < MAX_CULL_DISTANCES; cullIndex++)
-		{
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
-			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
-		}
 
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
@@ -700,11 +675,11 @@
 				*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
 			}
 		}
-		for(int i = 0; i < MAX_CLIP_DISTANCES; i++)
+		for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
 		{
 			*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
 		}
-		for(int i = 0; i < MAX_CULL_DISTANCES; i++)
+		for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
 		{
 			*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
 		}