Device: Don't use a global for the cluster count.

Pass the cluster count to the QuadRasterizer as a routine argument instead of having it call Renderer::getClusterCount(), a rather nasty back-dependency on the renderer.

This allows the cluster count to be adjusted per-draw without requiring synchronization around the count.

The primary goal of this change is to allow the number of clusters to be scaled based on the complexity of the draw.

Bug: b/139142453
Change-Id: I0379e16568de402f186ee2dd1e8b2346bed30efd
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35571
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/PixelProcessor.hpp b/src/Device/PixelProcessor.hpp
index 17cda85..460afa4 100644
--- a/src/Device/PixelProcessor.hpp
+++ b/src/Device/PixelProcessor.hpp
@@ -119,7 +119,7 @@
 		};
 
 	public:
-		typedef void (*RoutinePointer)(const Primitive *primitive, int count, int thread, DrawData *draw);
+		typedef void (*RoutinePointer)(const Primitive *primitive, int count, int cluster, int clusterCount, DrawData *draw);
 
 		PixelProcessor();
 
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index ce4dbf1..c756d6a 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp
@@ -34,7 +34,6 @@
 	{
 		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
 		occlusion = 0;
-		int clusterCount = Renderer::getClusterCount();
 
 		Do
 		{
@@ -72,6 +71,8 @@
 		Pointer<Byte> zBuffer;
 		Pointer<Byte> sBuffer;
 
+		Int clusterCountLog2 = 31 - Ctlz(UInt(clusterCount), false);
+
 		for(int index = 0; index < RENDERTARGETS; index++)
 		{
 			if(state.colorWriteActive(index))
@@ -192,24 +193,22 @@
 				}
 			}
 
-			int clusterCount = Renderer::getClusterCount();
-
 			for(int index = 0; index < RENDERTARGETS; index++)
 			{
 				if(state.colorWriteActive(index))
 				{
-					cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + log2i(clusterCount));   // FIXME: Precompute
+					cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + clusterCountLog2);   // FIXME: Precompute
 				}
 			}
 
 			if(state.depthTestActive)
 			{
-				zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + log2i(clusterCount));   // FIXME: Precompute
+				zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
 			}
 
 			if(state.stencilActive)
 			{
-				sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + log2i(clusterCount));   // FIXME: Precompute
+				sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
 			}
 
 			y += 2 * clusterCount;
diff --git a/src/Device/Rasterizer.hpp b/src/Device/Rasterizer.hpp
index c268e18..a206508 100644
--- a/src/Device/Rasterizer.hpp
+++ b/src/Device/Rasterizer.hpp
@@ -21,16 +21,17 @@
 
 namespace sw
 {
-	class Rasterizer : public Function<Void(Pointer<Byte>, Int, Int, Pointer<Byte>)>
+	class Rasterizer : public Function<Void(Pointer<Byte>, Int, Int, Int, Pointer<Byte>)>
 	{
 	public:
-		Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), data(Arg<3>()) {}
+		Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), clusterCount(Arg<3>()), data(Arg<4>()) {}
 		virtual ~Rasterizer() {}
 
 	protected:
 		Pointer<Byte> primitive;
 		Int count;
 		Int cluster;
+		Int clusterCount;
 		Pointer<Byte> data;
 	};
 }
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 7030897..628d549 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -691,7 +691,7 @@
 					DrawData *data = draw->data;
 					PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
 
-					pixelRoutine(primitive, visible, cluster, data);
+					pixelRoutine(primitive, visible, cluster, clusterCount, data);
 				}
 
 				finishRendering(task[threadIndex]);