Add support for configurable subpixel precision

Bug: b/141676114
Change-Id: I47e7d90e14b44533e64d352ecc6440495c0b7d3f
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/36597
Presubmit-Ready: Antonio Maiorano <amaiorano@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index de4e9af..2f137b8 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -330,16 +330,17 @@
 			float N = viewport.minDepth;
 			float F = viewport.maxDepth;
 			float Z = F - N;
+			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
 
 			if(context->isDrawTriangle(false))
 			{
 				N += context->depthBias;
 			}
 
-			data->Wx16 = replicate(W * 16);
-			data->Hx16 = replicate(H * 16);
-			data->X0x16 = replicate(X0 * 16 - 8);
-			data->Y0x16 = replicate(Y0 * 16 - 8);
+			data->WxF = replicate(W * subPixF);
+			data->HxF = replicate(H * subPixF);
+			data->X0xF = replicate(X0 * subPixF - subPixF / 2);
+			data->Y0xF = replicate(Y0 * subPixF - subPixF / 2);
 			data->halfPixelX = replicate(0.5f / W);
 			data->halfPixelY = replicate(0.5f / H);
 			data->viewportHeight = abs(viewport.height);
@@ -799,8 +800,10 @@
 			return false;
 		}
 
-		const float W = data.Wx16[0] * (1.0f / 16.0f);
-		const float H = data.Hx16[0] * (1.0f / 16.0f);
+		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+		const float W = data.WxF[0] * (1.0f / subPixF);
+		const float H = data.HxF[0] * (1.0f / subPixF);
 
 		float dx = W * (P1.x / P1.w - P0.x / P0.w);
 		float dy = H * (P1.y / P1.w - P0.y / P0.w);
@@ -1029,8 +1032,10 @@
 			triangle.v1 = triangle.v0;
 			triangle.v2 = triangle.v0;
 
-			triangle.v1.projected.x += iround(16 * 0.5f * pSize);
-			triangle.v2.projected.y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
+			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+			triangle.v1.projected.x += iround(subPixF * 0.5f * pSize);
+			triangle.v2.projected.y -= iround(subPixF * 0.5f * pSize) * (data.HxF[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
 			return setupRoutine(&primitive, &triangle, &polygon, &data);
 		}
 
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index 9b45f74..11a58ea 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -79,10 +79,10 @@
 		PixelProcessor::Factor factor;
 		unsigned int occlusion[MaxClusterCount];   // Number of pixels passing depth test
 
-		float4 Wx16;
-		float4 Hx16;
-		float4 X0x16;
-		float4 Y0x16;
+		float4 WxF;
+		float4 HxF;
+		float4 X0xF;
+		float4 Y0xF;
 		float4 halfPixelX;
 		float4 halfPixelY;
 		float viewportHeight;
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index 5095cbe..d8595b5 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp
@@ -290,29 +290,18 @@
 			sRGBtoLinear12_16[i] = (unsigned short)(clamp(sw::sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
 		}
 
-		// VK_SAMPLE_COUNT_4_BIT
-		// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#primsrast-multisampling
-		constexpr float sampleLocations4[][2] = {
-			{0.375, 0.125},
-			{0.875, 0.375},
-			{0.125, 0.625},
-			{0.625, 0.875},
-		};
-
-		// Vulkan spec sample positions are relative to 0,0 in top left corner, with Y+ going down.
-		// Convert to our space, with 0,0 in center, and Y+ going up.
 		constexpr float4 X[4] = {
-			sw::replicate(sampleLocations4[0][0] - 0.5f), // -0.125
-			sw::replicate(sampleLocations4[1][0] - 0.5f), // +0.375
-			sw::replicate(sampleLocations4[2][0] - 0.5f), // -0.375
-			sw::replicate(sampleLocations4[3][0] - 0.5f), // +0.125
+			sw::replicate(SampleLocationsX[0]),
+			sw::replicate(SampleLocationsX[1]),
+			sw::replicate(SampleLocationsX[2]),
+			sw::replicate(SampleLocationsX[3]),
 		};
 
 		constexpr float4 Y[4] = {
-			sw::replicate(-(sampleLocations4[0][1] - 0.5f)), // +0.375
-			sw::replicate(-(sampleLocations4[1][1] - 0.5f)), // +0.125
-			sw::replicate(-(sampleLocations4[2][1] - 0.5f)), // -0.125
-			sw::replicate(-(sampleLocations4[3][1] - 0.5f)), // -0.375
+			sw::replicate(SampleLocationsY[0]),
+			sw::replicate(SampleLocationsY[1]),
+			sw::replicate(SampleLocationsY[2]),
+			sw::replicate(SampleLocationsY[3]),
 		};
 
 		for(int q = 0; q < 4; q++)
@@ -332,8 +321,11 @@
 			}
 		}
 
-		const int Xf[4] = {-5, +5, +2, -2};   // Fragment offsets
-		const int Yf[4] = {-2, +2, -5, +5};   // Fragment offsets
+		constexpr auto subPixB = vk::SUBPIXEL_PRECISION_BITS;
+
+		// Reorder sample points for fragment offset computation
+		const int Xf[4] = { toFixedPoint(X[2][0], subPixB), toFixedPoint(X[1][0], subPixB), toFixedPoint(X[3][0], subPixB), toFixedPoint(X[0][0], subPixB) };
+		const int Yf[4] = { toFixedPoint(Y[2][0], subPixB), toFixedPoint(Y[1][0], subPixB), toFixedPoint(Y[3][0], subPixB), toFixedPoint(Y[0][0], subPixB) };
 
 		memcpy(&this->Xf, &Xf, sizeof(Xf));
 		memcpy(&this->Yf, &Yf, sizeof(Yf));
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index 861887c..58c8e10 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp
@@ -16,6 +16,8 @@
 #define sw_Constants_hpp
 
 #include "System/Types.hpp"
+#include "System/Math.hpp"
+#include "Vulkan/VkConfig.h"
 
 namespace sw
 {
@@ -86,6 +88,36 @@
 		float4 X[4];
 		float4 Y[4];
 
+		// VK_SAMPLE_COUNT_4_BIT
+		// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#primsrast-multisampling
+		static constexpr float VkSampleLocations4[][2] = {
+			{0.375, 0.125},
+			{0.875, 0.375},
+			{0.125, 0.625},
+			{0.625, 0.875},
+		};
+
+		// Vulkan spec sample positions are relative to 0,0 in top left corner, with Y+ going down.
+		// Convert to our space, with 0,0 in center, and Y+ going up.
+		static constexpr float SampleLocationsX[4] = {
+			VkSampleLocations4[0][0] - 0.5f,
+			VkSampleLocations4[1][0] - 0.5f,
+			VkSampleLocations4[2][0] - 0.5f,
+			VkSampleLocations4[3][0] - 0.5f,
+		};
+
+		static constexpr float SampleLocationsY[4] = {
+			-(VkSampleLocations4[0][1] - 0.5f),
+			-(VkSampleLocations4[1][1] - 0.5f),
+			-(VkSampleLocations4[2][1] - 0.5f),
+			-(VkSampleLocations4[3][1] - 0.5f),
+		};
+
+		// Compute the yMin and yMax multisample offsets so that they are just
+		// large enough (+/- max range - epsilon) to include sample points
+		static constexpr int yMinMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) - sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
+		static constexpr int yMaxMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) + sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
+
 		dword maxX[16];
 		dword maxY[16];
 		dword maxZ[16];
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index 37cc5d5..7ebd9ef 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -139,8 +139,8 @@
 					Float w = v.w;
 					Float rhw = IfThenElse(w != 0.0f, 1.0f / w, Float(1.0f));
 
-					X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,Wx16)));
-					Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,Hx16)));
+					X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,X0xF)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,WxF)));
+					Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,Y0xF)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,HxF)));
 
 					i++;
 				}
@@ -162,15 +162,19 @@
 			}
 			Until(i >= n)
 
+			constexpr int subPixB = vk::SUBPIXEL_PRECISION_BITS;
+			constexpr int subPixM = vk::SUBPIXEL_PRECISION_MASK;
+			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
 			if(state.multiSample > 1)
 			{
-				yMin = (yMin + 0x0A) >> 4;
-				yMax = (yMax + 0x14) >> 4;
+				yMin = (yMin + Constants::yMinMultiSampleOffset) >> subPixB;
+				yMax = (yMax + Constants::yMaxMultiSampleOffset) >> subPixB;
 			}
 			else
 			{
-				yMin = (yMin + 0x0F) >> 4;
-				yMax = (yMax + 0x0F) >> 4;
+				yMin = (yMin + subPixM) >> subPixB;
+				yMax = (yMax + subPixM) >> subPixB;
 			}
 
 			yMin = Max(yMin, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
@@ -213,7 +217,7 @@
 				{
 					Int xMin = *Pointer<Int>(data + OFFSET(DrawData, scissorX0));
 					Int xMax = *Pointer<Int>(data + OFFSET(DrawData, scissorX1));
-					Short x = Short(Clamp((X[0] + 0xF) >> 4, xMin, xMax));
+					Short x = Short(Clamp((X[0] + subPixM) >> subPixB, xMin, xMax));
 
 					For(Int y = yMin - 1, y < yMax + 1, y++)
 					{
@@ -323,8 +327,8 @@
 				Y2 = Y1 + X0 - X1;
 			}
 
-			Float dx = Float(X0) * (1.0f / 16.0f);
-			Float dy = Float(Y0) * (1.0f / 16.0f);
+			Float dx = Float(X0) * (1.0f / subPixF);
+			Float dy = Float(Y0) * (1.0f / subPixF);
 
 			X1 -= X0;
 			Y1 -= Y0;
@@ -332,11 +336,11 @@
 			X2 -= X0;
 			Y2 -= Y0;
 
-			Float x1 = w1 * (1.0f / 16.0f) * Float(X1);
-			Float y1 = w1 * (1.0f / 16.0f) * Float(Y1);
+			Float x1 = w1 * (1.0f / subPixF) * Float(X1);
+			Float y1 = w1 * (1.0f / subPixF) * Float(Y1);
 
-			Float x2 = w2 * (1.0f / 16.0f) * Float(X2);
-			Float y2 = w2 * (1.0f / 16.0f) * Float(Y2);
+			Float x2 = w2 * (1.0f / subPixF) * Float(X2);
+			Float y2 = w2 * (1.0f / subPixF) * Float(Y2);
 
 			Float a = x1 * y2 - x2 * y1;
 
@@ -403,10 +407,10 @@
 
 				if(!point)
 				{
-					Float x1 = Float(X1) * (1.0f / 16.0f);
-					Float y1 = Float(Y1) * (1.0f / 16.0f);
-					Float x2 = Float(X2) * (1.0f / 16.0f);
-					Float y2 = Float(Y2) * (1.0f / 16.0f);
+					Float x1 = Float(X1) * (1.0f / subPixF);
+					Float y1 = Float(Y1) * (1.0f / subPixF);
+					Float x2 = Float(X2) * (1.0f / subPixF);
+					Float y2 = Float(Y2) * (1.0f / subPixF);
 
 					Float D = *Pointer<Float>(data + OFFSET(DrawData,depthRange)) / (x1 * y2 - x2 * y1);
 
@@ -509,8 +513,11 @@
 			Int Y1 = IfThenElse(swap, Yb, Ya);
 			Int Y2 = IfThenElse(swap, Ya, Yb);
 
-			Int y1 = Max((Y1 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
-			Int y2 = Min((Y2 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
+			constexpr int subPixB = vk::SUBPIXEL_PRECISION_BITS;
+			constexpr int subPixM = vk::SUBPIXEL_PRECISION_MASK;
+
+			Int y1 = Max((Y1 + subPixM) >> subPixB, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
+			Int y2 = Min((Y2 + subPixM) >> subPixB, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
 
 			If(y1 < y2)
 			{
@@ -525,11 +532,11 @@
 				Int DX12 = X2 - X1;
 				Int DY12 = Y2 - Y1;
 
-				Int FDX12 = DX12 << 4;
-				Int FDY12 = DY12 << 4;
+				Int FDX12 = DX12 << subPixB;
+				Int FDY12 = DY12 << subPixB;
 
-				Int X = DX12 * ((y1 << 4) - Y1) + (X1 & 0x0000000F) * DY12;
-				Int x = (X1 >> 4) + X / FDY12;   // Edge
+				Int X = DX12 * ((y1 << subPixB) - Y1) + (X1 & subPixM) * DY12;
+				Int x = (X1 >> subPixB) + X / FDY12;   // Edge
 				Int d = X % FDY12;               // Error-term
 				Int ceil = -d >> 31;             // Ceiling division: remainder <= 0
 				x -= ceil;
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index acc13cd..baa69e4 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -543,8 +543,8 @@
 		Float4 rhw = Float4(1.0f) / w;
 
 		Vector4f proj;
-		proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
-		proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
+		proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,WxF))));
+		proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,HxF))));
 		proj.z = pos.z * rhw;
 		proj.w = rhw;
 
diff --git a/src/System/Math.hpp b/src/System/Math.hpp
index 7bd3890..efef5fd 100644
--- a/src/System/Math.hpp
+++ b/src/System/Math.hpp
@@ -32,37 +32,37 @@
 	#undef max
 
 	template<class T>
-	inline T max(T a, T b)
+	inline constexpr T max(T a, T b)   // Larger of a and b; constexpr so it can be used in constant initializers
 	{
 		return a > b ? a : b;
 	}
 
 	template<class T>
-	inline T min(T a, T b)
+	inline constexpr T min(T a, T b)   // Smaller of a and b; usable in constant expressions
 	{
 		return a < b ? a : b;
 	}
 
 	template<class T>
-	inline T max(T a, T b, T c)
+	inline constexpr T max(T a, T b, T c)   // Largest of a, b and c, built from the two-argument max
 	{
 		return max(max(a, b), c);
 	}
 
 	template<class T>
-	inline T min(T a, T b, T c)
+	inline constexpr T min(T a, T b, T c)   // Smallest of a, b and c, built from the two-argument min
 	{
 		return min(min(a, b), c);
 	}
 
 	template<class T>
-	inline T max(T a, T b, T c, T d)
+	inline constexpr T max(T a, T b, T c, T d)   // Largest of four values: max of the two pairwise maxima
 	{
 		return max(max(a, b), max(c, d));
 	}
 
 	template<class T>
-	inline T min(T a, T b, T c, T d)
+	inline constexpr T min(T a, T b, T c, T d)   // Smallest of four values: min of the two pairwise minima
 	{
 		return min(min(a, b), min(c, d));
 	}
@@ -372,6 +372,11 @@
 	{
 		return static_cast<int>(min(x, 0x7FFFFFFFu));
 	}
+
+	// Convert floating-point value v to fixed point with p fractional bits (scales v by 2^p; the int cast truncates toward zero)
+	constexpr int toFixedPoint(float v, int p) {
+		return static_cast<int>(v * (1 << p));   // NOTE(review): 1 << p is only well-defined for 0 <= p < 31 with 32-bit int
+	}
 }
 
 #endif   // sw_Math_hpp
diff --git a/src/Vulkan/VkConfig.h b/src/Vulkan/VkConfig.h
index 905fcf2..3bc9819 100644
--- a/src/Vulkan/VkConfig.h
+++ b/src/Vulkan/VkConfig.h
@@ -78,6 +78,10 @@
 	MAX_POINT_SIZE = 1,		// Large points are not supported. If/when we turn this on, must be >= 64.
 };
 
+constexpr int SUBPIXEL_PRECISION_BITS = 4;   // Fractional bits of fixed-point screen coordinates; also reported as subPixelPrecisionBits
+constexpr float SUBPIXEL_PRECISION_FACTOR = static_cast<float>(1 << SUBPIXEL_PRECISION_BITS);   // 2^BITS: subpixel units per pixel
+constexpr int SUBPIXEL_PRECISION_MASK = (1 << SUBPIXEL_PRECISION_BITS) - 1;   // Low BITS bits set; isolates the subpixel fraction of a coordinate
+
 }
 
 #endif // VK_CONFIG_HPP_
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 2ad8cf8..44a2ba4 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -200,7 +200,7 @@
 		{ 65535, 65535, 65535 }, // maxComputeWorkGroupCount[3]
 		128, // maxComputeWorkGroupInvocations
 		{ 128, 128, 64, }, // maxComputeWorkGroupSize[3]
-		4, // subPixelPrecisionBits
+		vk::SUBPIXEL_PRECISION_BITS, // subPixelPrecisionBits
 		4, // subTexelPrecisionBits
 		4, // mipmapPrecisionBits
 		UINT32_MAX, // maxDrawIndexedIndexValue