Compute fragment coordinates for wider SIMD groups

This change removes precomputing a 'quad' of the primitive's
origin coordinates, instead just storing the coordinates of the primary
vertex. For point primitives this corresponds to the PointCoord builtin.

Fragments are laid out within a SIMD group as a Z-order curve, also
known as Morton order. The new compactEvenBits() function assists with
computing the relative x and y coordinates of each fragment and is
based on code by Fabian Giesen: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/

Note this change removes work from the setup routine, but adds the minor
cost of two SIMD additions per subgroup in the pixel routine. If the JIT
compiler doesn't already optimize this through loop-invariant code
motion, we could precompute it ourselves once per rasterized primitive.

Bug: b/237494823
Change-Id: I9f0956e5937a4c8687ba7461291ab8a7a9d67307
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/67448
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Sean Risser <srisser@google.com>
diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
index dc063d7..7cd64b3 100644
--- a/src/Device/Primitive.hpp
+++ b/src/Device/Primitive.hpp
@@ -39,11 +39,9 @@
 	int yMin;
 	int yMax;
 
-	float4 xQuad;
-	float4 yQuad;
+	float x0;
+	float y0;
 
-	float pointCoordX;
-	float pointCoordY;
 	float pointSizeInv;
 
 	PlaneEquation z;
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index 9fccec8..3201dde 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp
@@ -122,7 +122,9 @@
 			x1 = Max(x1, Max(x1a, x1b));
 		}
 
-		yFragment = SIMD::Float(Float(y)) + SIMD::Float(*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16));
+		// Compute the y coordinate of each fragment in the SIMD group.
+		const auto yMorton = SIMD::Float([](int i) { return float(compactEvenBits(i >> 1)); });  // 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, ...
+		yFragment = SIMD::Float(Float(y)) + yMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, y0)));
 
 		if(interpolateZ())
 		{
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 8307544..6919cb3 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -101,8 +101,8 @@
 	// PointCoord formula reference: https://www.khronos.org/registry/vulkan/specs/1.2/html/vkspec.html#primsrast-points-basic
 	// Note we don't add a 0.5 offset to x and y here (like for fragCoord) because pointCoordX/Y have 0.5 subtracted as part of the viewport transform.
 	SIMD::Float pointSizeInv = SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, pointSizeInv)));
-	routine.pointCoord[0] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(x)) + SIMD::Float(0.0f, 1.0f, 0.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)))));
-	routine.pointCoord[1] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(y)) + SIMD::Float(0.0f, 0.0f, 1.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)))));
+	routine.pointCoord[0] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(x)) + SIMD::Float(0.0f, 1.0f, 0.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)))));
+	routine.pointCoord[1] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(y)) + SIMD::Float(0.0f, 0.0f, 1.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, y0)))));
 
 	routine.setInputBuiltin(spirvShader, spv::BuiltInViewIndex, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
 		assert(builtin.SizeInComponents == 1);
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 687e69d..eda9151 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -20,6 +20,7 @@
 #include "Device/QuadRasterizer.hpp"
 #include "Device/Renderer.hpp"
 #include "System/Debug.hpp"
+#include "System/Math.hpp"
 #include "Vulkan/VkPipelineLayout.hpp"
 #include "Vulkan/VkStringify.hpp"
 
@@ -94,7 +95,9 @@
 
 		SIMD::Float rhwCentroid;
 
-		xFragment = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
+		// Compute the x coordinate of each fragment in the SIMD group.
+		const auto xMorton = SIMD::Float([](int i) { return float(compactEvenBits(i)); });  // 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, ...
+		xFragment = SIMD::Float(Float(x)) + xMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)));
 
 		if(interpolateZ())
 		{
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index b04ba31..825ab17 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -314,21 +314,17 @@
 		Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex, projected.y));
 		Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex, projected.y));
 
-		Float x0 = Float(X0) * (1.0f / subPixF);
-		Float y0 = Float(Y0) * (1.0f / subPixF);
-
-		if(point)
-		{
-			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)) = x0;
-			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)) = y0;
-		}
-
 		if(line)
 		{
 			X2 = X1 + Y1 - Y0;
 			Y2 = Y1 + X0 - X1;
 		}
 
+		Float x0 = Float(X0) * (1.0f / subPixF);
+		Float y0 = Float(Y0) * (1.0f / subPixF);
+		*Pointer<Float>(primitive + OFFSET(Primitive, x0)) = x0;
+		*Pointer<Float>(primitive + OFFSET(Primitive, y0)) = y0;
+
 		X1 -= X0;
 		Y1 -= Y0;
 
@@ -343,12 +339,6 @@
 
 		Float a = x1 * y2 - x2 * y1;
 
-		Float4 xQuad = Float4(0, 1, 0, 1) - Float4(x0);
-		Float4 yQuad = Float4(0, 0, 1, 1) - Float4(y0);
-
-		*Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16) = xQuad;
-		*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16) = yQuad;
-
 		Float4 M[3];
 
 		M[0] = Float4(0, 0, 0, 0);
diff --git a/src/System/Math.hpp b/src/System/Math.hpp
index 19f45b7..224d740 100644
--- a/src/System/Math.hpp
+++ b/src/System/Math.hpp
@@ -404,6 +404,17 @@
 	return (y == 0.0f) ? +0.0f : y;
 }
 
+inline uint16_t compactEvenBits(uint32_t x)
+{
+	x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+	x = (x ^ (x >> 1)) & 0x33333333;  // x = --fe --dc --ba --98 --76 --54 --32 --10
+	x = (x ^ (x >> 2)) & 0x0F0F0F0F;  // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+	x = (x ^ (x >> 4)) & 0x00FF00FF;  // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+	x = (x ^ (x >> 8)) & 0x0000FFFF;  // x = ---- ---- ---- ---- fedc ba98 7654 3210
+
+	return static_cast<uint16_t>(x);
+}
+
 }  // namespace sw
 
 #endif  // sw_Math_hpp