Compute fragment coordinates for wider SIMD groups This change removes precomputing a 'quad' of the primitive's origin coordinates, instead just storing the coordinates of the primary vertex. For point primitives this corresponds to the PointCoord builtin. Fragments are laid out within a SIMD group as a Z-order curve, also known as Morton order. The new compactEvenBits() function assists with computing the relative x and y coordinates of each fragment and is based on code by Fabian Giesen: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/ Note this change removes work from the setup routine, but adds the minor cost of two SIMD additions per subgroup in the pixel routine. If the JIT compiler doesn't already optimize this through loop-invariant code motion, we could precompute it ourselves once per rasterized primitive. Bug: b/237494823 Change-Id: I9f0956e5937a4c8687ba7461291ab8a7a9d67307 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/67448 Kokoro-Result: kokoro <noreply+kokoro@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Sean Risser <srisser@google.com>

commit: db6433790af50c1392147d41d91db094beb2e0c2 [log] [tgz]
author: Nicolas Capens <capn@google.com> Wed Aug 10 22:28:12 2022 -0400
committer: Nicolas Capens <nicolascapens@google.com> Fri Aug 26 16:57:52 2022 +0000
tree: f566eec58f6d8a688a406645ce4fa7710338aa82
parent: 740c330b7300df15fc61765fb166ebaac0862f5a [diff]
diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
index dc063d7..7cd64b3 100644
--- a/src/Device/Primitive.hpp
+++ b/src/Device/Primitive.hpp

@@ -39,11 +39,9 @@
 	int yMin;
 	int yMax;
 
-	float4 xQuad;
-	float4 yQuad;
+	float x0;
+	float y0;
 
-	float pointCoordX;
-	float pointCoordY;
 	float pointSizeInv;
 
 	PlaneEquation z;

diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index 9fccec8..3201dde 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp

@@ -122,7 +122,9 @@
 			x1 = Max(x1, Max(x1a, x1b));
 		}
 
-		yFragment = SIMD::Float(Float(y)) + SIMD::Float(*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16));
+		// Compute the y coordinate of each fragment in the SIMD group.
+		const auto yMorton = SIMD::Float([](int i) { return float(compactEvenBits(i >> 1)); });  // 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, ...
+		yFragment = SIMD::Float(Float(y)) + yMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, y0)));
 
 		if(interpolateZ())
 		{

diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 8307544..6919cb3 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp

@@ -101,8 +101,8 @@
 	// PointCoord formula reference: https://www.khronos.org/registry/vulkan/specs/1.2/html/vkspec.html#primsrast-points-basic
 	// Note we don't add a 0.5 offset to x and y here (like for fragCoord) because pointCoordX/Y have 0.5 subtracted as part of the viewport transform.
 	SIMD::Float pointSizeInv = SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, pointSizeInv)));
-	routine.pointCoord[0] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(x)) + SIMD::Float(0.0f, 1.0f, 0.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)))));
-	routine.pointCoord[1] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(y)) + SIMD::Float(0.0f, 0.0f, 1.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)))));
+	routine.pointCoord[0] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(x)) + SIMD::Float(0.0f, 1.0f, 0.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)))));
+	routine.pointCoord[1] = SIMD::Float(0.5f) + pointSizeInv * (((SIMD::Float(Float(y)) + SIMD::Float(0.0f, 0.0f, 1.0f, 1.0f)) - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, y0)))));
 
 	routine.setInputBuiltin(spirvShader, spv::BuiltInViewIndex, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
 		assert(builtin.SizeInComponents == 1);

diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 687e69d..eda9151 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp

@@ -20,6 +20,7 @@
 #include "Device/QuadRasterizer.hpp"
 #include "Device/Renderer.hpp"
 #include "System/Debug.hpp"
+#include "System/Math.hpp"
 #include "Vulkan/VkPipelineLayout.hpp"
 #include "Vulkan/VkStringify.hpp"
 
@@ -94,7 +95,9 @@
 
 		SIMD::Float rhwCentroid;
 
-		xFragment = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
+		// Compute the x coordinate of each fragment in the SIMD group.
+		const auto xMorton = SIMD::Float([](int i) { return float(compactEvenBits(i)); });  // 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, ...
+		xFragment = SIMD::Float(Float(x)) + xMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)));
 
 		if(interpolateZ())
 		{

diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index b04ba31..825ab17 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp

@@ -314,21 +314,17 @@
 		Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex, projected.y));
 		Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex, projected.y));
 
-		Float x0 = Float(X0) * (1.0f / subPixF);
-		Float y0 = Float(Y0) * (1.0f / subPixF);
-
-		if(point)
-		{
-			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)) = x0;
-			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)) = y0;
-		}
-
 		if(line)
 		{
 			X2 = X1 + Y1 - Y0;
 			Y2 = Y1 + X0 - X1;
 		}
 
+		Float x0 = Float(X0) * (1.0f / subPixF);
+		Float y0 = Float(Y0) * (1.0f / subPixF);
+		*Pointer<Float>(primitive + OFFSET(Primitive, x0)) = x0;
+		*Pointer<Float>(primitive + OFFSET(Primitive, y0)) = y0;
+
 		X1 -= X0;
 		Y1 -= Y0;
 
@@ -343,12 +339,6 @@
 
 		Float a = x1 * y2 - x2 * y1;
 
-		Float4 xQuad = Float4(0, 1, 0, 1) - Float4(x0);
-		Float4 yQuad = Float4(0, 0, 1, 1) - Float4(y0);
-
-		*Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16) = xQuad;
-		*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16) = yQuad;
-
 		Float4 M[3];
 
 		M[0] = Float4(0, 0, 0, 0);

diff --git a/src/System/Math.hpp b/src/System/Math.hpp
index 19f45b7..224d740 100644
--- a/src/System/Math.hpp
+++ b/src/System/Math.hpp

@@ -404,6 +404,17 @@
 	return (y == 0.0f) ? +0.0f : y;
 }
 
+inline uint16_t compactEvenBits(uint32_t x)
+{
+	x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+	x = (x ^ (x >> 1)) & 0x33333333;  // x = --fe --dc --ba --98 --76 --54 --32 --10
+	x = (x ^ (x >> 2)) & 0x0F0F0F0F;  // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+	x = (x ^ (x >> 4)) & 0x00FF00FF;  // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+	x = (x ^ (x >> 8)) & 0x0000FFFF;  // x = ---- ---- ---- ---- fedc ba98 7654 3210
+
+	return static_cast<uint16_t>(x);
+}
+
 }  // namespace sw
 
 #endif  // sw_Math_hpp
commit	db6433790af50c1392147d41d91db094beb2e0c2	[log] [tgz]
author	Nicolas Capens <capn@google.com>	Wed Aug 10 22:28:12 2022 -0400
committer	Nicolas Capens <nicolascapens@google.com>	Fri Aug 26 16:57:52 2022 +0000
tree	f566eec58f6d8a688a406645ce4fa7710338aa82
parent	740c330b7300df15fc61765fb166ebaac0862f5a [diff]