Avoid broadcasting data fields to 4-wide vectors To avoid repeated broadcasting of scalars to vectors, we stored some data in pre-broadcasted 4-wide vector fields. On modern CPUs such broadcasts no longer consume valuable cycles, and by storing this data in scalar fields instead we save on memory consumption and bandwidth. This also enables scaling to arbitrary SIMD width. Bug: b/237494823 Change-Id: I5d1db08318452f2a90050ba2ba37751eed8f3121 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/66770 Kokoro-Result: kokoro <noreply+kokoro@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Alexis Hétu <sugoi@google.com>

commit: 1c8155a85edbc9c708043475b1833ddd899b1d17 [log] [tgz]
author: Nicolas Capens <nicolas.capens@gmail.com> Wed Jun 29 03:13:02 2022 -0400
committer: Nicolas Capens <nicolascapens@google.com> Thu Jun 30 13:34:25 2022 +0000
tree: 1f7bbe3d05242a978bffbdc7b4c904d1bebf21ef
parent: 25dda394503be16ef03c42fdf9a2987d3aba84cc [diff]
diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
index 8a6672d..dc063d7 100644
--- a/src/Device/Primitive.hpp
+++ b/src/Device/Primitive.hpp

@@ -29,9 +29,9 @@
 
 struct PlaneEquation  // z = A * x + B * y + C
 {
-	float4 A;
-	float4 B;
-	float4 C;
+	float A;
+	float B;
+	float C;
 };
 
 struct Primitive
@@ -47,7 +47,7 @@
 	float pointSizeInv;
 
 	PlaneEquation z;
-	float4 zBias;
+	float zBias;
 	PlaneEquation w;
 	PlaneEquation V[MAX_INTERFACE_COMPONENTS];
 

diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index cebab37..e2848c3 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp

@@ -132,10 +132,10 @@
 
 				if(state.enableMultiSampling)
 				{
-					y += *Pointer<Float4>(constants + OFFSET(Constants, Y) + q * sizeof(float4));
+					y += Float4(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsY) + q * sizeof(float)));
 				}
 
-				Dz[q] = *Pointer<Float4>(primitive + OFFSET(Primitive, z.C), 16) + y * *Pointer<Float4>(primitive + OFFSET(Primitive, z.B), 16);
+				Dz[q] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, z.C))) + y * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, z.B)));
 			}
 		}
 
@@ -143,7 +143,7 @@
 		{
 			if(interpolateW())
 			{
-				Dw = *Pointer<Float4>(primitive + OFFSET(Primitive, w.C), 16) + yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, w.B), 16);
+				Dw = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, w.C))) + yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, w.B)));
 			}
 
 			if(spirvShader)
@@ -151,28 +151,28 @@
 				int packedInterpolant = 0;
 				for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
 				{
-					if(spirvShader->inputs[interfaceInterpolant].Type == SpirvShader::ATTRIBTYPE_UNUSED)
-						continue;
-
-					Dv[interfaceInterpolant] = *Pointer<Float4>(primitive + OFFSET(Primitive, V[packedInterpolant].C), 16);
-					if(!spirvShader->inputs[interfaceInterpolant].Flat)
+					if(spirvShader->inputs[interfaceInterpolant].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 					{
-						Dv[interfaceInterpolant] +=
-						    yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, V[packedInterpolant].B), 16);
+						Dv[interfaceInterpolant] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, V[packedInterpolant].C)));
+						if(!spirvShader->inputs[interfaceInterpolant].Flat)
+						{
+							Dv[interfaceInterpolant] +=
+							    yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, V[packedInterpolant].B)));
+						}
+						packedInterpolant++;
 					}
-					packedInterpolant++;
 				}
 
 				for(unsigned int i = 0; i < state.numClipDistances; i++)
 				{
-					DclipDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].C), 16) +
-					                   yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].B), 16);
+					DclipDistance[i] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, clipDistance[i].C))) +
+					                   yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, clipDistance[i].B)));
 				}
 
 				for(unsigned int i = 0; i < state.numCullDistances; i++)
 				{
-					DcullDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].C), 16) +
-					                   yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].B), 16);
+					DcullDistance[i] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, cullDistance[i].C))) +
+					                   yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, cullDistance[i].B)));
 				}
 			}
 
@@ -237,7 +237,7 @@
 		return D;
 	}
 
-	Float4 interpolant = mulAdd(x, *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16), D);
+	Float4 interpolant = mulAdd(x, Float4(*Pointer<Float>(planeEquation + OFFSET(PlaneEquation, A))), D);
 
 	if(perspective)
 	{

diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 4a48f9c..1435809 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp

@@ -304,19 +304,19 @@
 	{
 		if(ms == 4)
 		{
-			data->a2c0 = float4(0.2f);
-			data->a2c1 = float4(0.4f);
-			data->a2c2 = float4(0.6f);
-			data->a2c3 = float4(0.8f);
+			data->a2c0 = 0.2f;
+			data->a2c1 = 0.4f;
+			data->a2c2 = 0.6f;
+			data->a2c3 = 0.8f;
 		}
 		else if(ms == 2)
 		{
-			data->a2c0 = float4(0.25f);
-			data->a2c1 = float4(0.75f);
+			data->a2c0 = 0.25f;
+			data->a2c1 = 0.75f;
 		}
 		else if(ms == 1)
 		{
-			data->a2c0 = float4(0.5f);
+			data->a2c0 = 0.5f;
 		}
 		else
 			ASSERT(false);
@@ -343,12 +343,12 @@
 		float Z = F - N;
 		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
 
-		data->WxF = float4(W * subPixF);
-		data->HxF = float4(H * subPixF);
-		data->X0xF = float4(X0 * subPixF - subPixF / 2);
-		data->Y0xF = float4(Y0 * subPixF - subPixF / 2);
-		data->halfPixelX = float4(0.5f / W);
-		data->halfPixelY = float4(0.5f / H);
+		data->WxF = W * subPixF;
+		data->HxF = H * subPixF;
+		data->X0xF = X0 * subPixF - subPixF / 2;
+		data->Y0xF = Y0 * subPixF - subPixF / 2;
+		data->halfPixelX = 0.5f / W;
+		data->halfPixelY = 0.5f / H;
 		data->viewportHeight = abs(viewport.height);
 		data->depthRange = Z;
 		data->depthNear = N;
@@ -870,8 +870,8 @@
 
 	constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
 
-	const float W = data.WxF[0] * (1.0f / subPixF);
-	const float H = data.HxF[0] * (1.0f / subPixF);
+	const float W = data.WxF * (1.0f / subPixF);
+	const float H = data.HxF * (1.0f / subPixF);
 
 	float dx = W * (P1.x / P1.w - P0.x / P0.w);
 	float dy = H * (P1.y / P1.w - P0.y / P0.w);
@@ -1166,8 +1166,8 @@
 	P[2] = v.position;
 	P[3] = v.position;
 
-	const float X = pSize * P[0].w * data.halfPixelX[0];
-	const float Y = pSize * P[0].w * data.halfPixelY[0];
+	const float X = pSize * P[0].w * data.halfPixelX;
+	const float Y = pSize * P[0].w * data.halfPixelY;
 
 	P[0].x -= X;
 	P[0].y += Y;

diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index 5923dec..1c155e5 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp

@@ -74,12 +74,12 @@
 	PixelProcessor::Factor factor;
 	unsigned int occlusion[MaxClusterCount];  // Number of pixels passing depth test
 
-	float4 WxF;
-	float4 HxF;
-	float4 X0xF;
-	float4 Y0xF;
-	float4 halfPixelX;
-	float4 halfPixelY;
+	float WxF;
+	float HxF;
+	float X0xF;
+	float Y0xF;
+	float halfPixelX;
+	float halfPixelY;
 	float viewportHeight;
 	float depthRange;
 	float depthNear;
@@ -104,10 +104,10 @@
 	int scissorY0;
 	int scissorY1;
 
-	float4 a2c0;
-	float4 a2c1;
-	float4 a2c2;
-	float4 a2c3;
+	float a2c0;
+	float a2c1;
+	float a2c2;
+	float a2c3;
 
 	vk::Pipeline::PushConstantStorage pushConstants;
 };

diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index 6c870e5..80cded2 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp

@@ -21,10 +21,6 @@
 
 namespace sw {
 
-constexpr float Constants::VkSampleLocations4[][2];
-constexpr float Constants::SampleLocationsX[4];
-constexpr float Constants::SampleLocationsY[4];
-
 Constants::Constants()
 {
 	static const unsigned int transposeBit0[16] = {
@@ -287,28 +283,14 @@
 		sRGBtoLinear12_16[i] = (unsigned short)(clamp(sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
 	}
 
-	const float4 X[4] = {
-		float4(SampleLocationsX[0]),
-		float4(SampleLocationsX[1]),
-		float4(SampleLocationsX[2]),
-		float4(SampleLocationsX[3]),
-	};
-
-	const float4 Y[4] = {
-		float4(SampleLocationsY[0]),
-		float4(SampleLocationsY[1]),
-		float4(SampleLocationsY[2]),
-		float4(SampleLocationsY[3]),
-	};
-
 	for(int q = 0; q < 4; q++)
 	{
 		for(int c = 0; c < 16; c++)
 		{
 			for(int i = 0; i < 4; i++)
 			{
-				sampleX[q][c][i] = c & (1 << i) ? X[q][0] : 0.0f;
-				sampleY[q][c][i] = c & (1 << i) ? Y[q][0] : 0.0f;
+				sampleX[q][c][i] = c & (1 << i) ? SampleLocationsX[q] : 0.0f;
+				sampleY[q][c][i] = c & (1 << i) ? SampleLocationsY[q] : 0.0f;
 				weight[c][i] = c & (1 << i) ? 1.0f : 0.0f;
 			}
 		}
@@ -316,15 +298,12 @@
 
 	constexpr auto subPixB = vk::SUBPIXEL_PRECISION_BITS;
 
-	const int Xf[4] = { toFixedPoint(X[0][0], subPixB), toFixedPoint(X[1][0], subPixB), toFixedPoint(X[2][0], subPixB), toFixedPoint(X[3][0], subPixB) };
-	const int Yf[4] = { toFixedPoint(Y[0][0], subPixB), toFixedPoint(Y[1][0], subPixB), toFixedPoint(Y[2][0], subPixB), toFixedPoint(Y[3][0], subPixB) };
+	const int Xf[4] = { toFixedPoint(SampleLocationsX[0], subPixB), toFixedPoint(SampleLocationsX[1], subPixB), toFixedPoint(SampleLocationsX[2], subPixB), toFixedPoint(SampleLocationsX[3], subPixB) };
+	const int Yf[4] = { toFixedPoint(SampleLocationsY[0], subPixB), toFixedPoint(SampleLocationsY[1], subPixB), toFixedPoint(SampleLocationsY[2], subPixB), toFixedPoint(SampleLocationsY[3], subPixB) };
 
 	memcpy(&this->Xf, &Xf, sizeof(Xf));
 	memcpy(&this->Yf, &Yf, sizeof(Yf));
 
-	memcpy(&this->X, &X, sizeof(X));
-	memcpy(&this->Y, &Y, sizeof(Y));
-
 	const dword maxX[16] = { 0x00000000, 0x00000001, 0x00000100, 0x00000101, 0x00010000, 0x00010001, 0x00010100, 0x00010101, 0x01000000, 0x01000001, 0x01000100, 0x01000101, 0x01010000, 0x01010001, 0x01010100, 0x01010101 };
 	const dword maxY[16] = { 0x00000000, 0x00000002, 0x00000200, 0x00000202, 0x00020000, 0x00020002, 0x00020200, 0x00020202, 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02020000, 0x02020002, 0x02020200, 0x02020202 };
 	const dword maxZ[16] = { 0x00000000, 0x00000004, 0x00000400, 0x00000404, 0x00040000, 0x00040004, 0x00040400, 0x00040404, 0x04000000, 0x04000004, 0x04000400, 0x04000404, 0x04040000, 0x04040004, 0x04040400, 0x04040404 };

diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index 390ce72..6cbfd34 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp

@@ -21,6 +21,36 @@
 
 namespace sw {
 
+// VK_SAMPLE_COUNT_4_BIT
+// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#primsrast-multisampling
+static constexpr float VkSampleLocations4[][2] = {
+	{ 0.375, 0.125 },
+	{ 0.875, 0.375 },
+	{ 0.125, 0.625 },
+	{ 0.625, 0.875 },
+};
+
+// Vulkan spec sample positions are relative to 0,0 in top left corner, with Y+ going down.
+// Convert to our space, with 0,0 in center, and Y+ going up.
+static constexpr float SampleLocationsX[4] = {
+	VkSampleLocations4[0][0] - 0.5f,
+	VkSampleLocations4[1][0] - 0.5f,
+	VkSampleLocations4[2][0] - 0.5f,
+	VkSampleLocations4[3][0] - 0.5f,
+};
+
+static constexpr float SampleLocationsY[4] = {
+	VkSampleLocations4[0][1] - 0.5f,
+	VkSampleLocations4[1][1] - 0.5f,
+	VkSampleLocations4[2][1] - 0.5f,
+	VkSampleLocations4[3][1] - 0.5f,
+};
+
+// Compute the yMin and yMax multisample offsets so that they are just
+// large enough (+/- max range - epsilon) to include sample points
+static constexpr int yMinMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) - sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
+static constexpr int yMaxMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) + sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
+
 struct Constants
 {
 	Constants();
@@ -92,39 +122,20 @@
 	int Xf[4];
 	int Yf[4];
 
-	float4 X[4];
-	float4 Y[4];
-
-	// VK_SAMPLE_COUNT_4_BIT
-	// https://www.khronos.org/registry/vulkan/specs/1.1/html/vkspec.html#primsrast-multisampling
-	static constexpr float VkSampleLocations4[][2] = {
-		{ 0.375, 0.125 },
-		{ 0.875, 0.375 },
-		{ 0.125, 0.625 },
-		{ 0.625, 0.875 },
+	const float SampleLocationsX[4] = {
+		sw::SampleLocationsX[0],
+		sw::SampleLocationsX[1],
+		sw::SampleLocationsX[2],
+		sw::SampleLocationsX[3],
 	};
 
-	// Vulkan spec sample positions are relative to 0,0 in top left corner, with Y+ going down.
-	// Convert to our space, with 0,0 in center, and Y+ going up.
-	static constexpr float SampleLocationsX[4] = {
-		VkSampleLocations4[0][0] - 0.5f,
-		VkSampleLocations4[1][0] - 0.5f,
-		VkSampleLocations4[2][0] - 0.5f,
-		VkSampleLocations4[3][0] - 0.5f,
+	const float SampleLocationsY[4] = {
+		sw::SampleLocationsY[0],
+		sw::SampleLocationsY[1],
+		sw::SampleLocationsY[2],
+		sw::SampleLocationsY[3],
 	};
 
-	static constexpr float SampleLocationsY[4] = {
-		VkSampleLocations4[0][1] - 0.5f,
-		VkSampleLocations4[1][1] - 0.5f,
-		VkSampleLocations4[2][1] - 0.5f,
-		VkSampleLocations4[3][1] - 0.5f,
-	};
-
-	// Compute the yMin and yMax multisample offsets so that they are just
-	// large enough (+/- max range - epsilon) to include sample points
-	static constexpr int yMinMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) - sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
-	static constexpr int yMaxMultiSampleOffset = sw::toFixedPoint(1, vk::SUBPIXEL_PRECISION_BITS) + sw::toFixedPoint(sw::max(SampleLocationsY[0], SampleLocationsY[1], SampleLocationsY[2], SampleLocationsY[3]), vk::SUBPIXEL_PRECISION_BITS) - 1;
-
 	dword maxX[16];
 	dword maxY[16];
 	dword maxZ[16];

diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 44e478e..cfdf2d2 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp

@@ -83,8 +83,8 @@
 	//  the x and y components of FragCoord reflect the location of the center of the fragment."
 	if(state.sampleShadingEnabled && state.multiSampleCount > 1)
 	{
-		x0 = Constants::VkSampleLocations4[samples[0]][0];
-		y0 = Constants::VkSampleLocations4[samples[0]][1];
+		x0 = VkSampleLocations4[samples[0]][0];
+		y0 = VkSampleLocations4[samples[0]][1];
 		x1 = 1.0f + x0;
 		y1 = 1.0f + y0;
 	}
@@ -188,9 +188,9 @@
 		ASSERT(samples.size() == 1);
 		int sampleId = samples[0];
 		routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] =
-		    SIMD::Float((state.multiSampleCount > 1) ? Constants::VkSampleLocations4[sampleId][0] : 0.5f);
+		    SIMD::Float((state.multiSampleCount > 1) ? VkSampleLocations4[sampleId][0] : 0.5f);
 		routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] =
-		    SIMD::Float((state.multiSampleCount > 1) ? Constants::VkSampleLocations4[sampleId][1] : 0.5f);
+		    SIMD::Float((state.multiSampleCount > 1) ? VkSampleLocations4[sampleId][1] : 0.5f);
 	}
 
 	// Note: all lanes initially active to facilitate derivatives etc. Actual coverage is

diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index f2c24f9..74fac88 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp

@@ -92,7 +92,6 @@
 
 		stencilTest(sBuffer, x, sMask, samples);
 
-		Float4 f;
 		Float4 rhwCentroid;
 
 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
@@ -105,14 +104,14 @@
 
 				if(state.enableMultiSampling)
 				{
-					x -= *Pointer<Float4>(constants + OFFSET(Constants, X) + q * sizeof(float4));
+					x -= Float4(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsX) + q * sizeof(float)));
 				}
 
 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
 
 				if(state.depthBias)
 				{
-					z[q] += *Pointer<Float4>(primitive + OFFSET(Primitive, zBias), 16);
+					z[q] += Float4(*Pointer<Float>(primitive + OFFSET(Primitive, zBias)));
 				}
 
 				unclampedZ[q] = z[q];
@@ -194,8 +193,8 @@
 
 				if(perSampleShading && (state.multiSampleCount > 1))
 				{
-					xxxx += Constants::SampleLocationsX[samples[0]];
-					yyyy += Constants::SampleLocationsY[samples[0]];
+					xxxx += SampleLocationsX[samples[0]];
+					yyyy += SampleLocationsY[samples[0]];
 				}
 
 				int packedInterpolant = 0;
@@ -675,7 +674,7 @@
 
 	for(unsigned int q : samples)
 	{
-		Int4 coverage = CmpNLT(alpha, *Pointer<Float4>(data + a2c[q]));
+		Int4 coverage = CmpNLT(alpha, Float4(*Pointer<Float>(data + a2c[q])));
 		Int aMask = SignMask(coverage);
 		cMask[q] &= aMask;
 	}

diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index 9309d88..ceb6e9e 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp

@@ -165,8 +165,8 @@
 
 		if(state.enableMultiSampling)
 		{
-			yMin = (yMin + Constants::yMinMultiSampleOffset) >> subPixB;
-			yMax = (yMax + Constants::yMaxMultiSampleOffset) >> subPixB;
+			yMin = (yMin + yMinMultiSampleOffset) >> subPixB;
+			yMax = (yMax + yMaxMultiSampleOffset) >> subPixB;
 		}
 		else
 		{
@@ -382,13 +382,9 @@
 		{
 			Float4 ABC = M[0] + M[1] + M[2];
 
-			Float4 A = ABC.x;
-			Float4 B = ABC.y;
-			Float4 C = ABC.z;
-
-			*Pointer<Float4>(primitive + OFFSET(Primitive, w.A), 16) = A;
-			*Pointer<Float4>(primitive + OFFSET(Primitive, w.B), 16) = B;
-			*Pointer<Float4>(primitive + OFFSET(Primitive, w.C), 16) = C;
+			*Pointer<Float>(primitive + OFFSET(Primitive, w.A)) = ABC.x;
+			*Pointer<Float>(primitive + OFFSET(Primitive, w.B)) = ABC.y;
+			*Pointer<Float>(primitive + OFFSET(Primitive, w.C)) = ABC.z;
 		}
 
 		if(state.interpolateZ)
@@ -424,9 +420,9 @@
 
 			C = z0 * *Pointer<Float>(data + OFFSET(DrawData, depthRange)) + *Pointer<Float>(data + OFFSET(DrawData, depthNear));
 
-			*Pointer<Float4>(primitive + OFFSET(Primitive, z.A), 16) = Float4(A);
-			*Pointer<Float4>(primitive + OFFSET(Primitive, z.B), 16) = Float4(B);
-			*Pointer<Float4>(primitive + OFFSET(Primitive, z.C), 16) = Float4(C);
+			*Pointer<Float>(primitive + OFFSET(Primitive, z.A)) = A;
+			*Pointer<Float>(primitive + OFFSET(Primitive, z.B)) = B;
+			*Pointer<Float>(primitive + OFFSET(Primitive, z.C)) = C;
 
 			Float bias = 0.0f;
 
@@ -485,7 +481,7 @@
 					bias = IfThenElse(clamp > 0.0f, Min(bias, clamp), Max(bias, clamp));
 				}
 
-				*Pointer<Float4>(primitive + OFFSET(Primitive, zBias), 16) = Float4(bias);
+				*Pointer<Float>(primitive + OFFSET(Primitive, zBias)) = bias;
 			}
 		}
 
@@ -544,24 +540,20 @@
 		Float4 B = i.yyyy * m[1];
 		Float4 C = i.zzzz * m[2];
 
-		C = A + B + C;
+		Float4 P = A + B + C;
 
-		A = C.xxxx;
-		B = C.yyyy;
-		C = C.zzzz;
-
-		*Pointer<Float4>(primitive + planeEquation + 0, 16) = A;
-		*Pointer<Float4>(primitive + planeEquation + 16, 16) = B;
-		*Pointer<Float4>(primitive + planeEquation + 32, 16) = C;
+		*Pointer<Float>(primitive + planeEquation + 0) = P.x;
+		*Pointer<Float>(primitive + planeEquation + 4) = P.y;
+		*Pointer<Float>(primitive + planeEquation + 8) = P.z;
 	}
 	else
 	{
 		int leadingVertex = OFFSET(Triangle, v0);
 		Float C = *Pointer<Float>(triangle + leadingVertex + attribute);
 
-		*Pointer<Float4>(primitive + planeEquation + 0, 16) = Float4(0, 0, 0, 0);
-		*Pointer<Float4>(primitive + planeEquation + 16, 16) = Float4(0, 0, 0, 0);
-		*Pointer<Float4>(primitive + planeEquation + 32, 16) = Float4(C);
+		*Pointer<Float>(primitive + planeEquation + 0) = 0;
+		*Pointer<Float>(primitive + planeEquation + 4) = 0;
+		*Pointer<Float>(primitive + planeEquation + 8) = C;
 	}
 }
 

diff --git a/src/Pipeline/SpirvShaderGLSLstd450.cpp b/src/Pipeline/SpirvShaderGLSLstd450.cpp
index f2c43d3..e4e341c 100644
--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp

@@ -1049,14 +1049,6 @@
 			static constexpr int NUM_SAMPLES = 4;
 			ASSERT(state->getMultiSampleCount() == NUM_SAMPLES);
 
-			Array<Float> sampleX(NUM_SAMPLES);
-			Array<Float> sampleY(NUM_SAMPLES);
-			for(int i = 0; i < NUM_SAMPLES; ++i)
-			{
-				sampleX[i] = Constants::SampleLocationsX[i];
-				sampleY[i] = Constants::SampleLocationsY[i];
-			}
-
 			auto sampleOperand = Operand(this, state, paramId);
 			ASSERT(sampleOperand.componentCount == 1);
 
@@ -1067,8 +1059,8 @@
 			for(int i = 0; i < SIMD::Width; ++i)
 			{
 				Int sample = Extract(samples, i);
-				x = Insert(x, sampleX[sample], i);
-				y = Insert(y, sampleY[sample], i);
+				x = Insert(x, *Pointer<Float>(state->routine->constants + OFFSET(Constants, SampleLocationsX) + sample * sizeof(float)), i);
+				y = Insert(y, *Pointer<Float>(state->routine->constants + OFFSET(Constants, SampleLocationsY) + sample * sizeof(float)), i);
 			}
 		}
 
@@ -1110,10 +1102,11 @@
 		{
 			Int offset = ((Extract(ptr.dynamicOffsets, i) + ptr.staticOffsets[i]) >> offsetShift) + component;
 			Pointer<Byte> planeEquationI = planeEquation + (offset * sizeof(PlaneEquation));
-			A = Insert(A, Extract(*Pointer<SIMD::Float>(planeEquationI + OFFSET(PlaneEquation, A), 16), i), i);
-			B = Insert(B, Extract(*Pointer<SIMD::Float>(planeEquationI + OFFSET(PlaneEquation, B), 16), i), i);
-			C = Insert(C, Extract(*Pointer<SIMD::Float>(planeEquationI + OFFSET(PlaneEquation, C), 16), i), i);
+			A = Insert(A, *Pointer<Float>(planeEquationI + OFFSET(PlaneEquation, A)), i);
+			B = Insert(B, *Pointer<Float>(planeEquationI + OFFSET(PlaneEquation, B)), i);
+			C = Insert(C, *Pointer<Float>(planeEquationI + OFFSET(PlaneEquation, C)), i);
 		}
+
 		return ::Interpolate(x, y, rhw, A, B, C, state->routine->inputsInterpolation[packedInterpolant]);
 	}
 	else
@@ -1135,12 +1128,12 @@
 {
 	SIMD::Float A;
 	SIMD::Float B;
-	SIMD::Float C = *Pointer<SIMD::Float>(planeEquation + OFFSET(PlaneEquation, C), 16);
+	SIMD::Float C = *Pointer<Float>(planeEquation + OFFSET(PlaneEquation, C));
 
 	if(interpolation != SpirvRoutine::Flat)
 	{
-		A = *Pointer<SIMD::Float>(planeEquation + OFFSET(PlaneEquation, A), 16);
-		B = *Pointer<SIMD::Float>(planeEquation + OFFSET(PlaneEquation, B), 16);
+		A = *Pointer<Float>(planeEquation + OFFSET(PlaneEquation, A));
+		B = *Pointer<Float>(planeEquation + OFFSET(PlaneEquation, B));
 	}
 
 	return ::Interpolate(x, y, rhw, A, B, C, interpolation);

diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 4e9f04c..6e8d3cf 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp

@@ -601,8 +601,8 @@
 		Float4 rhw = Float4(1.0f) / w;
 
 		Vector4f proj;
-		proj.x = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
-		proj.y = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
+		proj.x = As<Float4>(RoundIntClamped(Float4(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * Float4(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
+		proj.y = As<Float4>(RoundIntClamped(Float4(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * Float4(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
 		proj.z = pos.z * rhw;
 		proj.w = rhw;
commit	1c8155a85edbc9c708043475b1833ddd899b1d17	[log] [tgz]
author	Nicolas Capens <nicolas.capens@gmail.com>	Wed Jun 29 03:13:02 2022 -0400
committer	Nicolas Capens <nicolascapens@google.com>	Thu Jun 30 13:34:25 2022 +0000
tree	1f7bbe3d05242a978bffbdc7b4c904d1bebf21ef
parent	25dda394503be16ef03c42fdf9a2987d3aba84cc [diff]