Use SIMD types throughout graphics and compute pipelines

sw::SIMD::Float currently aliases rr::Float4, so we can replace uses of
the latter with the former (and likewise Int4 with SIMD::Int) wherever
we intend to support scaling to wider SIMD vectors.

Bug: b/237494823
Change-Id: I04593aee136456d509b41ec9a977ee19fea1c268
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/66808
Tested-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index e2848c3..b69945e 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp
@@ -122,20 +122,20 @@
 			x1 = Max(x1, Max(x1a, x1b));
 		}
 
-		Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
+		SIMD::Float yyyy = SIMD::Float(Float(y)) + SIMD::Float(*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16));
 
 		if(interpolateZ())
 		{
 			for(unsigned int q = 0; q < state.multiSampleCount; q++)
 			{
-				Float4 y = yyyy;
+				SIMD::Float y = yyyy;
 
 				if(state.enableMultiSampling)
 				{
-					y += Float4(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsY) + q * sizeof(float)));
+					y += SIMD::Float(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsY) + q * sizeof(float)));
 				}
 
-				Dz[q] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, z.C))) + y * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, z.B)));
+				Dz[q] = SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, z.C))) + y * SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, z.B)));
 			}
 		}
 
@@ -143,7 +143,7 @@
 		{
 			if(interpolateW())
 			{
-				Dw = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, w.C))) + yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, w.B)));
+				Dw = SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, w.C))) + yyyy * SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, w.B)));
 			}
 
 			if(spirvShader)
@@ -153,11 +153,11 @@
 				{
 					if(spirvShader->inputs[interfaceInterpolant].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 					{
-						Dv[interfaceInterpolant] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, V[packedInterpolant].C)));
+						Dv[interfaceInterpolant] = *Pointer<Float>(primitive + OFFSET(Primitive, V[packedInterpolant].C));
 						if(!spirvShader->inputs[interfaceInterpolant].Flat)
 						{
 							Dv[interfaceInterpolant] +=
-							    yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, V[packedInterpolant].B)));
+							    yyyy * SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, V[packedInterpolant].B)));
 						}
 						packedInterpolant++;
 					}
@@ -165,14 +165,14 @@
 
 				for(unsigned int i = 0; i < state.numClipDistances; i++)
 				{
-					DclipDistance[i] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, clipDistance[i].C))) +
-					                   yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, clipDistance[i].B)));
+					DclipDistance[i] = SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, clipDistance[i].C))) +
+					                   yyyy * SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, clipDistance[i].B)));
 				}
 
 				for(unsigned int i = 0; i < state.numCullDistances; i++)
 				{
-					DcullDistance[i] = Float4(*Pointer<Float>(primitive + OFFSET(Primitive, cullDistance[i].C))) +
-					                   yyyy * Float4(*Pointer<Float>(primitive + OFFSET(Primitive, cullDistance[i].B)));
+					DcullDistance[i] = SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, cullDistance[i].C))) +
+					                   yyyy * SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, cullDistance[i].B)));
 				}
 			}
 
@@ -230,14 +230,14 @@
 	Until(y >= yMax);
 }
 
-Float4 QuadRasterizer::interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
+SIMD::Float QuadRasterizer::interpolate(SIMD::Float &x, SIMD::Float &D, SIMD::Float &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
 {
 	if(flat)
 	{
 		return D;
 	}
 
-	Float4 interpolant = mulAdd(x, Float4(*Pointer<Float>(planeEquation + OFFSET(PlaneEquation, A))), D);
+	SIMD::Float interpolant = mulAdd(x, SIMD::Float(*Pointer<Float>(planeEquation + OFFSET(PlaneEquation, A))), D);
 
 	if(perspective)
 	{
diff --git a/src/Device/QuadRasterizer.hpp b/src/Device/QuadRasterizer.hpp
index 2f19d1d..9ef2ee1 100644
--- a/src/Device/QuadRasterizer.hpp
+++ b/src/Device/QuadRasterizer.hpp
@@ -33,12 +33,12 @@
 protected:
 	Pointer<Byte> constants;
 
-	Float4 Dz[4];
-	Float4 Dw;
-	Float4 Dv[MAX_INTERFACE_COMPONENTS];
-	Float4 Df;
-	Float4 DclipDistance[MAX_CLIP_DISTANCES];
-	Float4 DcullDistance[MAX_CULL_DISTANCES];
+	SIMD::Float Dz[4];
+	SIMD::Float Dw;
+	SIMD::Float Dv[MAX_INTERFACE_COMPONENTS];
+	SIMD::Float Df;
+	SIMD::Float DclipDistance[MAX_CLIP_DISTANCES];
+	SIMD::Float DcullDistance[MAX_CULL_DISTANCES];
 
 	UInt occlusion;
 
@@ -46,7 +46,7 @@
 
 	bool interpolateZ() const;
 	bool interpolateW() const;
-	Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
+	SIMD::Float interpolate(SIMD::Float &x, SIMD::Float &D, SIMD::Float &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
 
 	const PixelProcessor::State &state;
 	const SpirvShader *const spirvShader;
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index cbdf4ce..d142bb8 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -106,7 +106,7 @@
 		localInvocationID[0] = idx;
 	}
 
-	Int4 wgID = Insert(Insert(Insert(SIMD::Int(0), workgroupID[0], 0), workgroupID[1], 1), workgroupID[2], 2);
+	Int4 wgID = Insert(Insert(Insert(Int4(0), workgroupID[0], 0), workgroupID[1], 1), workgroupID[2], 2);
 	auto localBase = workgroupSize * wgID;
 	SIMD::Int globalInvocationID[3];
 	globalInvocationID[0] = SIMD::Int(Extract(localBase, 0)) + localInvocationID[0];
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index cfdf2d2..3405458 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -31,9 +31,10 @@
 {
 }
 
-// Union all cMask and return it as 4 booleans
-Int4 PixelProgram::maskAny(Int cMask[4], const SampleSet &samples)
+// Union all cMask and return it as Booleans
+SIMD::Int PixelProgram::maskAny(Int cMask[4], const SampleSet &samples)
 {
+	ASSERT(SIMD::Width == 4);
 	// See if at least 1 sample is used
 	Int maskUnion = 0;
 	for(unsigned int q : samples)
@@ -41,17 +42,18 @@
 		maskUnion |= cMask[q];
 	}
 
-	// Convert to 4 booleans
-	Int4 laneBits = Int4(1, 2, 4, 8);
-	Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
-	Int4 mask(maskUnion);
-	mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
+	// Convert to Booleans
+	SIMD::Int laneBits = SIMD::Int(1, 2, 4, 8);
+	SIMD::Int laneShiftsToMSB = SIMD::Int(31, 30, 29, 28);
+	SIMD::Int mask(maskUnion);
+	mask = ((mask & laneBits) << laneShiftsToMSB) >> 31;
 	return mask;
 }
 
-// Union all cMask/sMask/zMask and return it as 4 booleans
-Int4 PixelProgram::maskAny(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples)
+// Union all cMask/sMask/zMask and return it as Booleans
+SIMD::Int PixelProgram::maskAny(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples)
 {
+	ASSERT(SIMD::Width == 4);
 	// See if at least 1 sample is used
 	Int maskUnion = 0;
 	for(unsigned int q : samples)
@@ -59,15 +61,15 @@
 		maskUnion |= (cMask[q] & sMask[q] & zMask[q]);
 	}
 
-	// Convert to 4 booleans
-	Int4 laneBits = Int4(1, 2, 4, 8);
-	Int4 laneShiftsToMSB = Int4(31, 30, 29, 28);
-	Int4 mask(maskUnion);
-	mask = ((mask & laneBits) << laneShiftsToMSB) >> Int4(31);
+	// Convert to Booleans
+	SIMD::Int laneBits = SIMD::Int(1, 2, 4, 8);
+	SIMD::Int laneShiftsToMSB = SIMD::Int(31, 30, 29, 28);
+	SIMD::Int mask(maskUnion);
+	mask = ((mask & laneBits) << laneShiftsToMSB) >> 31;
 	return mask;
 }
 
-void PixelProgram::setBuiltins(Int &x, Int &y, Float4 (&z)[4], Float4 &w, Int cMask[4], const SampleSet &samples)
+void PixelProgram::setBuiltins(Int &x, Int &y, SIMD::Float (&z)[4], SIMD::Float &w, Int cMask[4], const SampleSet &samples)
 {
 	routine.setImmutableInputBuiltins(spirvShader);
 
@@ -148,28 +150,28 @@
 	if(it != spirvShader->inputBuiltins.end())
 	{
 		ASSERT(it->second.SizeInComponents == 1);
-		auto frontFacing = Int4(*Pointer<Int>(primitive + OFFSET(Primitive, clockwiseMask)));
-		routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<Float4>(frontFacing);
+		auto frontFacing = SIMD::Int(*Pointer<Int>(primitive + OFFSET(Primitive, clockwiseMask)));
+		routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<SIMD::Float>(frontFacing);
 	}
 
 	it = spirvShader->inputBuiltins.find(spv::BuiltInSampleMask);
 	if(it != spirvShader->inputBuiltins.end())
 	{
-		static_assert(SIMD::Width == 4, "Expects SIMD width to be 4");
-		Int4 laneBits = Int4(1, 2, 4, 8);
+		ASSERT(SIMD::Width == 4);
+		SIMD::Int laneBits = SIMD::Int(1, 2, 4, 8);
 
-		Int4 inputSampleMask = 0;
+		SIMD::Int inputSampleMask = 0;
 		for(unsigned int q : samples)
 		{
-			inputSampleMask |= Int4(1 << q) & CmpNEQ(Int4(cMask[q]) & laneBits, Int4(0));
+			inputSampleMask |= SIMD::Int(1 << q) & CmpNEQ(SIMD::Int(cMask[q]) & laneBits, 0);
 		}
 
-		routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<Float4>(inputSampleMask);
+		routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<SIMD::Float>(inputSampleMask);
 		// Sample mask input is an array, as the spec contemplates MSAA levels higher than 32.
 		// Fill any non-zero indices with 0.
 		for(auto i = 1u; i < it->second.SizeInComponents; i++)
 		{
-			routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = Float4(0);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = 0;
 		}
 	}
 
@@ -195,8 +197,8 @@
 
 	// Note: all lanes initially active to facilitate derivatives etc. Actual coverage is
 	// handled separately, through the cMask.
-	auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
-	auto storesAndAtomicsMask = maskAny(cMask, sMask, zMask, samples);
+	SIMD::Int activeLaneMask = 0xFFFFFFFF;
+	SIMD::Int storesAndAtomicsMask = maskAny(cMask, sMask, zMask, samples);
 	routine.discardMask = 0;
 
 	spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets, state.multiSampleCount);
@@ -395,10 +397,10 @@
 		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
 		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
-			color[index].x = Min(Max(color[index].x, Float4(0.0f)), Float4(1.0f));
-			color[index].y = Min(Max(color[index].y, Float4(0.0f)), Float4(1.0f));
-			color[index].z = Min(Max(color[index].z, Float4(0.0f)), Float4(1.0f));
-			color[index].w = Min(Max(color[index].w, Float4(0.0f)), Float4(1.0f));
+			color[index].x = Min(Max(color[index].x, 0.0f), 1.0f);
+			color[index].y = Min(Max(color[index].y, 0.0f), 1.0f);
+			color[index].z = Min(Max(color[index].z, 0.0f), 1.0f);
+			color[index].w = Min(Max(color[index].w, 0.0f), 1.0f);
 			break;
 		case VK_FORMAT_R32_SFLOAT:
 		case VK_FORMAT_R32G32_SFLOAT:
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
index 465d185..f367fee 100644
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -31,7 +31,7 @@
 	virtual ~PixelProgram() {}
 
 protected:
-	virtual void setBuiltins(Int &x, Int &y, Float4 (&z)[4], Float4 &w, Int cMask[4], const SampleSet &samples);
+	virtual void setBuiltins(Int &x, Int &y, SIMD::Float (&z)[4], SIMD::Float &w, Int cMask[4], const SampleSet &samples);
 	virtual void executeShader(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples);
 	virtual Bool alphaTest(Int cMask[4], const SampleSet &samples);
 	virtual void blendColor(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4], const SampleSet &samples);
@@ -43,8 +43,8 @@
 	// Raster operations
 	void clampColor(Vector4f color[MAX_COLOR_BUFFERS]);
 
-	static Int4 maskAny(Int cMask[4], const SampleSet &samples);
-	static Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples);
+	static SIMD::Int maskAny(Int cMask[4], const SampleSet &samples);
+	static SIMD::Int maskAny(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples);
 };
 
 }  // namespace sw
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 74fac88..05cc345 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -73,7 +73,7 @@
 
 	Int zMask[4];  // Depth mask
 	Int sMask[4];  // Stencil mask
-	Float4 unclampedZ[4];
+	SIMD::Float unclampedZ[4];
 
 	for(int invocation = 0; invocation < invocationCount; invocation++)
 	{
@@ -92,26 +92,26 @@
 
 		stencilTest(sBuffer, x, sMask, samples);
 
-		Float4 rhwCentroid;
+		SIMD::Float rhwCentroid;
 
-		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
+		SIMD::Float xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
 
 		if(interpolateZ())
 		{
 			for(unsigned int q : samples)
 			{
-				Float4 x = xxxx;
+				SIMD::Float x = xxxx;
 
 				if(state.enableMultiSampling)
 				{
-					x -= Float4(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsX) + q * sizeof(float)));
+					x -= SIMD::Float(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsX) + q * sizeof(float)));
 				}
 
 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
 
 				if(state.depthBias)
 				{
-					z[q] += Float4(*Pointer<Float>(primitive + OFFSET(Primitive, zBias)));
+					z[q] += SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, zBias)));
 				}
 
 				unclampedZ[q] = z[q];
@@ -140,21 +140,21 @@
 				occlusionSampleCount(zMask, sMask, samples);
 			}
 
-			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
+			SIMD::Float yyyy = SIMD::Float(Float(y)) + SIMD::Float(*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16));
 
 			// Centroid locations
-			Float4 XXXX = 0.0f;
-			Float4 YYYY = 0.0f;
+			SIMD::Float XXXX = 0.0f;
+			SIMD::Float YYYY = 0.0f;
 
 			if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
 			{
-				Float4 WWWW(1.0e-9f);
+				SIMD::Float WWWW = 1.0e-9f;
 
 				for(unsigned int q : samples)
 				{
-					XXXX += *Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]);
-					YYYY += *Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]);
-					WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
+					XXXX += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]));
+					YYYY += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]));
+					WWWW += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]));
 				}
 
 				WWWW = Rcp(WWWW, true /* relaxedPrecision */);
@@ -417,9 +417,9 @@
 	}
 }
 
-Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
 {
-	Float4 Z = z;
+	SIMD::Float Z = z;
 
 	Pointer<Byte> buffer = zBuffer + 4 * x;
 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
@@ -429,14 +429,14 @@
 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
 	}
 
-	Float4 zValue;
+	SIMD::Float zValue;
 
 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
 	{
 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
 	}
 
-	Int4 zTest;
+	SIMD::Int zTest;
 
 	switch(state.depthCompareMode)
 	{
@@ -489,7 +489,7 @@
 	return zMask != 0;
 }
 
-Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
 {
 	Short4 Z = convertFixed16(z, true);
 
@@ -566,7 +566,7 @@
 	return zMask != 0;
 }
 
-Float4 PixelRoutine::clampDepth(const Float4 &z)
+SIMD::Float PixelRoutine::clampDepth(const SIMD::Float &z)
 {
 	if(!state.depthClamp)
 	{
@@ -576,7 +576,7 @@
 	return Min(Max(z, state.minDepthClamp), state.maxDepthClamp);
 }
 
-Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
+Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
 {
 	if(!state.depthTestActive)
 	{
@@ -663,7 +663,7 @@
 	}
 }
 
-void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha, const SampleSet &samples)
+void PixelRoutine::alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples)
 {
 	static const int a2c[4] = {
 		OFFSET(DrawData, a2c0),
@@ -674,7 +674,7 @@
 
 	for(unsigned int q : samples)
 	{
-		Int4 coverage = CmpNLT(alpha, Float4(*Pointer<Float>(data + a2c[q])));
+		SIMD::Int coverage = CmpNLT(alpha, SIMD::Float(*Pointer<Float>(data + a2c[q])));
 		Int aMask = SignMask(coverage);
 		cMask[q] &= aMask;
 	}
@@ -1919,7 +1919,7 @@
 	}
 }
 
-void PixelRoutine::blendFactorAlpha(Float4 &blendFactorAlpha, const Float4 &sourceAlpha, const Float4 &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
+void PixelRoutine::blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
 {
 	switch(alphaBlendFactor)
 	{
@@ -1984,103 +1984,103 @@
 	}
 }
 
-Float4 PixelRoutine::blendOpOverlay(Float4 &src, Float4 &dst)
+SIMD::Float PixelRoutine::blendOpOverlay(SIMD::Float &src, SIMD::Float &dst)
 {
-	Int4 largeDst = CmpGT(dst, 0.5f);
-	return As<Float4>(
-	    (~largeDst & As<Int4>(2.0f * src * dst)) |
-	    (largeDst & As<Int4>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
+	SIMD::Int largeDst = CmpGT(dst, 0.5f);
+	return As<SIMD::Float>(
+	    (~largeDst & As<SIMD::Int>(2.0f * src * dst)) |
+	    (largeDst & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
 }
 
-Float4 PixelRoutine::blendOpColorDodge(Float4 &src, Float4 &dst)
+SIMD::Float PixelRoutine::blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst)
 {
-	Int4 srcBelowOne = CmpLT(src, 1.0f);
-	Int4 positiveDst = CmpGT(dst, 0.0f);
-	return As<Float4>(positiveDst & ((~srcBelowOne & As<Int4>(Float4(1.0f))) |
-	                                 (srcBelowOne & As<Int4>(Min(1.0f, (dst / (1.0f - src)))))));
+	SIMD::Int srcBelowOne = CmpLT(src, 1.0f);
+	SIMD::Int positiveDst = CmpGT(dst, 0.0f);
+	return As<SIMD::Float>(positiveDst & ((~srcBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
+	                                      (srcBelowOne & As<SIMD::Int>(Min(1.0f, (dst / (1.0f - src)))))));
 }
 
-Float4 PixelRoutine::blendOpColorBurn(Float4 &src, Float4 &dst)
+SIMD::Float PixelRoutine::blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst)
 {
-	Int4 dstBelowOne = CmpLT(dst, 1.0f);
-	Int4 positiveSrc = CmpGT(src, 0.0f);
-	return As<Float4>(
-	    (~dstBelowOne & As<Int4>(Float4(1.0f))) |
-	    (dstBelowOne & positiveSrc & As<Int4>(1.0f - Min(1.0f, (1.0f - dst) / src))));
+	SIMD::Int dstBelowOne = CmpLT(dst, 1.0f);
+	SIMD::Int positiveSrc = CmpGT(src, 0.0f);
+	return As<SIMD::Float>(
+	    (~dstBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
+	    (dstBelowOne & positiveSrc & As<SIMD::Int>(1.0f - Min(1.0f, (1.0f - dst) / src))));
 }
 
-Float4 PixelRoutine::blendOpHardlight(Float4 &src, Float4 &dst)
+SIMD::Float PixelRoutine::blendOpHardlight(SIMD::Float &src, SIMD::Float &dst)
 {
-	Int4 largeSrc = CmpGT(src, 0.5f);
-	return As<Float4>(
-	    (~largeSrc & As<Int4>(2.0f * src * dst)) |
-	    (largeSrc & As<Int4>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
+	SIMD::Int largeSrc = CmpGT(src, 0.5f);
+	return As<SIMD::Float>(
+	    (~largeSrc & As<SIMD::Int>(2.0f * src * dst)) |
+	    (largeSrc & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
 }
 
-Float4 PixelRoutine::blendOpSoftlight(Float4 &src, Float4 &dst)
+SIMD::Float PixelRoutine::blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst)
 {
-	Int4 largeSrc = CmpGT(src, 0.5f);
-	Int4 largeDst = CmpGT(dst, 0.25f);
+	SIMD::Int largeSrc = CmpGT(src, 0.5f);
+	SIMD::Int largeDst = CmpGT(dst, 0.25f);
 
-	return As<Float4>(
-	    (~largeSrc & As<Int4>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
-	    (largeSrc & ((~largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
-	                 (largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
+	return As<SIMD::Float>(
+	    (~largeSrc & As<SIMD::Int>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
+	    (largeSrc & ((~largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
+	                 (largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
 }
 
-Float4 PixelRoutine::maxRGB(Vector4f &c)
+SIMD::Float PixelRoutine::maxRGB(Vector4f &c)
 {
 	return Max(Max(c.x, c.y), c.z);
 }
 
-Float4 PixelRoutine::minRGB(Vector4f &c)
+SIMD::Float PixelRoutine::minRGB(Vector4f &c)
 {
 	return Min(Min(c.x, c.y), c.z);
 }
 
-void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
+void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
 {
-	Float4 minbase = minRGB(cbase);
-	Float4 sbase = maxRGB(cbase) - minbase;
-	Float4 ssat = maxRGB(csat) - minRGB(csat);
-	Int4 isNonZero = CmpGT(sbase, 0.0f);
+	SIMD::Float minbase = minRGB(cbase);
+	SIMD::Float sbase = maxRGB(cbase) - minbase;
+	SIMD::Float ssat = maxRGB(csat) - minRGB(csat);
+	SIMD::Int isNonZero = CmpGT(sbase, 0.0f);
 	Vector4f color;
-	color.x = As<Float4>(isNonZero & As<Int4>((cbase.x - minbase) * ssat / sbase));
-	color.y = As<Float4>(isNonZero & As<Int4>((cbase.y - minbase) * ssat / sbase));
-	color.z = As<Float4>(isNonZero & As<Int4>((cbase.z - minbase) * ssat / sbase));
+	color.x = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.x - minbase) * ssat / sbase));
+	color.y = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.y - minbase) * ssat / sbase));
+	color.z = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.z - minbase) * ssat / sbase));
 	setLum(color, clum, x, y, z);
 }
 
-Float4 PixelRoutine::lumRGB(Vector4f &c)
+SIMD::Float PixelRoutine::lumRGB(Vector4f &c)
 {
 	return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
 }
 
-Float4 PixelRoutine::computeLum(Float4 &color, Float4 &lum, Float4 &mincol, Float4 &maxcol, Int4 &negative, Int4 &aboveOne)
+SIMD::Float PixelRoutine::computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne)
 {
-	return As<Float4>(
-	    (negative & As<Int4>(lum + ((color - lum) * lum) / (lum - mincol))) |
-	    (~negative & ((aboveOne & As<Int4>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
-	                  (~aboveOne & As<Int4>(color)))));
+	return As<SIMD::Float>(
+	    (negative & As<SIMD::Int>(lum + ((color - lum) * lum) / (lum - mincol))) |
+	    (~negative & ((aboveOne & As<SIMD::Int>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
+	                  (~aboveOne & As<SIMD::Int>(color)))));
 }
 
-void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
+void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
 {
-	Float4 lbase = lumRGB(cbase);
-	Float4 llum = lumRGB(clum);
-	Float4 ldiff = llum - lbase;
+	SIMD::Float lbase = lumRGB(cbase);
+	SIMD::Float llum = lumRGB(clum);
+	SIMD::Float ldiff = llum - lbase;
 
 	Vector4f color;
 	color.x = cbase.x + ldiff;
 	color.y = cbase.y + ldiff;
 	color.z = cbase.z + ldiff;
 
-	Float4 lum = lumRGB(color);
-	Float4 mincol = minRGB(color);
-	Float4 maxcol = maxRGB(color);
+	SIMD::Float lum = lumRGB(color);
+	SIMD::Float mincol = minRGB(color);
+	SIMD::Float maxcol = maxRGB(color);
 
-	Int4 negative = CmpLT(mincol, 0.0f);
-	Int4 aboveOne = CmpGT(maxcol, 1.0f);
+	SIMD::Int negative = CmpLT(mincol, 0.0f);
+	SIMD::Int aboveOne = CmpGT(maxcol, 1.0f);
 
 	x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
 	y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
@@ -2089,10 +2089,10 @@
 
 void PixelRoutine::premultiply(Vector4f &c)
 {
-	Int4 nonZeroAlpha = CmpNEQ(c.w, 0.0f);
-	c.x = As<Float4>(nonZeroAlpha & As<Int4>(c.x / c.w));
-	c.y = As<Float4>(nonZeroAlpha & As<Int4>(c.y / c.w));
-	c.z = As<Float4>(nonZeroAlpha & As<Int4>(c.z / c.w));
+	SIMD::Int nonZeroAlpha = CmpNEQ(c.w, 0.0f);
+	c.x = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.x / c.w));
+	c.y = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.y / c.w));
+	c.z = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.z / c.w));
 }
 
 Vector4f PixelRoutine::computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor)
@@ -2188,7 +2188,7 @@
 		break;
 	}
 
-	Float4 p = srcColor.w * dstColor.w;
+	SIMD::Float p = srcColor.w * dstColor.w;
 	blendedColor.x *= p;
 	blendedColor.y *= p;
 	blendedColor.z *= p;
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index 855021e..229455d 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -37,14 +37,14 @@
 protected:
 	using SampleSet = std::vector<int>;
 
-	Float4 z[4];  // Multisampled z
-	Float4 w;     // Used as is
-	Float4 rhw;   // Reciprocal w
+	SIMD::Float z[4];  // Multisampled z
+	SIMD::Float w;     // Used as is
+	SIMD::Float rhw;   // Reciprocal w
 
 	SpirvRoutine routine;
 	const vk::DescriptorSet::Bindings &descriptorSets;
 
-	virtual void setBuiltins(Int &x, Int &y, Float4 (&z)[4], Float4 &w, Int cMask[4], const SampleSet &samples) = 0;
+	virtual void setBuiltins(Int &x, Int &y, SIMD::Float (&z)[4], SIMD::Float &w, Int cMask[4], const SampleSet &samples) = 0;
 	virtual void executeShader(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples) = 0;
 	virtual Bool alphaTest(Int cMask[4], const SampleSet &samples) = 0;
 	virtual void blendColor(Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4], const SampleSet &samples) = 0;
@@ -52,7 +52,7 @@
 	void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) override;
 
 	void alphaTest(Int &aMask, const Short4 &alpha);
-	void alphaToCoverage(Int cMask[4], const Float4 &alpha, const SampleSet &samples);
+	void alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples);
 
 	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask);
 	Vector4f alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x);
@@ -70,29 +70,31 @@
 	void stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack);
 	Byte8 stencilOperation(const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask);
 	Byte8 stencilOperation(const Byte8 &bufferValue, VkStencilOp operation, bool isBack);
-	Float4 clampDepth(const Float4 &z);
-	Bool depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
+	SIMD::Float clampDepth(const SIMD::Float &z);
+	Bool depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask);
 	void depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask);
 
 	void readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel);
 	enum BlendFactorModifier { None, OneMinus };
 	Float blendConstant(vk::Format format, int component, BlendFactorModifier modifier = None);
 	void blendFactorRGB(Vector4f &blendFactorRGB, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format);
-	void blendFactorAlpha(Float4 &blendFactorAlpha, const Float4 &sourceAlpha, const Float4 &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format);
+	void blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format);
+
 	bool blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format);
 	Vector4f computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor);
-	Float4 blendOpOverlay(Float4 &src, Float4 &dst);
-	Float4 blendOpColorDodge(Float4 &src, Float4 &dst);
-	Float4 blendOpColorBurn(Float4 &src, Float4 &dst);
-	Float4 blendOpHardlight(Float4 &src, Float4 &dst);
-	Float4 blendOpSoftlight(Float4 &src, Float4 &dst);
-	void setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z);
-	void setLum(Vector4f &cbase, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z);
-	Float4 computeLum(Float4 &color, Float4 &lum, Float4 &mincol, Float4 &maxcol, Int4 &negative, Int4 &aboveOne);
-	Float4 maxRGB(Vector4f &c);
-	Float4 minRGB(Vector4f &c);
-	Float4 lumRGB(Vector4f &c);
+	SIMD::Float blendOpOverlay(SIMD::Float &src, SIMD::Float &dst);
+	SIMD::Float blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst);
+	SIMD::Float blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst);
+	SIMD::Float blendOpHardlight(SIMD::Float &src, SIMD::Float &dst);
+	SIMD::Float blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst);
+	void setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
+	void setLum(Vector4f &cbase, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
+	SIMD::Float computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne);
+	SIMD::Float maxRGB(Vector4f &c);
+	SIMD::Float minRGB(Vector4f &c);
+	SIMD::Float lumRGB(Vector4f &c);
 	void premultiply(Vector4f &c);
+
 	void writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples);
 	void writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples);
 	void occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples);
@@ -101,8 +103,8 @@
 	void linearToSRGB16_12_16(Vector4s &c);
 	Float4 sRGBtoLinear(const Float4 &x);
 
-	Bool depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
-	Bool depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask);
+	Bool depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask);
+	Bool depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask);
 
 	void writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
 	void writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask);
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index d700a5d..3162e29 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -27,7 +27,8 @@
     , function(function)
 {
 }
-Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 uvwa[4], Float4 &dRef, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4i &offset, Int4 &sample)
+
+Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], SIMD::Float &dRef, Float &&lodOrBias, SIMD::Float &dsx, SIMD::Float &dsy, Vector4i offset, SIMD::Int &sample)
 {
 	Vector4f c;
 
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
index 757dc7c..632f894 100644
--- a/src/Pipeline/SamplerCore.hpp
+++ b/src/Pipeline/SamplerCore.hpp
@@ -61,7 +61,7 @@
 public:
 	SamplerCore(Pointer<Byte> &constants, const Sampler &state, SamplerFunction function);
 
-	Vector4f sampleTexture(Pointer<Byte> &texture, Float4 uvwa[4], Float4 &q, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4i &offset, Int4 &sample);
+	Vector4f sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], SIMD::Float &dRef, Float &&lodOrBias, SIMD::Float &dsx, SIMD::Float &dsy, Vector4i offset, SIMD::Int &sample);
 
 private:
 	Float4 applySwizzle(const Vector4f &c, VkComponentSwizzle swizzle, bool integer);
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index a01759a..5d8ddae 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -1027,7 +1027,7 @@
 	static bool IsStorageInterleavedByLane(spv::StorageClass storageClass);
 	static bool IsExplicitLayout(spv::StorageClass storageClass);
 
-	static sw::SIMD::Pointer GetElementPointer(sw::SIMD::Pointer structure, uint32_t offset, bool interleavedByLane);
+	static SIMD::Pointer GetElementPointer(SIMD::Pointer structure, uint32_t offset, bool interleavedByLane);
 
 	// Output storage buffers and images should not be affected by helper invocations
 	static bool StoresInHelperInvocation(spv::StorageClass storageClass);
@@ -1419,8 +1419,8 @@
 		AtSample,
 		AtOffset,
 	};
-	SIMD::Float Interpolate(SIMD::Pointer const &ptr, int32_t location, Object::ID paramId,
-	                        uint32_t component, EmitState *state, InterpolationType type) const;
+	SIMD::Float EmitInterpolate(SIMD::Pointer const &ptr, int32_t location, Object::ID paramId,
+	                            uint32_t component, EmitState *state, InterpolationType type) const;
 
 	// Helper for implementing OpStore, which doesn't take an InsnIterator so it
 	// can also store independent operands.
diff --git a/src/Pipeline/SpirvShaderGLSLstd450.cpp b/src/Pipeline/SpirvShaderGLSLstd450.cpp
index e4e341c..bbbd951 100644
--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -21,20 +21,21 @@
 #include <spirv/unified1/GLSL.std.450.h>
 #include <spirv/unified1/spirv.hpp>
 
-namespace {
-constexpr float PI = 3.141592653589793f;
+namespace sw {
 
-sw::SIMD::Float Interpolate(const sw::SIMD::Float &x, const sw::SIMD::Float &y, const sw::SIMD::Float &rhw,
-                            const sw::SIMD::Float &A, const sw::SIMD::Float &B, const sw::SIMD::Float &C,
-                            sw::SpirvRoutine::Interpolation interpolation)
+static constexpr float PI = 3.141592653589793f;
+
+static SIMD::Float Interpolate(const SIMD::Float &x, const SIMD::Float &y, const SIMD::Float &rhw,
+                               const SIMD::Float &A, const SIMD::Float &B, const SIMD::Float &C,
+                               SpirvRoutine::Interpolation interpolation)
 {
-	sw::SIMD::Float interpolant = C;
+	SIMD::Float interpolant = C;
 
-	if(interpolation != sw::SpirvRoutine::Flat)
+	if(interpolation != SpirvRoutine::Flat)
 	{
 		interpolant += x * A + y * B;
 
-		if(interpolation == sw::SpirvRoutine::Perspective)
+		if(interpolation == SpirvRoutine::Perspective)
 		{
 			interpolant *= rhw;
 		}
@@ -43,10 +44,6 @@
 	return interpolant;
 }
 
-}  // namespace
-
-namespace sw {
-
 SpirvShader::EmitResult SpirvShader::EmitExtGLSLstd450(InsnIterator insn, EmitState *state) const
 {
 	auto &type = getType(insn.resultTypeId());
@@ -135,7 +132,7 @@
 				auto x = Round(src.Float(i));
 				// dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));
 				dst.move(i, x + ((SIMD::Float(CmpLT(x, src.Float(i)) & SIMD::Int(1)) * SIMD::Float(2.0f)) - SIMD::Float(1.0f)) *
-				                    SIMD::Float(CmpEQ(Frac(src.Float(i)), SIMD::Float(0.5f)) & SIMD::Int(1)) * SIMD::Float(Int4(x) & SIMD::Int(1)));
+				                    SIMD::Float(CmpEQ(Frac(src.Float(i)), SIMD::Float(0.5f)) & SIMD::Int(1)) * SIMD::Float(SIMD::Int(x) & SIMD::Int(1)));
 			}
 		}
 		break;
@@ -216,11 +213,8 @@
 			auto x = Operand(this, state, insn.word(7));
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				auto tx = Min(Max((x.Float(i) - edge0.Float(i)) /
-				                      (edge1.Float(i) - edge0.Float(i)),
-				                  SIMD::Float(0.0f)),
-				              SIMD::Float(1.0f));
-				dst.move(i, tx * tx * (Float4(3.0f) - Float4(2.0f) * tx));
+				auto tx = Min(Max((x.Float(i) - edge0.Float(i)) / (edge1.Float(i) - edge0.Float(i)), 0.0f), 1.0f);
+				dst.move(i, tx * tx * (3.0f - 2.0f * tx));
 			}
 		}
 		break;
@@ -602,7 +596,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Sin(radians.Float(i), d.RelaxedPrecision));
+				dst.move(i, Sin(radians.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -613,7 +607,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Cos(radians.Float(i), d.RelaxedPrecision));
+				dst.move(i, Cos(radians.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -624,7 +618,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Tan(radians.Float(i), d.RelaxedPrecision));
+				dst.move(i, Tan(radians.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -635,7 +629,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Asin(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Asin(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -646,7 +640,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Acos(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Acos(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -657,7 +651,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Atan(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Atan(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -668,7 +662,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Sinh(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Sinh(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -679,7 +673,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Cosh(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Cosh(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -690,7 +684,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Tanh(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Tanh(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -701,7 +695,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Asinh(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Asinh(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -712,7 +706,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Acosh(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Acosh(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -723,7 +717,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Atanh(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Atanh(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -735,7 +729,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Atan2(x.Float(i), y.Float(i), d.RelaxedPrecision));
+				dst.move(i, Atan2(x.Float(i), y.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -747,7 +741,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Pow(x.Float(i), y.Float(i), d.RelaxedPrecision));
+				dst.move(i, Pow(x.Float(i), y.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -758,7 +752,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Exp(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Exp(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -769,7 +763,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Log(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Log(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -780,7 +774,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Exp2(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Exp2(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -791,7 +785,7 @@
 
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, sw::Log2(val.Float(i), d.RelaxedPrecision));
+				dst.move(i, Log2(val.Float(i), d.RelaxedPrecision));
 			}
 		}
 		break;
@@ -943,7 +937,7 @@
 			auto ptr = state->getPointer(insn.word(5));
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, Interpolate(ptr, d.Location, 0, i, state, SpirvShader::Centroid));
+				dst.move(i, EmitInterpolate(ptr, d.Location, 0, i, state, SpirvShader::Centroid));
 			}
 		}
 		break;
@@ -953,7 +947,7 @@
 			auto ptr = state->getPointer(insn.word(5));
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, Interpolate(ptr, d.Location, insn.word(6), i, state, SpirvShader::AtSample));
+				dst.move(i, EmitInterpolate(ptr, d.Location, insn.word(6), i, state, SpirvShader::AtSample));
 			}
 		}
 		break;
@@ -963,7 +957,7 @@
 			auto ptr = state->getPointer(insn.word(5));
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, Interpolate(ptr, d.Location, insn.word(6), i, state, SpirvShader::AtOffset));
+				dst.move(i, EmitInterpolate(ptr, d.Location, insn.word(6), i, state, SpirvShader::AtOffset));
 			}
 		}
 		break;
@@ -1007,8 +1001,8 @@
 	return EmitResult::Continue;
 }
 
-SIMD::Float SpirvShader::Interpolate(SIMD::Pointer const &ptr, int32_t location, Object::ID paramId,
-                                     uint32_t component, EmitState *state, InterpolationType type) const
+SIMD::Float SpirvShader::EmitInterpolate(SIMD::Pointer const &ptr, int32_t location, Object::ID paramId,
+                                         uint32_t component, EmitState *state, InterpolationType type) const
 {
 	uint32_t interpolant = (location * 4);
 	uint32_t components_per_row = GetNumInputComponents(location);
@@ -1056,7 +1050,7 @@
 			// input variable is undefined, so we just clamp to avoid OOB accesses.
 			SIMD::Int samples = sampleOperand.Int(0) & SIMD::Int(NUM_SAMPLES - 1);
 
-			for(int i = 0; i < SIMD::Width; ++i)
+			for(int i = 0; i < SIMD::Width; i++)
 			{
 				Int sample = Extract(samples, i);
 				x = Insert(x, *Pointer<Float>(state->routine->constants + OFFSET(Constants, SampleLocationsX) + sample * sizeof(float)), i);
@@ -1098,7 +1092,7 @@
 		SIMD::Float B;
 		SIMD::Float C;
 
-		for(int i = 0; i < SIMD::Width; ++i)
+		for(int i = 0; i < SIMD::Width; i++)
 		{
 			Int offset = ((Extract(ptr.dynamicOffsets, i) + ptr.staticOffsets[i]) >> offsetShift) + component;
 			Pointer<Byte> planeEquationI = planeEquation + (offset * sizeof(PlaneEquation));
@@ -1107,7 +1101,7 @@
 			C = Insert(C, *Pointer<Float>(planeEquationI + OFFSET(PlaneEquation, C)), i);
 		}
 
-		return ::Interpolate(x, y, rhw, A, B, C, state->routine->inputsInterpolation[packedInterpolant]);
+		return Interpolate(x, y, rhw, A, B, C, state->routine->inputsInterpolation[packedInterpolant]);
 	}
 	else
 	{
@@ -1136,7 +1130,7 @@
 		B = *Pointer<Float>(planeEquation + OFFSET(PlaneEquation, B));
 	}
 
-	return ::Interpolate(x, y, rhw, A, B, C, interpolation);
+	return Interpolate(x, y, rhw, A, B, C, interpolation);
 }
 
 }  // namespace sw
\ No newline at end of file
diff --git a/src/Pipeline/SpirvShaderMemory.cpp b/src/Pipeline/SpirvShaderMemory.cpp
index b3caa83..9b27561 100644
--- a/src/Pipeline/SpirvShaderMemory.cpp
+++ b/src/Pipeline/SpirvShaderMemory.cpp
@@ -574,10 +574,10 @@
 {
 	if(interleavedByLane)
 	{
-		structure.staticOffsets[0] += 0 * sizeof(float);
-		structure.staticOffsets[1] += 1 * sizeof(float);
-		structure.staticOffsets[2] += 2 * sizeof(float);
-		structure.staticOffsets[3] += 3 * sizeof(float);
+		for(int i = 0; i < SIMD::Width; i++)
+		{
+			structure.staticOffsets[i] += i * sizeof(float);
+		}
 
 		return structure + offset * sw::SIMD::Width;
 	}
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index 7583b12..5a3dc2a 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -80,7 +80,8 @@
 	}
 
 	auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
-	Int4 storesAndAtomicsMask = CmpGE(UInt4(vertexCount), UInt4(1, 2, 3, 4));
+	ASSERT(SIMD::Width == 4);
+	SIMD::Int storesAndAtomicsMask = CmpGE(SIMD::UInt(vertexCount), SIMD::UInt(1, 2, 3, 4));
 	spirvShader->emit(&routine, activeLaneMask, storesAndAtomicsMask, descriptorSets);
 
 	spirvShader->emitEpilog(&routine);
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 6e8d3cf..535dc97 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -124,10 +124,10 @@
 		auto posZ = pos[it->second.FirstComponent + 2];
 		auto posW = pos[it->second.FirstComponent + 3];
 
-		Int4 maxX = CmpLT(posW, posX);
-		Int4 maxY = CmpLT(posW, posY);
-		Int4 minX = CmpNLE(-posW, posX);
-		Int4 minY = CmpNLE(-posW, posY);
+		SIMD::Int maxX = CmpLT(posW, posX);
+		SIMD::Int maxY = CmpLT(posW, posY);
+		SIMD::Int minX = CmpNLE(-posW, posX);
+		SIMD::Int minY = CmpNLE(-posW, posY);
 
 		clipFlags = Pointer<Int>(constants + OFFSET(Constants, maxX))[SignMask(maxX)];
 		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxY))[SignMask(maxY)];
@@ -135,18 +135,18 @@
 		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minY))[SignMask(minY)];
 		if(state.depthClipEnable)
 		{
-			Int4 maxZ = CmpLT(posW, posZ);
-			Int4 minZ = CmpNLE(Float4(0.0f), posZ);
+			SIMD::Int maxZ = CmpLT(posW, posZ);
+			SIMD::Int minZ = CmpNLE(0.0f, posZ);
 			clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxZ))[SignMask(maxZ)];
 			clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minZ))[SignMask(minZ)];
 		}
 
-		Float4 maxPos = As<Float4>(Int4(0x7F7FFFFF));
-		Int4 finiteX = CmpLE(Abs(posX), maxPos);
-		Int4 finiteY = CmpLE(Abs(posY), maxPos);
-		Int4 finiteZ = CmpLE(Abs(posZ), maxPos);
+		SIMD::Float maxPos = As<SIMD::Float>(SIMD::Int(0x7F7FFFFF));
+		SIMD::Int finiteX = CmpLE(Abs(posX), maxPos);
+		SIMD::Int finiteY = CmpLE(Abs(posY), maxPos);
+		SIMD::Int finiteZ = CmpLE(Abs(posZ), maxPos);
 
-		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
+		SIMD::Int finiteXYZ = finiteX & finiteY & finiteZ;
 		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, fini))[SignMask(finiteXYZ)];
 	}
 }
@@ -597,12 +597,12 @@
 		pos.w = position[it->second.FirstComponent + 3];
 
 		// Projection and viewport transform.
-		Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
-		Float4 rhw = Float4(1.0f) / w;
+		SIMD::Float w = As<SIMD::Float>(As<SIMD::Int>(pos.w) | (As<SIMD::Int>(CmpEQ(pos.w, 0.0f)) & As<SIMD::Int>(SIMD::Float(1.0f))));
+		SIMD::Float rhw = 1.0f / w;
 
 		Vector4f proj;
-		proj.x = As<Float4>(RoundIntClamped(Float4(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * Float4(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
-		proj.y = As<Float4>(RoundIntClamped(Float4(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * Float4(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
+		proj.x = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
+		proj.y = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
 		proj.z = pos.z * rhw;
 		proj.w = rhw;