Replace sw::SIMD aliases of 4-wide vectors with rr::SIMD types

sw::SIMD::Float was an alias of rr::Float4, but is now rr::SIMD::Float.
Likewise for the Int and UInt counterparts. rr::Pointer4 has become
rr::SIMD::Pointer.

SIMD::Width was set to 4 for both backends, so functionally nothing
changes and none of the asserts are triggered.

The new SIMD::Float4 type is equivalent to Vector4f except each
component is a SIMD vector. Vector4f is still used in places where
4-wide vectors are assumed, but should be eliminated in future changes.

Bug: b/214583550
Bug: b/236162233
Change-Id: Ib15ae2f9883b989b30de58fda16d7e24fdca4a1a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/66752
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 3405458..6eb60b0 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -302,13 +302,14 @@
 			{
 				Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
 
-				Vector4f colorf = alphaBlend(index, buffer, c[index], x);
+				SIMD::Float4 colorf = alphaBlend(index, buffer, c[index], x);
 
+				ASSERT(SIMD::Width == 4);
 				Vector4s color;
-				color.x = convertFixed16(colorf.x, true);
-				color.y = convertFixed16(colorf.y, true);
-				color.z = convertFixed16(colorf.z, true);
-				color.w = convertFixed16(colorf.w, true);
+				color.x = convertFixed16(Extract128(colorf.x, 0), true);
+				color.y = convertFixed16(Extract128(colorf.y, 0), true);
+				color.z = convertFixed16(Extract128(colorf.z, 0), true);
+				color.w = convertFixed16(Extract128(colorf.w, 0), true);
 				writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
 			}
 			break;
@@ -348,7 +349,13 @@
 			{
 				Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
 
-				Vector4f color = alphaBlend(index, buffer, c[index], x);
+				SIMD::Float4 C = alphaBlend(index, buffer, c[index], x);
+				ASSERT(SIMD::Width == 4);
+				Vector4f color;
+				color.x = Extract128(C.x, 0);
+				color.y = Extract128(C.y, 0);
+				color.z = Extract128(C.z, 0);
+				color.w = Extract128(C.w, 0);
 				writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
 			}
 			break;
@@ -358,7 +365,7 @@
 	}
 }
 
-void PixelProgram::clampColor(Vector4f color[MAX_COLOR_BUFFERS])
+void PixelProgram::clampColor(SIMD::Float4 color[MAX_COLOR_BUFFERS])
 {
 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors
 	//  are each clamped to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
index f367fee..35904a4 100644
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -38,10 +38,10 @@
 
 private:
 	// Color outputs
-	Vector4f c[MAX_COLOR_BUFFERS];
+	SIMD::Float4 c[MAX_COLOR_BUFFERS];
 
 	// Raster operations
-	void clampColor(Vector4f color[MAX_COLOR_BUFFERS]);
+	void clampColor(SIMD::Float4 color[MAX_COLOR_BUFFERS]);
 
 	static SIMD::Int maskAny(Int cMask[4], const SampleSet &samples);
 	static SIMD::Int maskAny(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples);
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 05cc345..ca86a18 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -140,6 +140,7 @@
 				occlusionSampleCount(zMask, sMask, samples);
 			}
 
+			ASSERT(SIMD::Width == 4);
 			SIMD::Float yyyy = SIMD::Float(Float(y)) + SIMD::Float(*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16));
 
 			// Centroid locations
@@ -152,6 +153,7 @@
 
 				for(unsigned int q : samples)
 				{
+					ASSERT(SIMD::Width == 4);
 					XXXX += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]));
 					YYYY += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]));
 					WWWW += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]));
@@ -421,6 +423,7 @@
 {
 	SIMD::Float Z = z;
 
+	ASSERT(SIMD::Width == 4);
 	Pointer<Byte> buffer = zBuffer + 4 * x;
 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
 
@@ -433,6 +436,7 @@
 
 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
 	{
+		ASSERT(SIMD::Width == 4);
 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
 	}
 
@@ -491,7 +495,8 @@
 
 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
 {
-	Short4 Z = convertFixed16(z, true);
+	ASSERT(SIMD::Width == 4);
+	Short4 Z = convertFixed16(Extract128(z, 0), true);
 
 	Pointer<Byte> buffer = zBuffer + 2 * x;
 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
@@ -744,14 +749,15 @@
 
 	for(unsigned int q : samples)
 	{
+		ASSERT(SIMD::Width == 4);
 		switch(state.depthFormat)
 		{
 		case VK_FORMAT_D16_UNORM:
-			writeDepth16(zBuffer, q, x, z[q], zMask[q]);
+			writeDepth16(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
 			break;
 		case VK_FORMAT_D32_SFLOAT:
 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
-			writeDepth32F(zBuffer, q, x, z[q], zMask[q]);
+			writeDepth32F(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
 			break;
 		default:
 			UNSUPPORTED("Depth format: %d", int(state.depthFormat));
@@ -1814,7 +1820,7 @@
 	}
 }
 
-void PixelRoutine::blendFactorRGB(Vector4f &blendFactor, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
+void PixelRoutine::blendFactorRGB(SIMD::Float4 &blendFactor, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
 {
 	switch(colorBlendFactor)
 	{
@@ -2028,30 +2034,30 @@
 	                 (largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
 }
 
-SIMD::Float PixelRoutine::maxRGB(Vector4f &c)
+SIMD::Float PixelRoutine::maxRGB(SIMD::Float4 &c)
 {
 	return Max(Max(c.x, c.y), c.z);
 }
 
-SIMD::Float PixelRoutine::minRGB(Vector4f &c)
+SIMD::Float PixelRoutine::minRGB(SIMD::Float4 &c)
 {
 	return Min(Min(c.x, c.y), c.z);
 }
 
-void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
+void PixelRoutine::setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
 {
 	SIMD::Float minbase = minRGB(cbase);
 	SIMD::Float sbase = maxRGB(cbase) - minbase;
 	SIMD::Float ssat = maxRGB(csat) - minRGB(csat);
 	SIMD::Int isNonZero = CmpGT(sbase, 0.0f);
-	Vector4f color;
+	SIMD::Float4 color;
 	color.x = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.x - minbase) * ssat / sbase));
 	color.y = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.y - minbase) * ssat / sbase));
 	color.z = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.z - minbase) * ssat / sbase));
 	setLum(color, clum, x, y, z);
 }
 
-SIMD::Float PixelRoutine::lumRGB(Vector4f &c)
+SIMD::Float PixelRoutine::lumRGB(SIMD::Float4 &c)
 {
 	return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
 }
@@ -2064,13 +2070,13 @@
 	                  (~aboveOne & As<SIMD::Int>(color)))));
 }
 
-void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
+void PixelRoutine::setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
 {
 	SIMD::Float lbase = lumRGB(cbase);
 	SIMD::Float llum = lumRGB(clum);
 	SIMD::Float ldiff = llum - lbase;
 
-	Vector4f color;
+	SIMD::Float4 color;
 	color.x = cbase.x + ldiff;
 	color.y = cbase.y + ldiff;
 	color.z = cbase.z + ldiff;
@@ -2087,7 +2093,7 @@
 	z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
 }
 
-void PixelRoutine::premultiply(Vector4f &c)
+void PixelRoutine::premultiply(SIMD::Float4 &c)
 {
 	SIMD::Int nonZeroAlpha = CmpNEQ(c.w, 0.0f);
 	c.x = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.x / c.w));
@@ -2095,15 +2101,15 @@
 	c.z = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.z / c.w));
 }
 
-Vector4f PixelRoutine::computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor)
+SIMD::Float4 PixelRoutine::computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor)
 {
-	Vector4f srcColor = src;
+	SIMD::Float4 srcColor = src;
 	srcColor.x *= srcFactor.x;
 	srcColor.y *= srcFactor.y;
 	srcColor.z *= srcFactor.z;
 	srcColor.w *= srcFactor.w;
 
-	Vector4f dstColor = dst;
+	SIMD::Float4 dstColor = dst;
 	dstColor.x *= dstFactor.x;
 	dstColor.y *= dstFactor.y;
 	dstColor.z *= dstFactor.z;
@@ -2112,7 +2118,7 @@
 	premultiply(srcColor);
 	premultiply(dstColor);
 
-	Vector4f blendedColor;
+	SIMD::Float4 blendedColor;
 
 	switch(state.blendState[index].blendOperation)
 	{
@@ -2242,7 +2248,7 @@
 	}
 }
 
-Vector4f PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x)
+SIMD::Float4 PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x)
 {
 	if(!state.blendState[index].alphaBlendEnable)
 	{
@@ -2255,11 +2261,11 @@
 	Pointer<Byte> buffer = cBuffer;
 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
 
-	// destColor holds four texel color values.
+	// texelColor holds four texel color values.
 	// Note: Despite the type being Vector4f, the colors may be stored as
 	// integers. Half-floats are stored as full 32-bit floats.
 	// Non-float and non-fixed point formats are not alpha blended.
-	Vector4f destColor;
+	Vector4f texelColor;
 
 	switch(format)
 	{
@@ -2268,161 +2274,168 @@
 	case VK_FORMAT_R32_SFLOAT:
 		// FIXME: movlps
 		buffer += 4 * x;
-		destColor.x.x = *Pointer<Float>(buffer + 0);
-		destColor.x.y = *Pointer<Float>(buffer + 4);
+		texelColor.x.x = *Pointer<Float>(buffer + 0);
+		texelColor.x.y = *Pointer<Float>(buffer + 4);
 		buffer += pitchB;
 		// FIXME: movhps
-		destColor.x.z = *Pointer<Float>(buffer + 0);
-		destColor.x.w = *Pointer<Float>(buffer + 4);
-		destColor.y = destColor.z = destColor.w = 1.0f;
+		texelColor.x.z = *Pointer<Float>(buffer + 0);
+		texelColor.x.w = *Pointer<Float>(buffer + 4);
+		texelColor.y = texelColor.z = texelColor.w = 1.0f;
 		break;
 	case VK_FORMAT_R32G32_SINT:
 	case VK_FORMAT_R32G32_UINT:
 	case VK_FORMAT_R32G32_SFLOAT:
 		buffer += 8 * x;
-		destColor.x = *Pointer<Float4>(buffer, 16);
+		texelColor.x = *Pointer<Float4>(buffer, 16);
 		buffer += pitchB;
-		destColor.y = *Pointer<Float4>(buffer, 16);
-		destColor.z = destColor.x;
-		destColor.x = ShuffleLowHigh(destColor.x, destColor.y, 0x0202);
-		destColor.z = ShuffleLowHigh(destColor.z, destColor.y, 0x1313);
-		destColor.y = destColor.z;
-		destColor.z = destColor.w = 1.0f;
+		texelColor.y = *Pointer<Float4>(buffer, 16);
+		texelColor.z = texelColor.x;
+		texelColor.x = ShuffleLowHigh(texelColor.x, texelColor.y, 0x0202);
+		texelColor.z = ShuffleLowHigh(texelColor.z, texelColor.y, 0x1313);
+		texelColor.y = texelColor.z;
+		texelColor.z = texelColor.w = 1.0f;
 		break;
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
 	case VK_FORMAT_R32G32B32A32_SINT:
 	case VK_FORMAT_R32G32B32A32_UINT:
 		buffer += 16 * x;
-		destColor.x = *Pointer<Float4>(buffer + 0, 16);
-		destColor.y = *Pointer<Float4>(buffer + 16, 16);
+		texelColor.x = *Pointer<Float4>(buffer + 0, 16);
+		texelColor.y = *Pointer<Float4>(buffer + 16, 16);
 		buffer += pitchB;
-		destColor.z = *Pointer<Float4>(buffer + 0, 16);
-		destColor.w = *Pointer<Float4>(buffer + 16, 16);
-		transpose4x4(destColor.x, destColor.y, destColor.z, destColor.w);
+		texelColor.z = *Pointer<Float4>(buffer + 0, 16);
+		texelColor.w = *Pointer<Float4>(buffer + 16, 16);
+		transpose4x4(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
 		break;
 	case VK_FORMAT_R16_UNORM:
 		buffer += 2 * x;
-		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
-		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
+		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
+		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
 		buffer += pitchB;
-		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
-		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
-		destColor.x *= (1.0f / 0xFFFF);
-		destColor.y = destColor.z = destColor.w = 1.0f;
+		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
+		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
+		texelColor.x *= (1.0f / 0xFFFF);
+		texelColor.y = texelColor.z = texelColor.w = 1.0f;
 		break;
 	case VK_FORMAT_R16_SFLOAT:
 		buffer += 2 * x;
-		destColor.x.x = Float(*Pointer<Half>(buffer + 0));
-		destColor.x.y = Float(*Pointer<Half>(buffer + 2));
+		texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
+		texelColor.x.y = Float(*Pointer<Half>(buffer + 2));
 		buffer += pitchB;
-		destColor.x.z = Float(*Pointer<Half>(buffer + 0));
-		destColor.x.w = Float(*Pointer<Half>(buffer + 2));
-		destColor.y = destColor.z = destColor.w = 1.0f;
+		texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
+		texelColor.x.w = Float(*Pointer<Half>(buffer + 2));
+		texelColor.y = texelColor.z = texelColor.w = 1.0f;
 		break;
 	case VK_FORMAT_R16G16_UNORM:
 		buffer += 4 * x;
-		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
-		destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
-		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
-		destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
+		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
+		texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
+		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
+		texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
 		buffer += pitchB;
-		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
-		destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
-		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
-		destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
-		destColor.x *= (1.0f / 0xFFFF);
-		destColor.y *= (1.0f / 0xFFFF);
-		destColor.z = destColor.w = 1.0f;
+		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
+		texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
+		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
+		texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
+		texelColor.x *= (1.0f / 0xFFFF);
+		texelColor.y *= (1.0f / 0xFFFF);
+		texelColor.z = texelColor.w = 1.0f;
 		break;
 	case VK_FORMAT_R16G16_SFLOAT:
 		buffer += 4 * x;
-		destColor.x.x = Float(*Pointer<Half>(buffer + 0));
-		destColor.y.x = Float(*Pointer<Half>(buffer + 2));
-		destColor.x.y = Float(*Pointer<Half>(buffer + 4));
-		destColor.y.y = Float(*Pointer<Half>(buffer + 6));
+		texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
+		texelColor.y.x = Float(*Pointer<Half>(buffer + 2));
+		texelColor.x.y = Float(*Pointer<Half>(buffer + 4));
+		texelColor.y.y = Float(*Pointer<Half>(buffer + 6));
 		buffer += pitchB;
-		destColor.x.z = Float(*Pointer<Half>(buffer + 0));
-		destColor.y.z = Float(*Pointer<Half>(buffer + 2));
-		destColor.x.w = Float(*Pointer<Half>(buffer + 4));
-		destColor.y.w = Float(*Pointer<Half>(buffer + 6));
-		destColor.z = destColor.w = 1.0f;
+		texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
+		texelColor.y.z = Float(*Pointer<Half>(buffer + 2));
+		texelColor.x.w = Float(*Pointer<Half>(buffer + 4));
+		texelColor.y.w = Float(*Pointer<Half>(buffer + 6));
+		texelColor.z = texelColor.w = 1.0f;
 		break;
 	case VK_FORMAT_R16G16B16A16_UNORM:
 		buffer += 8 * x;
-		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
-		destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
-		destColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
-		destColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
-		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
-		destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
-		destColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
-		destColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
+		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
+		texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
+		texelColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
+		texelColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
+		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
+		texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
+		texelColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
+		texelColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
 		buffer += pitchB;
-		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
-		destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
-		destColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
-		destColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
-		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
-		destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
-		destColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
-		destColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
-		destColor.x *= (1.0f / 0xFFFF);
-		destColor.y *= (1.0f / 0xFFFF);
-		destColor.z *= (1.0f / 0xFFFF);
-		destColor.w *= (1.0f / 0xFFFF);
+		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
+		texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
+		texelColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
+		texelColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
+		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
+		texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
+		texelColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
+		texelColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
+		texelColor.x *= (1.0f / 0xFFFF);
+		texelColor.y *= (1.0f / 0xFFFF);
+		texelColor.z *= (1.0f / 0xFFFF);
+		texelColor.w *= (1.0f / 0xFFFF);
 		break;
 	case VK_FORMAT_R16G16B16A16_SFLOAT:
 		buffer += 8 * x;
-		destColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
-		destColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
-		destColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
-		destColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
-		destColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
-		destColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
-		destColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
-		destColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
+		texelColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
+		texelColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
+		texelColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
+		texelColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
+		texelColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
+		texelColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
+		texelColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
+		texelColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
 		buffer += pitchB;
-		destColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
-		destColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
-		destColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
-		destColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
-		destColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
-		destColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
-		destColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
-		destColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
+		texelColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
+		texelColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
+		texelColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
+		texelColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
+		texelColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
+		texelColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
+		texelColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
+		texelColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
 		break;
 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
 		buffer += 4 * x;
-		destColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
-		destColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
+		texelColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
+		texelColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
 		buffer += pitchB;
-		destColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
-		destColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
-		transpose4x3(destColor.x, destColor.y, destColor.z, destColor.w);
-		destColor.w = 1.0f;
+		texelColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
+		texelColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
+		transpose4x3(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
+		texelColor.w = 1.0f;
 		break;
 	default:
 		{
 			// Attempt to read an integer based format and convert it to float
 			Vector4s color;
 			readPixel(index, cBuffer, x, color);
-			destColor.x = convertFloat32(As<UShort4>(color.x));
-			destColor.y = convertFloat32(As<UShort4>(color.y));
-			destColor.z = convertFloat32(As<UShort4>(color.z));
-			destColor.w = convertFloat32(As<UShort4>(color.w));
+			texelColor.x = convertFloat32(As<UShort4>(color.x));
+			texelColor.y = convertFloat32(As<UShort4>(color.y));
+			texelColor.z = convertFloat32(As<UShort4>(color.z));
+			texelColor.w = convertFloat32(As<UShort4>(color.w));
 		}
 		break;
 	}
 
-	Vector4f sourceFactor;
-	Vector4f destFactor;
+	ASSERT(SIMD::Width == 4);
+	SIMD::Float4 destColor;
+	destColor.x = texelColor.x;
+	destColor.y = texelColor.y;
+	destColor.z = texelColor.z;
+	destColor.w = texelColor.w;
+
+	SIMD::Float4 sourceFactor;
+	SIMD::Float4 destFactor;
 
 	blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
 	blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
 	blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
 	blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
 
-	Vector4f blendedColor;
+	SIMD::Float4 blendedColor;
 
 	switch(state.blendState[index].blendOperation)
 	{
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index 229455d..c00768e 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -55,7 +55,7 @@
 	void alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples);
 
 	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask);
-	Vector4f alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x);
+	SIMD::Float4 alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x);
 	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask);
 
 	bool isSRGB(int index) const;
@@ -77,23 +77,23 @@
 	void readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel);
 	enum BlendFactorModifier { None, OneMinus };
 	Float blendConstant(vk::Format format, int component, BlendFactorModifier modifier = None);
-	void blendFactorRGB(Vector4f &blendFactorRGB, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format);
+	void blendFactorRGB(SIMD::Float4 &blendFactorRGB, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format);
 	void blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format);
 
 	bool blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format);
-	Vector4f computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor);
+	SIMD::Float4 computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor);
 	SIMD::Float blendOpOverlay(SIMD::Float &src, SIMD::Float &dst);
 	SIMD::Float blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst);
 	SIMD::Float blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst);
 	SIMD::Float blendOpHardlight(SIMD::Float &src, SIMD::Float &dst);
 	SIMD::Float blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst);
-	void setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
-	void setLum(Vector4f &cbase, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
+	void setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
+	void setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
 	SIMD::Float computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne);
-	SIMD::Float maxRGB(Vector4f &c);
-	SIMD::Float minRGB(Vector4f &c);
-	SIMD::Float lumRGB(Vector4f &c);
-	void premultiply(Vector4f &c);
+	SIMD::Float maxRGB(SIMD::Float4 &c);
+	SIMD::Float minRGB(SIMD::Float4 &c);
+	SIMD::Float lumRGB(SIMD::Float4 &c);
+	void premultiply(SIMD::Float4 &c);
 
 	void writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples);
 	void writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples);
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 3162e29..d9845ad 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -28,7 +28,35 @@
 {
 }
 
-Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], SIMD::Float &dRef, Float &&lodOrBias, SIMD::Float &dsx, SIMD::Float &dsy, Vector4i offset, SIMD::Int &sample)
+SIMD::Float4 SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], const SIMD::Float &dRef, const Float &lodOrBias, const SIMD::Float &dsx, const SIMD::Float &dsy, SIMD::Int offset[4], const SIMD::Int &sample)
+{
+	SIMD::Float4 c;
+
+	for(int i = 0; i < SIMD::Width / 4; i++)
+	{
+		Float4 uvwa128[4];
+		uvwa128[0] = Extract128(uvwa[0], i);
+		uvwa128[1] = Extract128(uvwa[1], i);
+		uvwa128[2] = Extract128(uvwa[2], i);
+		uvwa128[3] = Extract128(uvwa[3], i);
+
+		Vector4i offset128;
+		offset128[0] = Extract128(offset[0], i);
+		offset128[1] = Extract128(offset[1], i);
+		offset128[2] = Extract128(offset[2], i);
+		offset128[3] = Extract128(offset[3], i);
+
+		Vector4f c128 = sampleTexture128(texture, uvwa128, Extract128(dRef, i), lodOrBias, Extract128(dsx, i), Extract128(dsy, i), offset128, Extract128(sample, i));
+		c.x = Insert128(c.x, c128.x, i);
+		c.y = Insert128(c.y, c128.y, i);
+		c.z = Insert128(c.z, c128.z, i);
+		c.w = Insert128(c.w, c128.w, i);
+	}
+
+	return c;
+}
+
+Vector4f SamplerCore::sampleTexture128(Pointer<Byte> &texture, Float4 uvwa[4], const Float4 &dRef, const Float &lodOrBias, const Float4 &dsx, const Float4 &dsy, Vector4i &offset, const Int4 &sample)
 {
 	Vector4f c;
 
@@ -797,7 +825,7 @@
 	return c_;
 }
 
-Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
+Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
 {
 	Vector4f c = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, false);
 
@@ -821,7 +849,7 @@
 	return c;
 }
 
-Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
+Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
 {
 	Vector4f c;
 
@@ -879,7 +907,7 @@
 	return c;
 }
 
-Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
+Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
 {
 	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
 	{
@@ -891,7 +919,7 @@
 	}
 }
 
-Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
+Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
 {
 	Vector4f c;
 
@@ -984,7 +1012,7 @@
 	return c;
 }
 
-Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
+Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
 {
 	Vector4f c;
 
@@ -1084,7 +1112,7 @@
 	return lod;
 }
 
-void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &dsx, Float4 &dsy)
+void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, const Float4 &dsx, const Float4 &dsy)
 {
 	Float4 dudxy;
 
@@ -1108,7 +1136,7 @@
 	lod = log2sqrt(lod);
 }
 
-void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float4 &dsx, Float4 &dsy)
+void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float4 &dsx, const Float4 &dsy)
 {
 	Float4 duvdxy;
 
@@ -1156,7 +1184,7 @@
 	lod = log2sqrt(lod);  // log2(sqrt(lod))
 }
 
-void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M)
+void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy, Float4 &M)
 {
 	Float4 dudxy, dvdxy, dsdxy;
 
@@ -1197,7 +1225,7 @@
 	lod = log2(lod);
 }
 
-void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float4 &dsx, Float4 &dsy)
+void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, const Float4 &dsx, const Float4 &dsy)
 {
 	Float4 dudxy, dvdxy, dsdxy;
 
@@ -1236,9 +1264,9 @@
 	// TODO: Comply with Vulkan recommendation:
 	// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."
 
-	Int4 xn = CmpLT(x, Float4(0.0f));  // x < 0
-	Int4 yn = CmpLT(y, Float4(0.0f));  // y < 0
-	Int4 zn = CmpLT(z, Float4(0.0f));  // z < 0
+	Int4 xn = CmpLT(x, 0.0f);  // x < 0
+	Int4 yn = CmpLT(y, 0.0f);  // y < 0
+	Int4 zn = CmpLT(z, 0.0f);  // z < 0
 
 	Float4 absX = Abs(x);
 	Float4 absY = Abs(y);
@@ -1282,9 +1310,9 @@
 	// V = !yMajor ? -y : (n ^ z)
 	V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
 
-	M = reciprocal(M) * Float4(0.5f);
-	U = U * M + Float4(0.5f);
-	V = V * M + Float4(0.5f);
+	M = reciprocal(M) * 0.5f;
+	U = U * M + 0.5f;
+	V = V * M + 0.5f;
 
 	return face;
 }
@@ -1941,7 +1969,7 @@
 	return c;
 }
 
-Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
+Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, const Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
 {
 	Int4 valid;
 
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
index 632f894..e85ca05 100644
--- a/src/Pipeline/SamplerCore.hpp
+++ b/src/Pipeline/SamplerCore.hpp
@@ -61,9 +61,11 @@
 public:
 	SamplerCore(Pointer<Byte> &constants, const Sampler &state, SamplerFunction function);
 
-	Vector4f sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], SIMD::Float &dRef, Float &&lodOrBias, SIMD::Float &dsx, SIMD::Float &dsy, Vector4i offset, SIMD::Int &sample);
+	SIMD::Float4 sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], const SIMD::Float &dRef, const Float &lodOrBias, const SIMD::Float &dsx, const SIMD::Float &dsy, SIMD::Int offset[4], const SIMD::Int &sample);
 
 private:
+	Vector4f sampleTexture128(Pointer<Byte> &texture, Float4 uvwa[4], const Float4 &dRef, const Float &lodOrBias, const Float4 &dsx, const Float4 &dsy, Vector4i &offset, const Int4 &sample);
+
 	Float4 applySwizzle(const Vector4f &c, VkComponentSwizzle swizzle, bool integer);
 	Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod);
 	Vector4s sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta);
@@ -71,22 +73,22 @@
 	Vector4s sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
 	Vector4s sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
 	Vector4s sample3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
-	Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta);
-	Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD);
-	Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
-	Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
-	Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
-	void computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &dsx, Float4 &dsy);
-	void computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Float4 &dsx, Float4 &dsy);
-	void computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M);
-	void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy);
+	Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta);
+	Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD);
+	Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
+	Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
+	Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
+	void computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &u, const Float4 &dsx, const Float4 &dsy);
+	void computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, const Float4 &dsx, const Float4 &dsy);
+	void computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy, Float4 &M);
+	void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy);
 	Int4 cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M);
 	Short4 applyOffset(Short4 &uvw, Int4 &offset, const Int4 &whd, AddressingMode mode);
 	void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Short4 &cubeArrayLayer, Vector4i &offset, const Int4 &sample, const Pointer<Byte> &mipmap);
 	void computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, const Int4 &sample, Int4 valid, const Pointer<Byte> &mipmap);
 	Vector4s sampleTexel(Short4 &u, Short4 &v, Short4 &w, const Short4 &cubeArrayLayer, Vector4i &offset, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer);
 	Vector4s sampleTexel(UInt index[4], Pointer<Byte> buffer);
-	Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &w, Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer);
+	Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &w, const Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer);
 	Vector4f replaceBorderTexel(const Vector4f &c, Int4 valid);
 	Pointer<Byte> selectMipmap(const Pointer<Byte> &texture, const Float &lod, bool secondLOD);
 	Short4 address(const Float4 &uvw, AddressingMode addressingMode, Pointer<Byte> &mipmap);
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index ced3ed6..c1a21c1 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -513,11 +513,79 @@
 	return Sqrt(x);  // TODO(b/222218659): Optimize for relaxed precision.
 }
 
+UInt4 halfToFloatBits(RValue<UInt4> halfBits)
+{
+	auto magic = UInt4(126 << 23);
+
+	auto sign16 = halfBits & UInt4(0x8000);
+	auto man16 = halfBits & UInt4(0x03FF);
+	auto exp16 = halfBits & UInt4(0x7C00);
+
+	auto isDnormOrZero = CmpEQ(exp16, UInt4(0));
+	auto isInfOrNaN = CmpEQ(exp16, UInt4(0x7C00));
+
+	auto sign32 = sign16 << 16;
+	auto man32 = man16 << 13;
+	auto exp32 = (exp16 + UInt4(0x1C000)) << 13;
+	auto norm32 = (man32 | exp32) | (isInfOrNaN & UInt4(0x7F800000));
+
+	auto denorm32 = As<UInt4>(As<Float4>(magic + man16) - As<Float4>(magic));
+
+	return sign32 | (norm32 & ~isDnormOrZero) | (denorm32 & isDnormOrZero);
+}
+
+UInt4 floatToHalfBits(RValue<UInt4> floatBits, bool storeInUpperBits)
+{
+	UInt4 sign = floatBits & UInt4(0x80000000);
+	UInt4 abs = floatBits & UInt4(0x7FFFFFFF);
+
+	UInt4 normal = CmpNLE(abs, UInt4(0x38800000));
+
+	UInt4 mantissa = (abs & UInt4(0x007FFFFF)) | UInt4(0x00800000);
+	UInt4 e = UInt4(113) - (abs >> 23);
+	UInt4 denormal = CmpLT(e, UInt4(24)) & (mantissa >> e);
+
+	UInt4 base = (normal & abs) | (~normal & denormal);  // TODO: IfThenElse()
+
+	// float exponent bias is 127, half bias is 15, so adjust by -112
+	UInt4 bias = normal & UInt4(0xC8000000);
+
+	UInt4 rounded = base + bias + UInt4(0x00000FFF) + ((base >> 13) & UInt4(1));
+	UInt4 fp16u = rounded >> 13;
+
+	// Infinity
+	fp16u |= CmpNLE(abs, UInt4(0x47FFEFFF)) & UInt4(0x7FFF);
+
+	return storeInUpperBits ? (sign | (fp16u << 16)) : ((sign >> 16) | fp16u);
+}
+
+SIMD::Float linearToSRGB(const SIMD::Float &c)
+{
+	SIMD::Float lc = Min(c, 0.0031308f) * 12.92f;
+	SIMD::Float ec = MulAdd(1.055f, Pow<Mediump>(c, (1.0f / 2.4f)), -0.055f);  // TODO(b/149574741): Use a custom approximation.
+
+	return Max(lc, ec);
+}
+
+SIMD::Float sRGBtoLinear(const SIMD::Float &c)
+{
+	SIMD::Float lc = c * (1.0f / 12.92f);
+	SIMD::Float ec = Pow<Mediump>(MulAdd(c, 1.0f / 1.055f, 0.055f / 1.055f), 2.4f);  // TODO(b/149574741): Use a custom approximation.
+
+	SIMD::Int linear = CmpLT(c, 0.04045f);
+	return As<SIMD::Float>((linear & As<SIMD::Int>(lc)) | (~linear & As<SIMD::Int>(ec)));  // TODO: IfThenElse()
+}
+
 RValue<Float4> reciprocal(RValue<Float4> x, bool pp, bool exactAtPow2)
 {
 	return Rcp(x, pp, exactAtPow2);
 }
 
+RValue<SIMD::Float> reciprocal(RValue<SIMD::Float> x, bool pp, bool exactAtPow2)
+{
+	return Rcp(x, pp, exactAtPow2);
+}
+
 RValue<Float4> reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
 {
 	Float4 abs = x;
@@ -541,6 +609,24 @@
 	return MulAdd(x, y, z);
 }
 
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision)
+{
+	// TODO(b/214588983): Eliminate by using only the wide SIMD variant (or specialize or templatize the implementation).
+	SIMD::Float xx;
+	SIMD::Float yy;
+	xx = Insert128(xx, x, 0);
+	yy = Insert128(yy, y, 0);
+	return Extract128(Pow(xx, yy, relaxedPrecision), 0);
+}
+
+RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision)
+{
+	// TODO(b/214588983): Eliminate by using only the wide SIMD variant (or specialize or templatize the implementation).
+	SIMD::Float xx;
+	xx = Insert128(xx, x, 0);
+	return Extract128(Sqrt(xx, relaxedPrecision), 0);
+}
+
 void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
 {
 	Int2 tmp0 = UnpackHigh(row0, row1);
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index b02f767..f7937ff 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -76,19 +76,25 @@
 	Int4 w;
 };
 
-// SIMD contains types that represent multiple scalars packed into a single
-// vector data type. Types in the SIMD namespace provide a semantic hint
-// that the data should be treated as a per-execution-lane scalar instead of
-// a typical euclidean-style vector type.
 namespace SIMD {
 
-// Width is the number of per-lane scalars packed into each SIMD vector.
-static constexpr int Width = 4;
+using namespace rr::SIMD;
 
-using Float = rr::Float4;
-using Int = rr::Int4;
-using UInt = rr::UInt4;
-using Pointer = rr::Pointer4;
+struct Float4
+{
+	SIMD::Float x;
+	SIMD::Float y;
+	SIMD::Float z;
+	SIMD::Float w;
+};
+
+struct Int4
+{
+	SIMD::Int x;
+	SIMD::Int y;
+	SIMD::Int z;
+	SIMD::Int w;
+};
 
 }  // namespace SIMD
 
@@ -123,20 +129,39 @@
 };
 
 // clang-format off
-template<Precision precision> RValue<Float4> Sqrt(RValue<Float4> x);
-template<> inline RValue<Float4> Sqrt<Highp>(RValue<Float4> x) { return Sqrt(x, false); }
-template<> inline RValue<Float4> Sqrt<Mediump>(RValue<Float4> x) { return Sqrt(x, true); }
+template<Precision precision> RValue<SIMD::Float> Pow(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
+template<> inline RValue<SIMD::Float> Pow<Highp>(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return Pow(x, y, false); }
+template<> inline RValue<SIMD::Float> Pow<Mediump>(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return Pow(x, y, true); }
 
-template<Precision precision> RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-template<> inline RValue<Float4> Pow<Highp>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, false); }
-template<> inline RValue<Float4> Pow<Mediump>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, true); }
+template<Precision precision> RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x);
+template<> inline RValue<SIMD::Float> Sqrt<Highp>(RValue<SIMD::Float> x) { return Sqrt(x, false); }
+template<> inline RValue<SIMD::Float> Sqrt<Mediump>(RValue<SIMD::Float> x) { return Sqrt(x, true); }
 // clang-format on
 
+SIMD::UInt halfToFloatBits(SIMD::UInt halfBits);
+SIMD::UInt floatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits);
+SIMD::Float linearToSRGB(const SIMD::Float &c);
+SIMD::Float sRGBtoLinear(const SIMD::Float &c);
+
 RValue<Float4> reciprocal(RValue<Float4> x, bool pp = false, bool exactAtPow2 = false);
+RValue<SIMD::Float> reciprocal(RValue<SIMD::Float> x, bool pp = false, bool exactAtPow2 = false);
 RValue<Float4> reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
 
 RValue<SIMD::Float> mulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z);  // TODO(chromium:1299047)
 
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision);
+RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision);
+
+// clang-format off
+template<Precision precision> RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+template<> inline RValue<Float4> Pow<Highp>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, false); }
+template<> inline RValue<Float4> Pow<Mediump>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, true); }
+
+template<Precision precision> RValue<Float4> Sqrt(RValue<Float4> x);
+template<> inline RValue<Float4> Sqrt<Highp>(RValue<Float4> x) { return Sqrt(x, false); }
+template<> inline RValue<Float4> Sqrt<Mediump>(RValue<Float4> x) { return Sqrt(x, true); }
+// clang-format on
+
 void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
 void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
 void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
@@ -146,8 +171,8 @@
 void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
 
-sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
-sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
+UInt4 halfToFloatBits(RValue<UInt4> halfBits);
+UInt4 floatToHalfBits(RValue<UInt4> floatBits, bool storeInUpperBits);
 Float4 r11g11b10Unpack(UInt r11g11b10bits);
 UInt r11g11b10Pack(const Float4 &value);
 Float4 linearToSRGB(const Float4 &c);
diff --git a/src/Pipeline/SpirvShaderSampling.cpp b/src/Pipeline/SpirvShaderSampling.cpp
index ceaa0d8..ee4cca3 100644
--- a/src/Pipeline/SpirvShaderSampling.cpp
+++ b/src/Pipeline/SpirvShaderSampling.cpp
@@ -154,9 +154,9 @@
 		SIMD::Float uvwa[4];
 		SIMD::Float dRef;
 		SIMD::Float lodOrBias;  // Explicit level-of-detail, or bias added to the implicit level-of-detail (depending on samplerMethod).
-		Vector4f dsx;
-		Vector4f dsy;
-		Vector4i offset;
+		SIMD::Float dsx[4];
+		SIMD::Float dsy[4];
+		SIMD::Int offset[4];
 		SIMD::Int sampleId;
 		SamplerFunction samplerFunction = instruction.getSamplerFunction();
 
@@ -216,15 +216,15 @@
 			{
 				SIMD::Float dPdx;
 				SIMD::Float dPdy;
-				dPdx.x = Pointer<Float>(&dsx.x)[i];
-				dPdx.y = Pointer<Float>(&dsx.y)[i];
-				dPdx.z = Pointer<Float>(&dsx.z)[i];
+				dPdx.x = Pointer<Float>(&dsx[0])[i];
+				dPdx.y = Pointer<Float>(&dsx[1])[i];
+				dPdx.z = Pointer<Float>(&dsx[2])[i];
 
-				dPdy.x = Pointer<Float>(&dsy.x)[i];
-				dPdy.y = Pointer<Float>(&dsy.y)[i];
-				dPdy.z = Pointer<Float>(&dsy.z)[i];
+				dPdy.x = Pointer<Float>(&dsy[0])[i];
+				dPdy.y = Pointer<Float>(&dsy[1])[i];
+				dPdy.z = Pointer<Float>(&dsy[2])[i];
 
-				Vector4f sample = s.sampleTexture(texture, uvwa, dRef, lod[i], dPdx, dPdy, offset, sampleId);
+				SIMD::Float4 sample = s.sampleTexture(texture, uvwa, dRef, lod[i], dPdx, dPdy, offset, sampleId);
 
 				If(perLaneSampling)
 				{
@@ -249,7 +249,8 @@
 		}
 		else
 		{
-			Vector4f sample = s.sampleTexture(texture, uvwa, dRef, lodOrBias.x, (dsx.x), (dsy.x), offset, sampleId);
+			Float lod = Float(lodOrBias.x);
+			SIMD::Float4 sample = s.sampleTexture(texture, uvwa, dRef, lod, (dsx[0]), (dsy[0]), offset, sampleId);
 
 			Pointer<SIMD::Float> rgba = out;
 			rgba[0] = sample.x;
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 535dc97..4f6fb92 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -567,6 +567,8 @@
 
 void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
 {
+	ASSERT(SIMD::Width == 4);
+
 	UInt index0 = batch[0];
 	UInt index1 = batch[1];
 	UInt index2 = batch[2];
@@ -590,7 +592,7 @@
 		assert(it->second.SizeInComponents == 4);
 		auto &position = routine.getVariable(it->second.Id);
 
-		Vector4f pos;
+		SIMD::Float4 pos;
 		pos.x = position[it->second.FirstComponent + 0];
 		pos.y = position[it->second.FirstComponent + 1];
 		pos.z = position[it->second.FirstComponent + 2];
@@ -600,30 +602,38 @@
 		SIMD::Float w = As<SIMD::Float>(As<SIMD::Int>(pos.w) | (As<SIMD::Int>(CmpEQ(pos.w, 0.0f)) & As<SIMD::Int>(SIMD::Float(1.0f))));
 		SIMD::Float rhw = 1.0f / w;
 
-		Vector4f proj;
+		SIMD::Float4 proj;
 		proj.x = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
 		proj.y = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
 		proj.z = pos.z * rhw;
 		proj.w = rhw;
 
-		transpose4x4(pos.x, pos.y, pos.z, pos.w);
+		Float4 pos_x = Extract128(pos.x, 0);
+		Float4 pos_y = Extract128(pos.y, 0);
+		Float4 pos_z = Extract128(pos.z, 0);
+		Float4 pos_w = Extract128(pos.w, 0);
+		transpose4x4(pos_x, pos_y, pos_z, pos_w);
 
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos.w;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos.z;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos.y;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos.x;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos_w;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos_z;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos_y;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos_x;
 
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 8) & 0x0000000FF;
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 0) & 0x0000000FF;
 
-		transpose4x4(proj.x, proj.y, proj.z, proj.w);
+		Float4 proj_x = Extract128(proj.x, 0);
+		Float4 proj_y = Extract128(proj.y, 0);
+		Float4 proj_z = Extract128(proj.z, 0);
+		Float4 proj_w = Extract128(proj.w, 0);
+		transpose4x4(proj_x, proj_y, proj_z, proj_w);
 
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj.w;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj.z;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj.y;
-		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj.x;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj_w;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj_z;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj_y;
+		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj_x;
 	}
 
 	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
@@ -679,10 +689,10 @@
 		   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 		{
 			Vector4f v;
-			v.x = routine.outputs[i + 0];
-			v.y = routine.outputs[i + 1];
-			v.z = routine.outputs[i + 2];
-			v.w = routine.outputs[i + 3];
+			v.x = Extract128(routine.outputs[i + 0], 0);
+			v.y = Extract128(routine.outputs[i + 1], 0);
+			v.z = Extract128(routine.outputs[i + 2], 0);
+			v.w = Extract128(routine.outputs[i + 3], 0);
 
 			transpose4x4(v.x, v.y, v.z, v.w);
 
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index cfe7004..6512b0d 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -360,7 +360,7 @@
 
 namespace rr {
 
-const int SIMD::Width = 8;
+const int SIMD::Width = 4;
 
 std::string Caps::backendName()
 {
@@ -1153,14 +1153,14 @@
 	}
 }
 
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 {
-	return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
+	return As<SIMD::Float>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
 }
 
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 {
-	return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
+	return As<SIMD::Int>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
 }
 
 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
@@ -1216,12 +1216,12 @@
 	}
 }
 
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
 {
 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
 }
 
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
 {
 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
 }
diff --git a/src/Reactor/Print.hpp b/src/Reactor/Print.hpp
index d52d3ab..21839ee 100644
--- a/src/Reactor/Print.hpp
+++ b/src/Reactor/Print.hpp
@@ -326,14 +326,25 @@
 	static std::vector<Value *> val(const RValue<Pointer<T>> &v) { return { v.value() }; }
 };
 template<>
-struct PrintValue::Ty<Pointer4>
+struct PrintValue::Ty<SIMD::Pointer>
 {
-	static std::string fmt(const Pointer4 &v)
+	static std::string fmt(const SIMD::Pointer &v)
 	{
-		return v.isBasePlusOffset ? "{%p + [%d, %d, %d, %d]}" : "{%p, %p, %p, %p}";
+		if(v.isBasePlusOffset)
+		{
+			std::string format;
+			for(int i = 1; i < SIMD::Width; i++) { format += ", %d"; }
+			return "{%p + [%d" + format + "]}";
+		}
+		else
+		{
+			std::string format;
+			for(int i = 1; i < SIMD::Width; i++) { format += ", %p"; }
+			return "{%p" + format + "}";
+		}
 	}
 
-	static std::vector<Value *> val(const Pointer4 &v)
+	static std::vector<Value *> val(const SIMD::Pointer &v)
 	{
 		return v.getPrintValues();
 	}
diff --git a/src/Reactor/SIMD.cpp b/src/Reactor/SIMD.cpp
index 11f636b..ddf2d56 100644
--- a/src/Reactor/SIMD.cpp
+++ b/src/Reactor/SIMD.cpp
@@ -42,6 +42,21 @@
 	storeValue(Nucleus::createConstantVector(constantVector, type()));
 }
 
+SIMD::Int::Int(int x, int y, int z, int w)
+    : XYZW(this)
+{
+	std::vector<int64_t> constantVector = { x, y, z, w };
+	storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
+SIMD::Int::Int(std::vector<int> v)
+    : XYZW(this)
+{
+	std::vector<int64_t> constantVector;
+	for(int i : v) { constantVector.push_back(i); }
+	storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
 SIMD::Int::Int(RValue<SIMD::Int> rhs)
     : XYZW(this)
 {
@@ -247,6 +262,21 @@
 	storeValue(Nucleus::createConstantVector(constantVector, type()));
 }
 
+SIMD::UInt::UInt(int x, int y, int z, int w)
+    : XYZW(this)
+{
+	std::vector<int64_t> constantVector = { x, y, z, w };
+	storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
+SIMD::UInt::UInt(std::vector<int> v)
+    : XYZW(this)
+{
+	std::vector<int64_t> constantVector;
+	for(int i : v) { constantVector.push_back(i); }
+	storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
 SIMD::UInt::UInt(RValue<SIMD::UInt> rhs)
     : XYZW(this)
 {
@@ -467,6 +497,21 @@
 	storeValue(Nucleus::createConstantVector(constantVector, type()));
 }
 
+SIMD::Float::Float(float x, float y, float z, float w)
+    : XYZW(this)
+{
+	std::vector<double> constantVector = { x, y, z, w };
+	storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
+SIMD::Float::Float(std::vector<float> v)
+    : XYZW(this)
+{
+	std::vector<double> constantVector;
+	for(float f : v) { constantVector.push_back(f); }
+	storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
 SIMD::Float SIMD::Float::infinity()
 {
 	SIMD::Float result;
@@ -508,6 +553,18 @@
 	*this = RValue<scalar::Float>(rhs.loadValue());
 }
 
+SIMD::Float::Float(RValue<packed::Float4> rhs)
+    : XYZW(this)
+{
+	ASSERT(SIMD::Width == 4);
+	*this = Insert128(*this, rhs, 0);
+}
+
+RValue<SIMD::Float> SIMD::Float::operator=(RValue<packed::Float4> rhs)
+{
+	return *this = SIMD::Float(rhs);
+}
+
 RValue<SIMD::Float> SIMD::Float::operator=(float x)
 {
 	return *this = SIMD::Float(x);
@@ -598,6 +655,18 @@
 	return RValue<SIMD::Float>(Nucleus::createFNeg(val.value()));
 }
 
+RValue<SIMD::Float> Rcp(RValue<SIMD::Float> x, bool relaxedPrecision, bool exactAtPow2)
+{
+	ASSERT(SIMD::Width == 4);
+	return SIMD::Float(Rcp(Extract128(x, 0), relaxedPrecision, exactAtPow2));
+}
+
+RValue<SIMD::Float> RcpSqrt(RValue<SIMD::Float> x, bool relaxedPrecision)
+{
+	ASSERT(SIMD::Width == 4);
+	return SIMD::Float(RcpSqrt(Extract128(x, 0), relaxedPrecision));
+}
+
 RValue<SIMD::Float> Insert(RValue<SIMD::Float> x, RValue<scalar::Float> element, int i)
 {
 	return RValue<SIMD::Float>(Nucleus::createInsertElement(x.value(), element.value(), i));
@@ -802,81 +871,81 @@
 	return Insert128(result, Shuffle(Extract128(x, 0), Extract128(y, 0), select), 0);
 }
 
-Pointer4::Pointer4(Pointer<Byte> base, rr::Int limit)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, rr::Int limit)
     : base(base)
     , dynamicLimit(limit)
     , staticLimit(0)
     , dynamicOffsets(0)
-    , staticOffsets(4)
+    , staticOffsets(SIMD::Width)
     , hasDynamicLimit(true)
     , hasDynamicOffsets(false)
     , isBasePlusOffset(true)
 {}
 
-Pointer4::Pointer4(Pointer<Byte> base, unsigned int limit)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, unsigned int limit)
     : base(base)
     , dynamicLimit(0)
     , staticLimit(limit)
     , dynamicOffsets(0)
-    , staticOffsets(4)
+    , staticOffsets(SIMD::Width)
     , hasDynamicLimit(false)
     , hasDynamicOffsets(false)
     , isBasePlusOffset(true)
 {}
 
-Pointer4::Pointer4(Pointer<Byte> base, rr::Int limit, Int4 offset)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, rr::Int limit, SIMD::Int offset)
     : base(base)
     , dynamicLimit(limit)
     , staticLimit(0)
     , dynamicOffsets(offset)
-    , staticOffsets(4)
+    , staticOffsets(SIMD::Width)
     , hasDynamicLimit(true)
     , hasDynamicOffsets(true)
     , isBasePlusOffset(true)
 {}
 
-Pointer4::Pointer4(Pointer<Byte> base, unsigned int limit, Int4 offset)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, unsigned int limit, SIMD::Int offset)
     : base(base)
     , dynamicLimit(0)
     , staticLimit(limit)
     , dynamicOffsets(offset)
-    , staticOffsets(4)
+    , staticOffsets(SIMD::Width)
     , hasDynamicLimit(false)
     , hasDynamicOffsets(true)
     , isBasePlusOffset(true)
 {}
 
-Pointer4::Pointer4(std::vector<Pointer<Byte>> pointers)
+SIMD::Pointer::Pointer(std::vector<scalar::Pointer<Byte>> pointers)
     : pointers(pointers)
     , isBasePlusOffset(false)
 {}
 
-Pointer4::Pointer4(UInt4 cast)
-    : pointers(4)
+SIMD::Pointer::Pointer(SIMD::UInt cast)
+    : pointers(SIMD::Width)
     , isBasePlusOffset(false)
 {
 	assert(sizeof(void *) == 4);
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
-		pointers[i] = As<Pointer<Byte>>(Extract(cast, i));
+		pointers[i] = As<rr::Pointer<Byte>>(Extract(cast, i));
 	}
 }
 
-Pointer4::Pointer4(UInt4 castLow, UInt4 castHigh)
-    : pointers(4)
+SIMD::Pointer::Pointer(SIMD::UInt castLow, SIMD::UInt castHigh)
+    : pointers(SIMD::Width)
     , isBasePlusOffset(false)
 {
 	assert(sizeof(void *) == 8);
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		UInt2 address;
 		address = Insert(address, Extract(castLow, i), 0);
 		address = Insert(address, Extract(castHigh, i), 1);
-		pointers[i] = As<Pointer<Byte>>(address);
+		pointers[i] = As<rr::Pointer<Byte>>(address);
 	}
 }
 
-Pointer4 &Pointer4::operator+=(Int4 i)
+SIMD::Pointer &SIMD::Pointer::operator+=(SIMD::Int i)
 {
 	if(isBasePlusOffset)
 	{
@@ -885,67 +954,68 @@
 	}
 	else
 	{
-		for(int el = 0; el < 4; el++) { pointers[el] += Extract(i, el); }
+		for(int el = 0; el < SIMD::Width; el++) { pointers[el] += Extract(i, el); }
 	}
 	return *this;
 }
 
-Pointer4 Pointer4::operator+(Int4 i)
+SIMD::Pointer SIMD::Pointer::operator+(SIMD::Int i)
 {
-	Pointer4 p = *this;
+	SIMD::Pointer p = *this;
 	p += i;
 	return p;
 }
 
-Pointer4 &Pointer4::operator+=(int i)
+SIMD::Pointer &SIMD::Pointer::operator+=(int i)
 {
 	if(isBasePlusOffset)
 	{
-		for(int el = 0; el < 4; el++) { staticOffsets[el] += i; }
+		for(int el = 0; el < SIMD::Width; el++) { staticOffsets[el] += i; }
 	}
 	else
 	{
-		for(int el = 0; el < 4; el++) { pointers[el] += i; }
+		for(int el = 0; el < SIMD::Width; el++) { pointers[el] += i; }
 	}
 	return *this;
 }
 
-Pointer4 Pointer4::operator+(int i)
+SIMD::Pointer SIMD::Pointer::operator+(int i)
 {
-	Pointer4 p = *this;
+	SIMD::Pointer p = *this;
 	p += i;
 	return p;
 }
 
-Int4 Pointer4::offsets() const
+SIMD::Int SIMD::Pointer::offsets() const
 {
 	ASSERT_MSG(isBasePlusOffset, "No offsets for this type of pointer");
-	return dynamicOffsets + Int4(staticOffsets[0], staticOffsets[1], staticOffsets[2], staticOffsets[3]);
+	return dynamicOffsets + SIMD::Int(staticOffsets);
 }
 
-Int4 Pointer4::isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+SIMD::Int SIMD::Pointer::isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
 {
 	ASSERT(accessSize > 0);
 
 	if(isStaticallyInBounds(accessSize, robustness))
 	{
-		return Int4(0xFFFFFFFF);
+		return SIMD::Int(0xFFFFFFFF);
 	}
 
 	if(!hasDynamicOffsets && !hasDynamicLimit)
 	{
+		ASSERT(SIMD::Width == 4);
 		// Common fast paths.
-		return Int4(
+		return SIMD::Int(
 		    (staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0,
 		    (staticOffsets[1] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0,
 		    (staticOffsets[2] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0,
 		    (staticOffsets[3] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0);
 	}
 
-	return CmpGE(offsets(), Int4(0)) & CmpLT(offsets() + Int4(accessSize - 1), Int4(limit()));
+	return CmpGE(offsets(), 0) & CmpLT(offsets() + SIMD::Int(accessSize - 1), limit());
 }
 
-bool Pointer4::isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+bool SIMD::Pointer::isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
 {
 	if(hasDynamicOffsets)
 	{
@@ -970,7 +1040,7 @@
 		}
 	}
 
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		if(staticOffsets[i] + accessSize - 1 >= staticLimit)
 		{
@@ -981,14 +1051,14 @@
 	return true;
 }
 
-rr::Int Pointer4::limit() const
+SIMD::Int SIMD::Pointer::limit() const
 {
 	return dynamicLimit + staticLimit;
 }
 
 // Returns true if all offsets are compile-time static and sequential
 // (N+0*step, N+1*step, N+2*step, N+3*step)
-bool Pointer4::hasStaticSequentialOffsets(unsigned int step) const
+bool SIMD::Pointer::hasStaticSequentialOffsets(unsigned int step) const
 {
 	ASSERT_MSG(isBasePlusOffset, "No offsets for this type of pointer");
 	if(hasDynamicOffsets)
@@ -996,7 +1066,7 @@
 		return false;
 	}
 
-	for(int i = 1; i < 4; i++)
+	for(int i = 1; i < SIMD::Width; i++)
 	{
 		if(staticOffsets[i - 1] + int32_t(step) != staticOffsets[i])
 		{
@@ -1009,7 +1079,7 @@
 
 // Returns true if all offsets are compile-time static and equal
 // (N, N, N, N)
-bool Pointer4::hasStaticEqualOffsets() const
+bool SIMD::Pointer::hasStaticEqualOffsets() const
 {
 	ASSERT_MSG(isBasePlusOffset, "No offsets for this type of pointer");
 	if(hasDynamicOffsets)
@@ -1017,7 +1087,7 @@
 		return false;
 	}
 
-	for(int i = 1; i < 4; i++)
+	for(int i = 1; i < SIMD::Width; i++)
 	{
 		if(staticOffsets[0] != staticOffsets[i])
 		{
@@ -1028,22 +1098,22 @@
 	return true;
 }
 
-Pointer<Byte> Pointer4::getUniformPointer() const
+scalar::Pointer<Byte> SIMD::Pointer::getUniformPointer() const
 {
 #ifndef NDEBUG
 	if(isBasePlusOffset)
 	{
-		Int4 uniform = offsets();
-		Int x = Extract(uniform, 0);
+		SIMD::Int uniform = offsets();
+		scalar::Int x = Extract(uniform, 0);
 
-		for(int i = 1; i < 4; i++)
+		for(int i = 1; i < SIMD::Width; i++)
 		{
 			Assert(x == Extract(uniform, i));
 		}
 	}
 	else
 	{
-		for(int i = 1; i < 4; i++)
+		for(int i = 1; i < SIMD::Width; i++)
 		{
 			Assert(pointers[0] == pointers[i]);
 		}
@@ -1053,7 +1123,7 @@
 	return getPointerForLane(0);
 }
 
-Pointer<Byte> Pointer4::getPointerForLane(int lane) const
+scalar::Pointer<Byte> SIMD::Pointer::getPointerForLane(int lane) const
 {
 	if(isBasePlusOffset)
 	{
@@ -1065,19 +1135,19 @@
 	}
 }
 
-void Pointer4::castTo(UInt4 &bits) const
+void SIMD::Pointer::castTo(SIMD::UInt &bits) const
 {
 	assert(sizeof(void *) == 4);
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
-		bits = Insert(bits, As<UInt>(pointers[i]), i);
+		bits = Insert(bits, As<scalar::UInt>(pointers[i]), i);
 	}
 }
 
-void Pointer4::castTo(UInt4 &lowerBits, UInt4 &upperBits) const
+void SIMD::Pointer::castTo(SIMD::UInt &lowerBits, SIMD::UInt &upperBits) const
 {
 	assert(sizeof(void *) == 8);
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		UInt2 address = As<UInt2>(pointers[i]);
 		lowerBits = Insert(lowerBits, Extract(address, 0), i);
@@ -1085,10 +1155,10 @@
 	}
 }
 
-Pointer4 Pointer4::IfThenElse(Int4 condition, const Pointer4 &lhs, const Pointer4 &rhs)
+SIMD::Pointer SIMD::Pointer::IfThenElse(SIMD::Int condition, const SIMD::Pointer &lhs, const SIMD::Pointer &rhs)
 {
-	std::vector<Pointer<Byte>> pointers(4);
-	for(int i = 0; i < 4; i++)
+	std::vector<scalar::Pointer<Byte>> pointers(SIMD::Width);
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		If(Extract(condition, i) != 0)
 		{
@@ -1104,7 +1174,7 @@
 }
 
 #ifdef ENABLE_RR_PRINT
-std::vector<rr::Value *> Pointer4::getPrintValues() const
+std::vector<rr::Value *> SIMD::Pointer::getPrintValues() const
 {
 	if(isBasePlusOffset)
 	{
@@ -1112,7 +1182,12 @@
 	}
 	else
 	{
-		return PrintValue::vals(pointers[0], pointers[1], pointers[2], pointers[3]);
+		std::vector<Value *> vals;
+		for(int i = 0; i < SIMD::Width; i++)
+		{
+			vals.push_back(RValue<scalar::Pointer<Byte>>(pointers[i]).value());
+		}
+		return vals;
 	}
 }
 #endif
diff --git a/src/Reactor/SIMD.hpp b/src/Reactor/SIMD.hpp
index c1d2783..6508704 100644
--- a/src/Reactor/SIMD.hpp
+++ b/src/Reactor/SIMD.hpp
@@ -25,6 +25,8 @@
 using Int = rr::Int;
 using UInt = rr::UInt;
 using Float = rr::Float;
+template<class T>
+using Pointer = rr::Pointer<T>;
 }  // namespace scalar
 
 namespace packed {
@@ -40,6 +42,7 @@
 class Int;
 class UInt;
 class Float;
+class Pointer;
 
 class Int : public LValue<SIMD::Int>,
             public XYZW<SIMD::Int>  // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
@@ -49,6 +52,8 @@
 
 	Int();
 	Int(int broadcast);
+	Int(int x, int y, int z, int w);
+	Int(std::vector<int> v);
 	Int(RValue<SIMD::Int> rhs);
 	Int(const Int &rhs);
 	Int(const Reference<SIMD::Int> &rhs);
@@ -59,6 +64,9 @@
 	Int(const scalar::Int &rhs);
 	Int(const Reference<scalar::Int> &rhs);
 
+	template<int T>
+	Int(const SwizzleMask1<packed::Int4, T> &rhs);
+
 	RValue<SIMD::Int> operator=(int broadcast);
 	RValue<SIMD::Int> operator=(RValue<SIMD::Int> rhs);
 	RValue<SIMD::Int> operator=(const Int &rhs);
@@ -76,6 +84,8 @@
 
 	UInt();
 	UInt(int broadcast);
+	UInt(int x, int y, int z, int w);
+	UInt(std::vector<int> v);
 	UInt(RValue<SIMD::UInt> rhs);
 	UInt(const UInt &rhs);
 	UInt(const Reference<SIMD::UInt> &rhs);
@@ -103,6 +113,8 @@
 
 	Float();
 	Float(float broadcast);
+	Float(float x, float y, float z, float w);
+	Float(std::vector<float> v);
 	Float(RValue<SIMD::Float> rhs);
 	Float(const Float &rhs);
 	Float(const Reference<SIMD::Float> &rhs);
@@ -110,6 +122,11 @@
 	Float(const scalar::Float &rhs);
 	Float(const Reference<scalar::Float> &rhs);
 
+	Float(RValue<packed::Float4> rhs);
+	RValue<SIMD::Float> operator=(RValue<packed::Float4> rhs);
+	template<int T>
+	Float(const SwizzleMask1<packed::Float4, T> &rhs);
+
 	RValue<SIMD::Float> operator=(float broadcast);
 	RValue<SIMD::Float> operator=(RValue<SIMD::Float> rhs);
 	RValue<SIMD::Float> operator=(const Float &rhs);
@@ -124,27 +141,25 @@
 	static int element_count() { return SIMD::Width; }
 };
 
-}  // namespace SIMD
-
-class Pointer4
+class Pointer
 {
 public:
-	Pointer4(Pointer<Byte> base, Int limit);
-	Pointer4(Pointer<Byte> base, unsigned int limit);
-	Pointer4(Pointer<Byte> base, Int limit, Int4 offset);
-	Pointer4(Pointer<Byte> base, unsigned int limit, Int4 offset);
-	Pointer4(std::vector<Pointer<Byte>> pointers);
-	explicit Pointer4(UInt4 cast);                      // Cast from 32-bit integers to 32-bit pointers
-	explicit Pointer4(UInt4 castLow, UInt4 castHight);  // Cast from pairs of 32-bit integers to 64-bit pointers
+	Pointer(scalar::Pointer<Byte> base, scalar::Int limit);
+	Pointer(scalar::Pointer<Byte> base, unsigned int limit);
+	Pointer(scalar::Pointer<Byte> base, scalar::Int limit, SIMD::Int offset);
+	Pointer(scalar::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
+	Pointer(std::vector<scalar::Pointer<Byte>> pointers);
+	explicit Pointer(SIMD::UInt cast);                           // Cast from 32-bit integers to 32-bit pointers
+	explicit Pointer(SIMD::UInt castLow, SIMD::UInt castHight);  // Cast from pairs of 32-bit integers to 64-bit pointers
 
-	Pointer4 &operator+=(Int4 i);
-	Pointer4 operator+(Int4 i);
-	Pointer4 &operator+=(int i);
-	Pointer4 operator+(int i);
+	Pointer &operator+=(SIMD::Int i);
+	Pointer operator+(SIMD::Int i);
+	Pointer &operator+=(int i);
+	Pointer operator+(int i);
 
-	Int4 offsets() const;
+	SIMD::Int offsets() const;
 
-	Int4 isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+	SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
 
 	bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
 
@@ -159,20 +174,20 @@
 	bool hasStaticEqualOffsets() const;
 
 	template<typename T>
-	inline T Load(OutOfBoundsBehavior robustness, Int4 mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
+	inline T Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
 
 	template<typename T>
-	inline void Store(T val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+	inline void Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
 
 	template<typename T>
-	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
 
-	Pointer<Byte> getUniformPointer() const;
-	Pointer<Byte> getPointerForLane(int lane) const;
-	static Pointer4 IfThenElse(Int4 condition, const Pointer4 &lhs, const Pointer4 &rhs);
+	scalar::Pointer<Byte> getUniformPointer() const;
+	scalar::Pointer<Byte> getPointerForLane(int lane) const;
+	static Pointer IfThenElse(SIMD::Int condition, const Pointer &lhs, const Pointer &rhs);
 
-	void castTo(UInt4 &bits) const;                         // Cast from 32-bit pointers to 32-bit integers
-	void castTo(UInt4 &lowerBits, UInt4 &upperBits) const;  // Cast from 64-bit pointers to pairs of 32-bit integers
+	void castTo(SIMD::UInt &bits) const;                              // Cast from 32-bit pointers to 32-bit integers
+	void castTo(SIMD::UInt &lowerBits, SIMD::UInt &upperBits) const;  // Cast from 64-bit pointers to pairs of 32-bit integers
 
 #ifdef ENABLE_RR_PRINT
 	std::vector<rr::Value *> getPrintValues() const;
@@ -180,24 +195,26 @@
 
 private:
 	// Base address for the pointer, common across all lanes.
-	Pointer<Byte> base;
+	scalar::Pointer<Byte> base;
 	// Per-lane address for dealing with non-uniform data
-	std::vector<Pointer<Byte>> pointers;
+	std::vector<scalar::Pointer<Byte>> pointers;
 
 public:
 	// Upper (non-inclusive) limit for offsets from base.
-	Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
+	scalar::Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
 	unsigned int staticLimit = 0;
 
 	// Per lane offsets from base.
-	Int4 dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
+	SIMD::Int dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
 	std::vector<int32_t> staticOffsets;
 
 	bool hasDynamicLimit = false;    // True if dynamicLimit is non-zero.
 	bool hasDynamicOffsets = false;  // True if any dynamicOffsets are non-zero.
-	bool isBasePlusOffset = false;   // True if this uses base+offsets. False if this is a collection of Pointers
+	bool isBasePlusOffset = false;   // True if this uses base+offset. False if this is a collection of Pointers
 };
 
+}  // namespace SIMD
+
 RValue<SIMD::Int> operator+(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
 RValue<SIMD::Int> operator-(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
 RValue<SIMD::Int> operator*(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
@@ -429,10 +446,10 @@
 RValue<SIMD::UInt> Shuffle(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y, uint16_t select);
 RValue<SIMD::Float> Shuffle(RValue<SIMD::Float> x, RValue<SIMD::Float> y, uint16_t select);
 
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);
+void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);
 
 template<>
 inline RValue<SIMD::Int>::RValue(int i)
@@ -455,38 +472,33 @@
 	RR_DEBUG_INFO_EMIT_VAR(val);
 }
 
-template<typename T>
-struct Element
-{};
-template<>
-struct Element<Float4>
+template<int T>
+SIMD::Int::Int(const SwizzleMask1<packed::Int4, T> &rhs)
+    : XYZW(this)
 {
-	using type = Float;
-};
-template<>
-struct Element<Int4>
+	*this = rhs.operator RValue<scalar::Int>();
+}
+
+template<int T>
+SIMD::Float::Float(const SwizzleMask1<packed::Float4, T> &rhs)
+    : XYZW(this)
 {
-	using type = Int;
-};
-template<>
-struct Element<UInt4>
-{
-	using type = UInt;
-};
+	*this = rhs.operator RValue<scalar::Float>();
+}
 
 template<typename T>
-inline T Pointer4::Load(OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
 {
-	using EL = typename Element<T>::type;
+	using EL = typename Scalar<T>::Type;
 
 	if(!isBasePlusOffset)
 	{
 		T out = T(0);
-		for(int i = 0; i < 4; i++)
+		for(int i = 0; i < SIMD::Width; i++)
 		{
 			If(Extract(mask, i) != 0)
 			{
-				auto el = rr::Load(Pointer<EL>(pointers[i]), alignment, atomic, order);
+				auto el = rr::Load(scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
 				out = Insert(out, el, i);
 			}
 		}
@@ -501,13 +513,13 @@
 		if(hasStaticSequentialOffsets(sizeof(float)))
 		{
 			// Offsets are sequential. Perform regular load.
-			return rr::Load(Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
+			return rr::Load(scalar::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
 		}
 
 		if(hasStaticEqualOffsets())
 		{
 			// Load one, replicate.
-			return T(*Pointer<EL>(base + staticOffsets[0], alignment));
+			return T(*scalar::Pointer<EL>(base + staticOffsets[0], alignment));
 		}
 	}
 	else
@@ -537,7 +549,7 @@
 			T out = T(0);
 			If(AnyTrue(mask))
 			{
-				EL el = *Pointer<EL>(base + staticOffsets[0], alignment);
+				EL el = *scalar::Pointer<EL>(base + staticOffsets[0], alignment);
 				out = T(el);
 			}
 			return out;
@@ -558,7 +570,7 @@
 
 		// TODO(b/195446858): Optimize static sequential offsets case by using masked load.
 
-		return Gather(Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
+		return Gather(scalar::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
 	}
 	else
 	{
@@ -568,24 +580,24 @@
 		{
 			// Load one, replicate.
 			auto offset = Extract(offs, 0);
-			out = T(rr::Load(Pointer<EL>(&base[offset]), alignment, atomic, order));
+			out = T(rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order));
 		}
 		Else If(hasStaticSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
 		{
 			// Load all elements in a single SIMD instruction.
 			auto offset = Extract(offs, 0);
-			out = rr::Load(Pointer<T>(&base[offset]), alignment, atomic, order);
+			out = rr::Load(scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
 		}
 		Else
 		{
 			// Divergent offsets or masked lanes.
 			out = T(0);
-			for(int i = 0; i < 4; i++)
+			for(int i = 0; i < SIMD::Width; i++)
 			{
 				If(Extract(mask, i) != 0)
 				{
 					auto offset = Extract(offs, i);
-					auto el = rr::Load(Pointer<EL>(&base[offset]), alignment, atomic, order);
+					auto el = rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
 					out = Insert(out, el, i);
 				}
 			}
@@ -595,34 +607,34 @@
 }
 
 template<>
-inline Pointer4 Pointer4::Load(OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+inline SIMD::Pointer SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
 {
-	std::vector<Pointer<Byte>> pointers(4);
+	std::vector<scalar::Pointer<Byte>> pointers(SIMD::Width);
 
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		If(Extract(mask, i) != 0)
 		{
-			pointers[i] = rr::Load(Pointer<Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
+			pointers[i] = rr::Load(scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
 		}
 	}
 
-	return Pointer4(pointers);
+	return SIMD::Pointer(pointers);
 }
 
 template<typename T>
-inline void Pointer4::Store(T val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 {
-	using EL = typename Element<T>::type;
+	using EL = typename Scalar<T>::Type;
 	constexpr size_t alignment = sizeof(float);
 
 	if(!isBasePlusOffset)
 	{
-		for(int i = 0; i < 4; i++)
+		for(int i = 0; i < SIMD::Width; i++)
 		{
 			If(Extract(mask, i) != 0)
 			{
-				rr::Store(Extract(val, i), Pointer<EL>(pointers[i]), alignment, atomic, order);
+				rr::Store(Extract(val, i), scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
 			}
 		}
 		return;
@@ -647,15 +659,17 @@
 		{
 			If(AnyTrue(mask))
 			{
+				assert(SIMD::Width == 4);
+
 				// All equal. One of these writes will win -- elect the winning lane.
-				auto v0111 = Int4(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+				auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 				auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
-				auto maskedVal = As<Int4>(val) & elect;
+				auto maskedVal = As<SIMD::Int>(val) & elect;
 				auto scalarVal = Extract(maskedVal, 0) |
 				                 Extract(maskedVal, 1) |
 				                 Extract(maskedVal, 2) |
 				                 Extract(maskedVal, 3);
-				*Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
+				*scalar::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
 			}
 		}
 		else if(hasStaticSequentialOffsets(sizeof(float)) &&
@@ -664,13 +678,13 @@
 			// TODO(b/195446858): Optimize using masked store.
 			// Pointer has no elements OOB, and the store is not atomic.
 			// Perform a read-modify-write.
-			auto p = Pointer<Int4>(base + staticOffsets[0], alignment);
+			auto p = scalar::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
 			auto prev = *p;
-			*p = (prev & ~mask) | (As<Int4>(val) & mask);
+			*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
 		}
 		else
 		{
-			Scatter(Pointer<EL>(base), val, offs, mask, alignment);
+			Scatter(scalar::Pointer<EL>(base), val, offs, mask, alignment);
 		}
 	}
 	else
@@ -680,17 +694,17 @@
 		{
 			// Store all elements in a single SIMD instruction.
 			auto offset = Extract(offs, 0);
-			rr::Store(val, Pointer<T>(&base[offset]), alignment, atomic, order);
+			rr::Store(val, scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
 		}
 		Else
 		{
 			// Divergent offsets or masked lanes.
-			for(int i = 0; i < 4; i++)
+			for(int i = 0; i < SIMD::Width; i++)
 			{
 				If(Extract(mask, i) != 0)
 				{
 					auto offset = Extract(offs, i);
-					rr::Store(Extract(val, i), Pointer<EL>(&base[offset]), alignment, atomic, order);
+					rr::Store(Extract(val, i), scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
 				}
 			}
 		}
@@ -698,21 +712,21 @@
 }
 
 template<>
-inline void Pointer4::Store(Pointer4 val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+inline void SIMD::Pointer::Store(SIMD::Pointer val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 {
 	constexpr size_t alignment = sizeof(void *);
 
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		If(Extract(mask, i) != 0)
 		{
-			rr::Store(val.getPointerForLane(i), Pointer<Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
+			rr::Store(val.getPointerForLane(i), scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
 		}
 	}
 }
 
 template<typename T>
-inline void Pointer4::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
 {
 	Store(T(val), robustness, mask, atomic, order);
 }
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 206e578..8f8bd70 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -4103,7 +4103,7 @@
 using UnderlyingTypeT = typename UnderlyingType<T>::Type;
 
 template<typename T, typename EL = UnderlyingTypeT<T>>
-static void gather(T &out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
+static void gather(T &out, RValue<Pointer<EL>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes)
 {
 	constexpr bool atomic = false;
 	constexpr std::memory_order order = std::memory_order_relaxed;
@@ -4111,7 +4111,7 @@
 	Pointer<Byte> baseBytePtr = base;
 
 	out = T(0);
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		If(Extract(mask, i) != 0)
 		{
@@ -4127,14 +4127,14 @@
 }
 
 template<typename T, typename EL = UnderlyingTypeT<T>>
-static void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+static void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
 {
 	constexpr bool atomic = false;
 	constexpr std::memory_order order = std::memory_order_relaxed;
 
 	Pointer<Byte> baseBytePtr = base;
 
-	for(int i = 0; i < 4; i++)
+	for(int i = 0; i < SIMD::Width; i++)
 	{
 		If(Extract(mask, i) != 0)
 		{
@@ -4144,32 +4144,32 @@
 	}
 }
 
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	Float4 result{};
+	SIMD::Float result{};
 	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
 	return result;
 }
 
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	Int4 result{};
+	SIMD::Int result{};
 	gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
 	return result;
 }
 
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
 	scatter(base, val, offsets, mask, alignment);
 }
 
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	scatter<Int4>(base, val, offsets, mask, alignment);
+	scatter<SIMD::Int>(base, val, offsets, mask, alignment);
 }
 
 RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
diff --git a/tests/ReactorUnitTests/ReactorSIMD.cpp b/tests/ReactorUnitTests/ReactorSIMD.cpp
index de2deda..a4ebe8e 100644
--- a/tests/ReactorUnitTests/ReactorSIMD.cpp
+++ b/tests/ReactorUnitTests/ReactorSIMD.cpp
@@ -136,79 +136,73 @@
 
 TEST(ReactorSIMD, Intrinsics_Scatter)
 {
-	Function<Void(Pointer<Float> base, Pointer<Float4> val, Pointer<Int4> offsets)> function;
+	Function<Void(Pointer<Float> base, Pointer<SIMD::Float> val, Pointer<SIMD::Int> offsets)> function;
 	{
 		Pointer<Float> base = function.Arg<0>();
-		Pointer<Float4> val = function.Arg<1>();
-		Pointer<Int4> offsets = function.Arg<2>();
+		Pointer<SIMD::Float> val = function.Arg<1>();
+		Pointer<SIMD::Int> offsets = function.Arg<2>();
 
-		auto mask = Int4(~0, ~0, ~0, ~0);
+		SIMD::Int mask = ~0;
 		unsigned int alignment = 1;
 		Scatter(base, *val, *offsets, mask, alignment);
 	}
 
-	float buffer[16] = { 0 };
+	std::vector<float> buffer(10 + 10 * SIMD::Width);
+	std::vector<int> offsets(SIMD::Width);
+	std::vector<float> val(SIMD::Width);
 
-	constexpr auto elemSize = sizeof(buffer[0]);
-
-	int offsets[] = {
-		1 * elemSize,
-		6 * elemSize,
-		11 * elemSize,
-		13 * elemSize
-	};
-
-	float val[4] = { 10, 60, 110, 130 };
+	for(int i = 0; i < SIMD::Width; i++)
+	{
+		offsets[i] = (3 + 7 * i) * sizeof(float);
+		val[i] = 13.0f + 17.0f * i;
+	}
 
 	auto routine = function(testName().c_str());
 	auto entry = (void (*)(float *, float *, int *))routine->getEntry();
 
-	entry(buffer, val, offsets);
+	entry(buffer.data(), val.data(), offsets.data());
 
-	EXPECT_EQ(buffer[offsets[0] / sizeof(buffer[0])], 10);
-	EXPECT_EQ(buffer[offsets[1] / sizeof(buffer[0])], 60);
-	EXPECT_EQ(buffer[offsets[2] / sizeof(buffer[0])], 110);
-	EXPECT_EQ(buffer[offsets[3] / sizeof(buffer[0])], 130);
+	for(int i = 0; i < SIMD::Width; i++)
+	{
+		EXPECT_EQ(buffer[offsets[i] / sizeof(float)], val[i]);
+	}
 }
 
-TEST(ReactorUnitTests, Intrinsics_Gather)
+TEST(ReactorSIMD, Intrinsics_Gather)
 {
-	Function<Void(Pointer<Float> base, Pointer<Int4> offsets, Pointer<Float4> result)> function;
+	Function<Void(Pointer<Float> base, Pointer<SIMD::Int> offsets, Pointer<SIMD::Float> result)> function;
 	{
 		Pointer<Float> base = function.Arg<0>();
-		Pointer<Int4> offsets = function.Arg<1>();
-		Pointer<Float4> result = function.Arg<2>();
+		Pointer<SIMD::Int> offsets = function.Arg<1>();
+		Pointer<SIMD::Float> result = function.Arg<2>();
 
-		auto mask = Int4(~0, ~0, ~0, ~0);
+		SIMD::Int mask = ~0;
 		unsigned int alignment = 1;
 		bool zeroMaskedLanes = true;
 		*result = Gather(base, *offsets, mask, alignment, zeroMaskedLanes);
 	}
 
-	float buffer[] = {
-		0, 10, 20, 30,
-		40, 50, 60, 70,
-		80, 90, 100, 110,
-		120, 130, 140, 150
-	};
+	std::vector<float> buffer(10 + 10 * SIMD::Width);
+	std::vector<int> offsets(SIMD::Width);
 
-	constexpr auto elemSize = sizeof(buffer[0]);
+	std::vector<float> val(SIMD::Width);
 
-	int offsets[] = {
-		1 * elemSize,
-		6 * elemSize,
-		11 * elemSize,
-		13 * elemSize
-	};
+	for(int i = 0; i < SIMD::Width; i++)
+	{
+		offsets[i] = (3 + 7 * i) * sizeof(float);
+		val[i] = 13.0f + 17.0f * i;
+
+		buffer[offsets[i] / sizeof(float)] = val[i];
+	}
 
 	auto routine = function(testName().c_str());
 	auto entry = (void (*)(float *, int *, float *))routine->getEntry();
 
-	float result[4] = {};
-	entry(buffer, offsets, result);
+	std::vector<float> result(SIMD::Width);
+	entry(buffer.data(), offsets.data(), result.data());
 
-	EXPECT_EQ(result[0], 10);
-	EXPECT_EQ(result[1], 60);
-	EXPECT_EQ(result[2], 110);
-	EXPECT_EQ(result[3], 130);
+	for(int i = 0; i < SIMD::Width; i++)
+	{
+		EXPECT_EQ(result[i], val[i]);
+	}
 }