Pipeline: Move utility functions to ShaderCore

SpirvShader had a lot of utility functions in an anonymous namespace
at the top of the file. Given that we're going to try splitting up this
huge file, we need somewhere common for this code to live.
ShaderCore looks like the most appropriate place for this right now.
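
With this move, the Load/Store helpers become member functions on
sw::SIMD::Pointer rather than free functions in the sw::SIMD namespace.
Call sites change roughly as follows (illustrative sketch only; ptr,
value, robustness and mask stand in for the arguments used at each
call site):

    // Before: free functions declared in SpirvShader.hpp
    auto value = SIMD::Load<SIMD::Float>(ptr, robustness, mask);
    SIMD::Store(ptr, value, robustness, mask);

    // After: members of sw::SIMD::Pointer, declared in ShaderCore.hpp
    auto value = ptr.Load<SIMD::Float>(robustness, mask);
    ptr.Store(value, robustness, mask);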

Bug: b/145336353
Change-Id: If4137875098fbeb761fdf63caf8d45011236cff1
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/38809
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 659389c..3ac96cc 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -566,4 +566,410 @@
 
 		return sign32 | (norm32 & ~isDnormOrZero) | (denorm32 & isDnormOrZero);
 	}
+
+
+	rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
+	{
+		return rr::SignMask(ints) != 0;
+	}
+
+	rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints)
+	{
+		return rr::SignMask(~ints) != 0;
+	}
+
+	rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val)
+	{
+		return rr::As<sw::SIMD::Float>((rr::As<sw::SIMD::UInt>(val) & sw::SIMD::UInt(0x80000000)) | sw::SIMD::UInt(0x3f800000));
+	}
+
+	// Returns the <whole, frac> of val.
+	// Both whole and frac will have the same sign as val.
+	std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
+	Modf(rr::RValue<sw::SIMD::Float> const &val)
+	{
+		auto abs = Abs(val);
+		auto sign = Sign(val);
+		auto whole = Floor(abs) * sign;
+		auto frac = Frac(abs) * sign;
+		return std::make_pair(whole, frac);
+	}
+
+	// Returns the number of 1s in bits, per lane.
+	sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits)
+	{
+		// TODO: Add an intrinsic to reactor. Even if there isn't a
+		// single vector instruction, there may be target-dependent
+		// ways to make this faster.
+		// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+		sw::SIMD::UInt c = bits - ((bits >> 1) & sw::SIMD::UInt(0x55555555));
+		c = ((c >> 2) & sw::SIMD::UInt(0x33333333)) + (c & sw::SIMD::UInt(0x33333333));
+		c = ((c >> 4) + c) & sw::SIMD::UInt(0x0F0F0F0F);
+		c = ((c >> 8) + c) & sw::SIMD::UInt(0x00FF00FF);
+		c = ((c >> 16) + c) & sw::SIMD::UInt(0x0000FFFF);
+		return c;
+	}
+
+	// Returns 1 << bits.
+	// If the resulting bit overflows a 32-bit integer, 0 is returned.
+	rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits)
+	{
+		return ((sw::SIMD::UInt(1) << bits) & rr::CmpLT(bits, sw::SIMD::UInt(32)));
+	}
+
+	// Returns bitCount number of 1's starting from the LSB.
+	rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount)
+	{
+		return NthBit32(bitCount) - sw::SIMD::UInt(1);
+	}
+
+	// Performs a fused multiply-add, returning a * b + c.
+	rr::RValue<sw::SIMD::Float> FMA(
+			rr::RValue<sw::SIMD::Float> const &a,
+			rr::RValue<sw::SIMD::Float> const &b,
+			rr::RValue<sw::SIMD::Float> const &c)
+	{
+		return a * b + c;
+	}
+
+	// Returns the exponent of the floating point number f.
+	// Assumes IEEE 754
+	rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f)
+	{
+		auto v = rr::As<sw::SIMD::UInt>(f);
+		return (sw::SIMD::Int((v >> sw::SIMD::UInt(23)) & sw::SIMD::UInt(0xFF)) - sw::SIMD::Int(126));
+	}
+
+	// Returns y if y < x; otherwise result is x.
+	// If one operand is a NaN, the other operand is the result.
+	// If both operands are NaN, the result is a NaN.
+	rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
+	{
+		using namespace rr;
+		auto xIsNan = IsNan(x);
+		auto yIsNan = IsNan(y);
+		return As<sw::SIMD::Float>(
+			// If neither are NaN, return min
+			((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Min(x, y))) |
+			// If one operand is a NaN, the other operand is the result
+			// If both operands are NaN, the result is a NaN.
+			((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
+			(( xIsNan          ) & As<sw::SIMD::Int>(y)));
+	}
+
+	// Returns y if y > x; otherwise result is x.
+	// If one operand is a NaN, the other operand is the result.
+	// If both operands are NaN, the result is a NaN.
+	rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
+	{
+		using namespace rr;
+		auto xIsNan = IsNan(x);
+		auto yIsNan = IsNan(y);
+		return As<sw::SIMD::Float>(
+			// If neither are NaN, return max
+			((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Max(x, y))) |
+			// If one operand is a NaN, the other operand is the result
+			// If both operands are NaN, the result is a NaN.
+			((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
+			(( xIsNan          ) & As<sw::SIMD::Int>(y)));
+	}
+
+	// Returns the determinant of a 2x2 matrix.
+	rr::RValue<sw::SIMD::Float> Determinant(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
+	{
+		return a*d - b*c;
+	}
+
+	// Returns the determinant of a 3x3 matrix.
+	rr::RValue<sw::SIMD::Float> Determinant(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
+	{
+		return a*e*i + b*f*g + c*d*h - c*e*g - b*d*i - a*f*h;
+	}
+
+	// Returns the determinant of a 4x4 matrix.
+	rr::RValue<sw::SIMD::Float> Determinant(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
+	{
+		return a * Determinant(f, g, h,
+		                       j, k, l,
+		                       n, o, p) -
+		       b * Determinant(e, g, h,
+		                       i, k, l,
+		                       m, o, p) +
+		       c * Determinant(e, f, h,
+		                       i, j, l,
+		                       m, n, p) -
+		       d * Determinant(e, f, g,
+		                       i, j, k,
+		                       m, n, o);
+	}
+
+	// Returns the inverse of a 2x2 matrix.
+	std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
+	{
+		auto s = sw::SIMD::Float(1.0f) / Determinant(a, b, c, d);
+		return {{s*d, -s*b, -s*c, s*a}};
+	}
+
+	// Returns the inverse of a 3x3 matrix.
+	std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
+	{
+		auto s = sw::SIMD::Float(1.0f) / Determinant(
+				a, b, c,
+				d, e, f,
+				g, h, i); // TODO: duplicate arithmetic calculating the det and below.
+
+		return {{
+			s * (e*i - f*h), s * (c*h - b*i), s * (b*f - c*e),
+			s * (f*g - d*i), s * (a*i - c*g), s * (c*d - a*f),
+			s * (d*h - e*g), s * (b*g - a*h), s * (a*e - b*d),
+		}};
+	}
+
+	// Returns the inverse of a 4x4 matrix.
+	std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
+	{
+		auto s = sw::SIMD::Float(1.0f) / Determinant(
+				a, b, c, d,
+				e, f, g, h,
+				i, j, k, l,
+				m, n, o, p); // TODO: duplicate arithmetic calculating the det and below.
+
+		auto kplo = k*p - l*o, jpln = j*p - l*n, jokn = j*o - k*n;
+		auto gpho = g*p - h*o, fphn = f*p - h*n, fogn = f*o - g*n;
+		auto glhk = g*l - h*k, flhj = f*l - h*j, fkgj = f*k - g*j;
+		auto iplm = i*p - l*m, iokm = i*o - k*m, ephm = e*p - h*m;
+		auto eogm = e*o - g*m, elhi = e*l - h*i, ekgi = e*k - g*i;
+		auto injm = i*n - j*m, enfm = e*n - f*m, ejfi = e*j - f*i;
+
+		return {{
+			s * ( f * kplo - g * jpln + h * jokn),
+			s * (-b * kplo + c * jpln - d * jokn),
+			s * ( b * gpho - c * fphn + d * fogn),
+			s * (-b * glhk + c * flhj - d * fkgj),
+
+			s * (-e * kplo + g * iplm - h * iokm),
+			s * ( a * kplo - c * iplm + d * iokm),
+			s * (-a * gpho + c * ephm - d * eogm),
+			s * ( a * glhk - c * elhi + d * ekgi),
+
+			s * ( e * jpln - f * iplm + h * injm),
+			s * (-a * jpln + b * iplm - d * injm),
+			s * ( a * fphn - b * ephm + d * enfm),
+			s * (-a * flhj + b * elhi - d * ejfi),
+
+			s * (-e * jokn + f * iokm - g * injm),
+			s * ( a * jokn - b * iokm + c * injm),
+			s * (-a * fogn + b * eogm - c * enfm),
+			s * ( a * fkgj - b * ekgi + c * ejfi),
+		}};
+	}
+
+	namespace SIMD {
+
+		Pointer::Pointer(rr::Pointer<Byte> base, rr::Int limit)
+			: base(base),
+				dynamicLimit(limit), staticLimit(0),
+				dynamicOffsets(0), staticOffsets{},
+				hasDynamicLimit(true), hasDynamicOffsets(false) {}
+
+		Pointer::Pointer(rr::Pointer<Byte> base, unsigned int limit)
+			: base(base),
+				dynamicLimit(0), staticLimit(limit),
+				dynamicOffsets(0), staticOffsets{},
+				hasDynamicLimit(false), hasDynamicOffsets(false) {}
+
+		Pointer::Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset)
+			: base(base),
+				dynamicLimit(limit), staticLimit(0),
+				dynamicOffsets(offset), staticOffsets{},
+				hasDynamicLimit(true), hasDynamicOffsets(true) {}
+
+		Pointer::Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset)
+			: base(base),
+				dynamicLimit(0), staticLimit(limit),
+				dynamicOffsets(offset), staticOffsets{},
+				hasDynamicLimit(false), hasDynamicOffsets(true) {}
+
+		Pointer& Pointer::operator += (Int i)
+		{
+			dynamicOffsets += i;
+			hasDynamicOffsets = true;
+			return *this;
+		}
+
+		Pointer& Pointer::operator *= (Int i)
+		{
+			dynamicOffsets = offsets() * i;
+			staticOffsets = {};
+			hasDynamicOffsets = true;
+			return *this;
+		}
+
+		Pointer Pointer::operator + (SIMD::Int i) { Pointer p = *this; p += i; return p; }
+		Pointer Pointer::operator * (SIMD::Int i) { Pointer p = *this; p *= i; return p; }
+
+		Pointer& Pointer::operator += (int i)
+		{
+			for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] += i; }
+			return *this;
+		}
+
+		Pointer& Pointer::operator *= (int i)
+		{
+			for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] *= i; }
+			if (hasDynamicOffsets)
+			{
+				dynamicOffsets *= SIMD::Int(i);
+			}
+			return *this;
+		}
+
+		Pointer Pointer::operator + (int i) { Pointer p = *this; p += i; return p; }
+		Pointer Pointer::operator * (int i) { Pointer p = *this; p *= i; return p; }
+
+		SIMD::Int Pointer::offsets() const
+		{
+			static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+			return dynamicOffsets + SIMD::Int(staticOffsets[0], staticOffsets[1], staticOffsets[2], staticOffsets[3]);
+		}
+
+		SIMD::Int Pointer::isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+		{
+			ASSERT(accessSize > 0);
+
+			if (isStaticallyInBounds(accessSize, robustness))
+			{
+				return SIMD::Int(0xffffffff);
+			}
+
+			if (!hasDynamicOffsets && !hasDynamicLimit)
+			{
+				// Common fast paths.
+				static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+				return SIMD::Int(
+					(staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
+					(staticOffsets[1] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
+					(staticOffsets[2] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
+					(staticOffsets[3] + accessSize - 1 < staticLimit) ? 0xffffffff : 0);
+			}
+
+			return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
+		}
+
+		bool Pointer::isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+		{
+			if (hasDynamicOffsets)
+			{
+				return false;
+			}
+
+			if (hasDynamicLimit)
+			{
+				if (hasStaticEqualOffsets() || hasStaticSequentialOffsets(accessSize))
+				{
+					switch(robustness)
+					{
+					case OutOfBoundsBehavior::UndefinedBehavior:
+						// With this robustness setting, the application/compiler guarantees in-bounds accesses on active lanes,
+						// but since it can't know in advance which branches are taken, this must be true even for inactive lanes.
+						return true;
+					case OutOfBoundsBehavior::Nullify:
+					case OutOfBoundsBehavior::RobustBufferAccess:
+					case OutOfBoundsBehavior::UndefinedValue:
+						return false;
+					}
+				}
+			}
+
+			for (int i = 0; i < SIMD::Width; i++)
+			{
+				if (staticOffsets[i] + accessSize - 1 >= staticLimit)
+				{
+					return false;
+				}
+			}
+
+			return true;
+		}
+
+		Int Pointer::limit() const
+		{
+			return dynamicLimit + staticLimit;
+		}
+
+		// Returns true if all offsets are sequential
+		// (N+0*step, N+1*step, N+2*step, N+3*step)
+		rr::Bool Pointer::hasSequentialOffsets(unsigned int step) const
+		{
+			if (hasDynamicOffsets)
+			{
+				auto o = offsets();
+				static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+				return rr::SignMask(~CmpEQ(o.yzww, o + SIMD::Int(1*step, 2*step, 3*step, 0))) == 0;
+			}
+			return hasStaticSequentialOffsets(step);
+		}
+
+		// Returns true if all offsets are compile-time static and
+		// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
+		bool Pointer::hasStaticSequentialOffsets(unsigned int step) const
+		{
+			if (hasDynamicOffsets)
+			{
+				return false;
+			}
+			for (int i = 1; i < SIMD::Width; i++)
+			{
+				if (staticOffsets[i-1] + int32_t(step) != staticOffsets[i]) { return false; }
+			}
+			return true;
+		}
+
+		// Returns true if all offsets are equal (N, N, N, N)
+		rr::Bool Pointer::hasEqualOffsets() const
+		{
+			if (hasDynamicOffsets)
+			{
+				auto o = offsets();
+				static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
+				return rr::SignMask(~CmpEQ(o, o.yzwx)) == 0;
+			}
+			return hasStaticEqualOffsets();
+		}
+
+		// Returns true if all offsets are compile-time static and are equal
+		// (N, N, N, N)
+		bool Pointer::hasStaticEqualOffsets() const
+		{
+			if (hasDynamicOffsets)
+			{
+				return false;
+			}
+			for (int i = 1; i < SIMD::Width; i++)
+			{
+				if (staticOffsets[i-1] != staticOffsets[i]) { return false; }
+			}
+			return true;
+		}
+
+	}  // namespace SIMD
+
 }
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index aa707d5..e95d4d0 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -19,6 +19,10 @@
 #include "Reactor/Reactor.hpp"
 #include "Vulkan/VkDebug.hpp"
 
+#include <atomic> // std::memory_order
+#include <array>
+#include <utility> // std::pair
+
 namespace sw
 {
 	using namespace rr;
@@ -55,6 +59,100 @@
 		Float4 w;
 	};
 
+	enum class OutOfBoundsBehavior
+	{
+		Nullify,             // Loads become zero, stores are elided.
+		RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
+		UndefinedValue,      // Only for load operations. Not secure. No program termination.
+		UndefinedBehavior,   // Program may terminate.
+	};
+
+	// SIMD contains types that represent multiple scalars packed into a single
+	// vector data type. Types in the SIMD namespace provide a semantic hint
+	// that the data should be treated as a per-execution-lane scalar instead of
+	// a typical Euclidean-style vector type.
+	namespace SIMD
+	{
+		// Width is the number of per-lane scalars packed into each SIMD vector.
+		static constexpr int Width = 4;
+
+		using Float = rr::Float4;
+		using Int = rr::Int4;
+		using UInt = rr::UInt4;
+
+		struct Pointer
+		{
+			Pointer(rr::Pointer<Byte> base, rr::Int limit);
+			Pointer(rr::Pointer<Byte> base, unsigned int limit);
+			Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
+			Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
+
+			Pointer& operator += (Int i);
+			Pointer& operator *= (Int i);
+
+			Pointer operator + (SIMD::Int i);
+			Pointer operator * (SIMD::Int i);
+
+			Pointer& operator += (int i);
+			Pointer& operator *= (int i);
+
+			Pointer operator + (int i);
+			Pointer operator * (int i);
+
+			SIMD::Int offsets() const;
+
+			SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+
+			bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+
+			Int limit() const;
+
+			// Returns true if all offsets are sequential
+			// (N+0*step, N+1*step, N+2*step, N+3*step)
+			rr::Bool hasSequentialOffsets(unsigned int step) const;
+
+			// Returns true if all offsets are compile-time static and
+			// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
+			bool hasStaticSequentialOffsets(unsigned int step) const;
+
+			// Returns true if all offsets are equal (N, N, N, N)
+			rr::Bool hasEqualOffsets() const;
+
+			// Returns true if all offsets are compile-time static and are equal
+			// (N, N, N, N)
+			bool hasStaticEqualOffsets() const;
+
+			template<typename T>
+			inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
+
+			template<typename T>
+			inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+
+			template<typename T>
+			inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+
+			// Base address for the pointer, common across all lanes.
+			rr::Pointer<rr::Byte> base;
+
+			// Upper (non-inclusive) limit for offsets from base.
+			rr::Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
+			unsigned int staticLimit;
+
+			// Per lane offsets from base.
+			SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
+			std::array<int32_t, SIMD::Width> staticOffsets;
+
+			bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
+			bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
+		};
+
+		template <typename T> struct Element {};
+		template <> struct Element<Float> { using type = rr::Float; };
+		template <> struct Element<Int>   { using type = rr::Int; };
+		template <> struct Element<UInt>  { using type = rr::UInt; };
+
+	} // namespace SIMD
+
 	Float4 exponential2(RValue<Float4> x, bool pp = false);
 	Float4 logarithm2(RValue<Float4> x, bool pp = false);
 	Float4 exponential(RValue<Float4> x, bool pp = false);
@@ -93,7 +191,305 @@
 	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
 
 	UInt4 halfToFloatBits(UInt4 halfBits);
-}
+
+	rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);
+
+	rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints);
+
+	template <typename T>
+	inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);
+
+	template <typename T>
+	inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);
+
+	rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);
+
+	// Returns the <whole, frac> of val.
+	// Both whole and frac will have the same sign as val.
+	std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
+	Modf(rr::RValue<sw::SIMD::Float> const &val);
+
+	// Returns the number of 1s in bits, per lane.
+	sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);
+
+	// Returns 1 << bits.
+	// If the resulting bit overflows a 32-bit integer, 0 is returned.
+	rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);
+
+	// Returns bitCount number of 1's starting from the LSB.
+	rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);
+
+	// Performs a fused multiply-add, returning a * b + c.
+	rr::RValue<sw::SIMD::Float> FMA(
+			rr::RValue<sw::SIMD::Float> const &a,
+			rr::RValue<sw::SIMD::Float> const &b,
+			rr::RValue<sw::SIMD::Float> const &c);
+
+	// Returns the exponent of the floating point number f.
+	// Assumes IEEE 754
+	rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);
+
+	// Returns y if y < x; otherwise result is x.
+	// If one operand is a NaN, the other operand is the result.
+	// If both operands are NaN, the result is a NaN.
+	rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
+
+	// Returns y if y > x; otherwise result is x.
+	// If one operand is a NaN, the other operand is the result.
+	// If both operands are NaN, the result is a NaN.
+	rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
+
+	// Returns the determinant of a 2x2 matrix.
+	rr::RValue<sw::SIMD::Float> Determinant(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
+
+	// Returns the determinant of a 3x3 matrix.
+	rr::RValue<sw::SIMD::Float> Determinant(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
+
+	// Returns the determinant of a 4x4 matrix.
+	rr::RValue<sw::SIMD::Float> Determinant(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
+
+	// Returns the inverse of a 2x2 matrix.
+	std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
+
+	// Returns the inverse of a 3x3 matrix.
+	std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
+
+	// Returns the inverse of a 4x4 matrix.
+	std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
+		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
+
+	////////////////////////////////////////////////////////////////////////////
+	// Inline functions
+	////////////////////////////////////////////////////////////////////////////
+
+	template<typename T>
+	inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+	{
+		using EL = typename Element<T>::type;
+
+		if (isStaticallyInBounds(sizeof(float), robustness))
+		{
+			// All elements are statically known to be in-bounds.
+			// We can avoid costly conditionals on masks.
+
+			if (hasStaticSequentialOffsets(sizeof(float)))
+			{
+				// Offsets are sequential. Perform regular load.
+				return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
+			}
+			if (hasStaticEqualOffsets())
+			{
+				// Load one, replicate.
+				return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
+			}
+		}
+		else
+		{
+			switch(robustness)
+			{
+			case OutOfBoundsBehavior::Nullify:
+			case OutOfBoundsBehavior::RobustBufferAccess:
+			case OutOfBoundsBehavior::UndefinedValue:
+				mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
+				break;
+			case OutOfBoundsBehavior::UndefinedBehavior:
+				// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
+				break;
+			}
+		}
+
+		auto offs = offsets();
+
+		if (!atomic && order == std::memory_order_relaxed)
+		{
+			if (hasStaticEqualOffsets())
+			{
+				// Load one, replicate.
+				// Be careful of the case where the post-bounds-check mask
+				// is 0, in which case we must not load.
+				T out = T(0);
+				If(AnyTrue(mask))
+				{
+					EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
+					out = T(el);
+				}
+				return out;
+			}
+
+			bool zeroMaskedLanes = true;
+			switch(robustness)
+			{
+			case OutOfBoundsBehavior::Nullify:
+			case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
+				zeroMaskedLanes = true;
+				break;
+			case OutOfBoundsBehavior::UndefinedValue:
+			case OutOfBoundsBehavior::UndefinedBehavior:
+				zeroMaskedLanes = false;
+				break;
+			}
+
+			if (hasStaticSequentialOffsets(sizeof(float)))
+			{
+				return rr::MaskedLoad(rr::Pointer<T>(base + staticOffsets[0]), mask, alignment, zeroMaskedLanes);
+			}
+
+			return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
+		}
+		else
+		{
+			T out;
+			auto anyLanesDisabled = AnyFalse(mask);
+			If(hasEqualOffsets() && !anyLanesDisabled)
+			{
+				// Load one, replicate.
+				auto offset = Extract(offs, 0);
+				out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
+			}
+			Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+			{
+				// Load all elements in a single SIMD instruction.
+				auto offset = Extract(offs, 0);
+				out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
+			}
+			Else
+			{
+				// Divergent offsets or masked lanes.
+				out = T(0);
+				for (int i = 0; i < SIMD::Width; i++)
+				{
+					If(Extract(mask, i) != 0)
+					{
+						auto offset = Extract(offs, i);
+						auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
+						out = Insert(out, el, i);
+					}
+				}
+			}
+			return out;
+		}
+	}
+
+	template<typename T>
+	inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+	{
+		using EL = typename Element<T>::type;
+		constexpr size_t alignment = sizeof(float);
+		auto offs = offsets();
+
+		switch(robustness)
+		{
+		case OutOfBoundsBehavior::Nullify:
+		case OutOfBoundsBehavior::RobustBufferAccess:  // TODO: Allows writing anywhere within bounds. Could be faster than masking.
+		case OutOfBoundsBehavior::UndefinedValue:  // Should not be used for store operations. Treat as robust buffer access.
+			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
+			break;
+		case OutOfBoundsBehavior::UndefinedBehavior:
+			// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
+			break;
+		}
+
+		if (!atomic && order == std::memory_order_relaxed)
+		{
+			if (hasStaticEqualOffsets())
+			{
+				If(AnyTrue(mask))
+				{
+					// All equal. One of these writes will win -- elect the winning lane.
+					auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+					auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
+					auto maskedVal = As<SIMD::Int>(val) & elect;
+					auto scalarVal = Extract(maskedVal, 0) |
+						Extract(maskedVal, 1) |
+						Extract(maskedVal, 2) |
+						Extract(maskedVal, 3);
+					*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
+				}
+			}
+			else if (hasStaticSequentialOffsets(sizeof(float)))
+			{
+				if (isStaticallyInBounds(sizeof(float), robustness))
+				{
+					// The pointer has no out-of-bounds elements, and the store is not atomic.
+					// Perform a read-modify-write.
+					auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
+					auto prev = *p;
+					*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
+				}
+				else
+				{
+					rr::MaskedStore(rr::Pointer<T>(base + staticOffsets[0]), val, mask, alignment);
+				}
+			}
+			else
+			{
+				rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
+			}
+		}
+		else
+		{
+			auto anyLanesDisabled = AnyFalse(mask);
+			If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+			{
+				// Store all elements in a single SIMD instruction.
+				auto offset = Extract(offs, 0);
+				rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
+			}
+			Else
+			{
+				// Divergent offsets or masked lanes.
+				for (int i = 0; i < SIMD::Width; i++)
+				{
+					If(Extract(mask, i) != 0)
+					{
+						auto offset = Extract(offs, i);
+						rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
+					}
+				}
+			}
+		}
+	}
+
+	template<typename T>
+	inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+	{
+		Store(T(val), robustness, mask, atomic, order);
+	}
+
+	template <typename T>
+	inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
+	{
+		T v1 = mask;              // [x]    [y]    [z]    [w]
+		T v2 = v1.xzxz & v1.ywyw; // [xy]   [zw]   [xy]   [zw]
+		return v2.xxxx & v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
+	}
+
+	template <typename T>
+	inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
+	{
+		T v1 = mask;              // [x]    [y]    [z]    [w]
+		T v2 = v1.xzxz | v1.ywyw; // [xy]   [zw]   [xy]   [zw]
+		return v2.xxxx | v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
+	}
+
+} // namespace sw
 
 #ifdef ENABLE_RR_PRINT
 namespace rr {
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 3410902..441ff43 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -35,247 +35,6 @@
 {
 	constexpr float PI = 3.141592653589793f;
 
-	rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
-	{
-		return rr::SignMask(ints) != 0;
-	}
-
-	rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints)
-	{
-		return rr::SignMask(~ints) != 0;
-	}
-
-	template <typename T>
-	rr::RValue<T> AndAll(rr::RValue<T> const &mask)
-	{
-		T v1 = mask;              // [x]    [y]    [z]    [w]
-		T v2 = v1.xzxz & v1.ywyw; // [xy]   [zw]   [xy]   [zw]
-		return v2.xxxx & v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
-	}
-
-	template <typename T>
-	rr::RValue<T> OrAll(rr::RValue<T> const &mask)
-	{
-		T v1 = mask;              // [x]    [y]    [z]    [w]
-		T v2 = v1.xzxz | v1.ywyw; // [xy]   [zw]   [xy]   [zw]
-		return v2.xxxx | v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
-	}
-
-	rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val)
-	{
-		return rr::As<sw::SIMD::Float>((rr::As<sw::SIMD::UInt>(val) & sw::SIMD::UInt(0x80000000)) | sw::SIMD::UInt(0x3f800000));
-	}
-
-	// Returns the <whole, frac> of val.
-	// Both whole and frac will have the same sign as val.
-	std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
-	Modf(rr::RValue<sw::SIMD::Float> const &val)
-	{
-		auto abs = Abs(val);
-		auto sign = Sign(val);
-		auto whole = Floor(abs) * sign;
-		auto frac = Frac(abs) * sign;
-		return std::make_pair(whole, frac);
-	}
-
-	// Returns the number of 1s in bits, per lane.
-	sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits)
-	{
-		// TODO: Add an intrinsic to reactor. Even if there isn't a
-		// single vector instruction, there may be target-dependent
-		// ways to make this faster.
-		// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-		sw::SIMD::UInt c = bits - ((bits >> 1) & sw::SIMD::UInt(0x55555555));
-		c = ((c >> 2) & sw::SIMD::UInt(0x33333333)) + (c & sw::SIMD::UInt(0x33333333));
-		c = ((c >> 4) + c) & sw::SIMD::UInt(0x0F0F0F0F);
-		c = ((c >> 8) + c) & sw::SIMD::UInt(0x00FF00FF);
-		c = ((c >> 16) + c) & sw::SIMD::UInt(0x0000FFFF);
-		return c;
-	}
-
-	// Returns 1 << bits.
-	// If the resulting bit overflows a 32 bit integer, 0 is returned.
-	rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits)
-	{
-		return ((sw::SIMD::UInt(1) << bits) & rr::CmpLT(bits, sw::SIMD::UInt(32)));
-	}
-
-	// Returns bitCount number of of 1's starting from the LSB.
-	rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount)
-	{
-		return NthBit32(bitCount) - sw::SIMD::UInt(1);
-	}
-
-	// Performs a fused-multiply add, returning a * b + c.
-	rr::RValue<sw::SIMD::Float> FMA(
-			rr::RValue<sw::SIMD::Float> const &a,
-			rr::RValue<sw::SIMD::Float> const &b,
-			rr::RValue<sw::SIMD::Float> const &c)
-	{
-		return a * b + c;
-	}
-
-	// Returns the exponent of the floating point number f.
-	// Assumes IEEE 754
-	rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f)
-	{
-		auto v = rr::As<sw::SIMD::UInt>(f);
-		return (sw::SIMD::Int((v >> sw::SIMD::UInt(23)) & sw::SIMD::UInt(0xFF)) - sw::SIMD::Int(126));
-	}
-
-	// Returns y if y < x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
-	{
-		using namespace rr;
-		auto xIsNan = IsNan(x);
-		auto yIsNan = IsNan(y);
-		return As<sw::SIMD::Float>(
-			// If neither are NaN, return min
-			((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Min(x, y))) |
-			// If one operand is a NaN, the other operand is the result
-			// If both operands are NaN, the result is a NaN.
-			((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
-			(( xIsNan          ) & As<sw::SIMD::Int>(y)));
-	}
-
-	// Returns y if y > x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y)
-	{
-		using namespace rr;
-		auto xIsNan = IsNan(x);
-		auto yIsNan = IsNan(y);
-		return As<sw::SIMD::Float>(
-			// If neither are NaN, return max
-			((~xIsNan & ~yIsNan) & As<sw::SIMD::Int>(Max(x, y))) |
-			// If one operand is a NaN, the other operand is the result
-			// If both operands are NaN, the result is a NaN.
-			((~xIsNan &  yIsNan) & As<sw::SIMD::Int>(x)) |
-			(( xIsNan          ) & As<sw::SIMD::Int>(y)));
-	}
-
-	// Returns the determinant of a 2x2 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
-	{
-		return a*d - b*c;
-	}
-
-	// Returns the determinant of a 3x3 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
-	{
-		return a*e*i + b*f*g + c*d*h - c*e*g - b*d*i - a*f*h;
-	}
-
-	// Returns the determinant of a 4x4 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
-	{
-		return a * Determinant(f, g, h,
-		                       j, k, l,
-		                       n, o, p) -
-		       b * Determinant(e, g, h,
-		                       i, k, l,
-		                       m, o, p) +
-		       c * Determinant(e, f, h,
-		                       i, j, l,
-		                       m, n, p) -
-		       d * Determinant(e, f, g,
-		                       i, j, k,
-		                       m, n, o);
-	}
-
-	// Returns the inverse of a 2x2 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d)
-	{
-		auto s = sw::SIMD::Float(1.0f) / Determinant(a, b, c, d);
-		return {{s*d, -s*b, -s*c, s*a}};
-	}
-
-	// Returns the inverse of a 3x3 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i)
-	{
-		auto s = sw::SIMD::Float(1.0f) / Determinant(
-				a, b, c,
-				d, e, f,
-				g, h, i); // TODO: duplicate arithmetic calculating the det and below.
-
-		return {{
-			s * (e*i - f*h), s * (c*h - b*i), s * (b*f - c*e),
-			s * (f*g - d*i), s * (a*i - c*g), s * (c*d - a*f),
-			s * (d*h - e*g), s * (b*g - a*h), s * (a*e - b*d),
-		}};
-	}
-
-	// Returns the inverse of a 4x4 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p)
-	{
-		auto s = sw::SIMD::Float(1.0f) / Determinant(
-				a, b, c, d,
-				e, f, g, h,
-				i, j, k, l,
-				m, n, o, p); // TODO: duplicate arithmetic calculating the det and below.
-
-		auto kplo = k*p - l*o, jpln = j*p - l*n, jokn = j*o - k*n;
-		auto gpho = g*p - h*o, fphn = f*p - h*n, fogn = f*o - g*n;
-		auto glhk = g*l - h*k, flhj = f*l - h*j, fkgj = f*k - g*j;
-		auto iplm = i*p - l*m, iokm = i*o - k*m, ephm = e*p - h*m;
-		auto eogm = e*o - g*m, elhi = e*l - h*i, ekgi = e*k - g*i;
-		auto injm = i*n - j*m, enfm = e*n - f*m, ejfi = e*j - f*i;
-
-		return {{
-			s * ( f * kplo - g * jpln + h * jokn),
-			s * (-b * kplo + c * jpln - d * jokn),
-			s * ( b * gpho - c * fphn + d * fogn),
-			s * (-b * glhk + c * flhj - d * fkgj),
-
-			s * (-e * kplo + g * iplm - h * iokm),
-			s * ( a * kplo - c * iplm + d * iokm),
-			s * (-a * gpho + c * ephm - d * eogm),
-			s * ( a * glhk - c * elhi + d * ekgi),
-
-			s * ( e * jpln - f * iplm + h * injm),
-			s * (-a * jpln + b * iplm - d * injm),
-			s * ( a * fphn - b * ephm + d * enfm),
-			s * (-a * flhj + b * elhi - d * ejfi),
-
-			s * (-e * jokn + f * iokm - g * injm),
-			s * ( a * jokn - b * iokm + c * injm),
-			s * (-a * fogn + b * eogm - c * enfm),
-			s * ( a * fkgj - b * ekgi + c * ejfi),
-		}};
-	}
-
-
-	sw::SIMD::Pointer interleaveByLane(sw::SIMD::Pointer p)
-	{
-		p *= sw::SIMD::Width;
-		p.staticOffsets[0] += 0 * sizeof(float);
-		p.staticOffsets[1] += 1 * sizeof(float);
-		p.staticOffsets[2] += 2 * sizeof(float);
-		p.staticOffsets[3] += 3 * sizeof(float);
-		return p;
-	}
-
 	VkFormat SpirvFormatToVulkanFormat(spv::ImageFormat format)
 	{
 		switch (format)
@@ -317,198 +76,6 @@
 
 namespace sw
 {
-	namespace SIMD
-	{
-
-		template<typename T>
-		T Load(Pointer ptr, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
-		{
-			using EL = typename Element<T>::type;
-
-			if (ptr.isStaticallyInBounds(sizeof(float), robustness))
-			{
-				// All elements are statically known to be in-bounds.
-				// We can avoid costly conditional on masks.
-
-				if (ptr.hasStaticSequentialOffsets(sizeof(float)))
-				{
-					// Offsets are sequential. Perform regular load.
-					return rr::Load(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), alignment, atomic, order);
-				}
-				if (ptr.hasStaticEqualOffsets())
-				{
-					// Load one, replicate.
-					return T(*rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], alignment));
-				}
-			}
-			else
-			{
-				switch(robustness)
-				{
-				case OutOfBoundsBehavior::Nullify:
-				case OutOfBoundsBehavior::RobustBufferAccess:
-				case OutOfBoundsBehavior::UndefinedValue:
-					mask &= ptr.isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
-					break;
-				case OutOfBoundsBehavior::UndefinedBehavior:
-					// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
-					break;
-				}
-			}
-
-			auto offsets = ptr.offsets();
-
-			if (!atomic && order == std::memory_order_relaxed)
-			{
-				if (ptr.hasStaticEqualOffsets())
-				{
-					// Load one, replicate.
-					// Be careful of the case where the post-bounds-check mask
-					// is 0, in which case we must not load.
-					T out = T(0);
-					If(AnyTrue(mask))
-					{
-						EL el = *rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], alignment);
-						out = T(el);
-					}
-					return out;
-				}
-
-				bool zeroMaskedLanes = true;
-				switch(robustness)
-				{
-				case OutOfBoundsBehavior::Nullify:
-				case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
-					zeroMaskedLanes = true;
-					break;
-				case OutOfBoundsBehavior::UndefinedValue:
-				case OutOfBoundsBehavior::UndefinedBehavior:
-					zeroMaskedLanes = false;
-					break;
-				}
-
-				if (ptr.hasStaticSequentialOffsets(sizeof(float)))
-				{
-					return rr::MaskedLoad(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), mask, alignment, zeroMaskedLanes);
-				}
-
-				return rr::Gather(rr::Pointer<EL>(ptr.base), offsets, mask, alignment, zeroMaskedLanes);
-			}
-			else
-			{
-				T out;
-				auto anyLanesDisabled = AnyFalse(mask);
-				If(ptr.hasEqualOffsets() && !anyLanesDisabled)
-				{
-					// Load one, replicate.
-					auto offset = Extract(offsets, 0);
-					out = T(rr::Load(rr::Pointer<EL>(&ptr.base[offset]), alignment, atomic, order));
-				}
-				Else If(ptr.hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
-				{
-					// Load all elements in a single SIMD instruction.
-					auto offset = Extract(offsets, 0);
-					out = rr::Load(rr::Pointer<T>(&ptr.base[offset]), alignment, atomic, order);
-				}
-				Else
-				{
-					// Divergent offsets or masked lanes.
-					out = T(0);
-					for (int i = 0; i < SIMD::Width; i++)
-					{
-						If(Extract(mask, i) != 0)
-						{
-							auto offset = Extract(offsets, i);
-							auto el = rr::Load(rr::Pointer<EL>(&ptr.base[offset]), alignment, atomic, order);
-							out = Insert(out, el, i);
-						}
-					}
-				}
-				return out;
-			}
-		}
-
-		template<typename T>
-		void Store(Pointer ptr, T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
-		{
-			using EL = typename Element<T>::type;
-			constexpr size_t alignment = sizeof(float);
-			auto offsets = ptr.offsets();
-
-			switch(robustness)
-			{
-			case OutOfBoundsBehavior::Nullify:
-			case OutOfBoundsBehavior::RobustBufferAccess:  // TODO: Allows writing anywhere within bounds. Could be faster than masking.
-			case OutOfBoundsBehavior::UndefinedValue:  // Should not be used for store operations. Treat as robust buffer access.
-				mask &= ptr.isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
-				break;
-			case OutOfBoundsBehavior::UndefinedBehavior:
-				// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
-				break;
-			}
-
-			if (!atomic && order == std::memory_order_relaxed)
-			{
-				if (ptr.hasStaticEqualOffsets())
-				{
-					If (AnyTrue(mask))
-					{
-						// All equal. One of these writes will win -- elect the winning lane.
-						auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-						auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
-						auto maskedVal = As<SIMD::Int>(val) & elect;
-						auto scalarVal = Extract(maskedVal, 0) |
-							Extract(maskedVal, 1) |
-							Extract(maskedVal, 2) |
-							Extract(maskedVal, 3);
-						*rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], alignment) = As<EL>(scalarVal);
-					}
-				}
-				else if (ptr.hasStaticSequentialOffsets(sizeof(float)))
-				{
-					if (ptr.isStaticallyInBounds(sizeof(float), robustness))
-					{
-						// Pointer has no elements OOB, and the store is not atomic.
-						// Perform a RMW.
-						auto p = rr::Pointer<SIMD::Int>(ptr.base + ptr.staticOffsets[0], alignment);
-						auto prev = *p;
-						*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
-					}
-					else
-					{
-						rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, alignment);
-					}
-				}
-				else
-				{
-					rr::Scatter(rr::Pointer<EL>(ptr.base), val, offsets, mask, alignment);
-				}
-			}
-			else
-			{
-				auto anyLanesDisabled = AnyFalse(mask);
-				If(ptr.hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
-				{
-					// Store all elements in a single SIMD instruction.
-					auto offset = Extract(offsets, 0);
-					Store(val, rr::Pointer<T>(&ptr.base[offset]), alignment, atomic, order);
-				}
-				Else
-				{
-					// Divergent offsets or masked lanes.
-					for (int i = 0; i < SIMD::Width; i++)
-					{
-						If(Extract(mask, i) != 0)
-						{
-							auto offset = Extract(offsets, i);
-							rr::Store(Extract(val, i), rr::Pointer<EL>(&ptr.base[offset]), alignment, atomic, order);
-						}
-					}
-				}
-			}
-		}
-
-	} // namespace SIMD
 
 	SpirvShader::SpirvShader(
 			uint32_t codeSerialID,
@@ -1398,6 +965,16 @@
 		}
 	}
 
+	sw::SIMD::Pointer SpirvShader::InterleaveByLane(sw::SIMD::Pointer p)
+	{
+		p *= sw::SIMD::Width;
+		p.staticOffsets[0] += 0 * sizeof(float);
+		p.staticOffsets[1] += 1 * sizeof(float);
+		p.staticOffsets[2] += 2 * sizeof(float);
+		p.staticOffsets[3] += 3 * sizeof(float);
+		return p;
+	}
+
 	bool SpirvShader::IsStorageInterleavedByLane(spv::StorageClass storageClass)
 	{
 		switch (storageClass)
@@ -2923,9 +2500,9 @@
 				VisitMemoryObject(resultId, [&](uint32_t i, uint32_t offset)
 				{
 					auto p = ptr + offset;
-					if (interleavedByLane) { p = interleaveByLane(p); }
+					if (interleavedByLane) { p = InterleaveByLane(p); }
 					auto robustness = OutOfBoundsBehavior::UndefinedBehavior;  // Local variables are always within bounds.
-					SIMD::Store(p, initialValue.Float(i), robustness, state->activeLaneMask());
+					p.Store(initialValue.Float(i), robustness, state->activeLaneMask());
 				});
 				break;
 			}
@@ -2975,8 +2552,8 @@
 		VisitMemoryObject(pointerId, [&](uint32_t i, uint32_t offset)
 		{
 			auto p = ptr + offset;
-			if (interleavedByLane) { p = interleaveByLane(p); }  // TODO: Interleave once, then add offset?
-			dst.move(i, SIMD::Load<SIMD::Float>(p, robustness, state->activeLaneMask(), atomic, memoryOrder));
+			if (interleavedByLane) { p = InterleaveByLane(p); }  // TODO: Interleave once, then add offset?
+			dst.move(i, p.Load<SIMD::Float>(robustness, state->activeLaneMask(), atomic, memoryOrder));
 		});
 
 		return EmitResult::Continue;
@@ -3019,8 +2596,8 @@
 			VisitMemoryObject(pointerId, [&](uint32_t i, uint32_t offset)
 			{
 				auto p = ptr + offset;
-				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, SIMD::Int(src[i]), robustness, mask, atomic, memoryOrder);
+				if (interleavedByLane) { p = InterleaveByLane(p); }
+				p.Store(SIMD::Int(src[i]), robustness, mask, atomic, memoryOrder);
 			});
 		}
 		else
@@ -3030,8 +2607,8 @@
 			VisitMemoryObject(pointerId, [&](uint32_t i, uint32_t offset)
 			{
 				auto p = ptr + offset;
-				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, src.Float(i), robustness, mask, atomic, memoryOrder);
+				if (interleavedByLane) { p = InterleaveByLane(p); }
+				p.Store(src.Float(i), robustness, mask, atomic, memoryOrder);
 			});
 		}
 
@@ -4079,8 +3656,8 @@
 				std::tie(whole, frac) = Modf(val.Float(i));
 				dst.move(i, frac);
 				auto p = ptr + (i * sizeof(float));
-				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, whole, robustness, state->activeLaneMask());
+				if (interleavedByLane) { p = InterleaveByLane(p); }
+				p.Store(whole, robustness, state->activeLaneMask());
 			}
 			break;
 		}
@@ -4220,8 +3797,8 @@
 				dst.move(i, significand);
 
 				auto p = ptr + (i * sizeof(float));
-				if (interleavedByLane) { p = interleaveByLane(p); }
-				SIMD::Store(p, exponent, robustness, state->activeLaneMask());
+				if (interleavedByLane) { p = InterleaveByLane(p); }
+				p.Store(exponent, robustness, state->activeLaneMask());
 			}
 			break;
 		}
@@ -5496,7 +5073,7 @@
 		// TODO: specialize for small formats?
 		for (auto i = 0; i < (texelSize + 3)/4; i++)
 		{
-			packed[i] = SIMD::Load<SIMD::Int>(texelPtr, robustness, state->activeLaneMask(), false, std::memory_order_relaxed, std::min(texelSize, 4));
+			packed[i] = texelPtr.Load<SIMD::Int>(robustness, state->activeLaneMask(), false, std::memory_order_relaxed, std::min(texelSize, 4));
 			texelPtr += sizeof(float);
 		}
 
@@ -5849,7 +5426,7 @@
 
 		for (auto i = 0u; i < numPackedElements; i++)
 		{
-			SIMD::Store(texelPtr, packed[i], robustness, state->activeLaneMask());
+			texelPtr.Store(packed[i], robustness, state->activeLaneMask());
 			texelPtr += sizeof(float);
 		}
 
@@ -6037,14 +5614,14 @@
 
 			auto dst = dstPtr + dstOffset;
 			auto src = srcPtr + srcOffset;
-			if (dstInterleavedByLane) { dst = interleaveByLane(dst); }
-			if (srcInterleavedByLane) { src = interleaveByLane(src); }
+			if (dstInterleavedByLane) { dst = InterleaveByLane(dst); }
+			if (srcInterleavedByLane) { src = InterleaveByLane(src); }
 
 			// TODO(b/131224163): Optimize based on src/dst storage classes.
 			auto robustness = OutOfBoundsBehavior::RobustBufferAccess;
 
-			auto value = SIMD::Load<SIMD::Float>(src, robustness, state->activeLaneMask());
-			SIMD::Store(dst, value, robustness, state->activeLaneMask());
+			auto value = src.Load<SIMD::Float>(robustness, state->activeLaneMask());
+			dst.Store(value, robustness, state->activeLaneMask());
 		});
 		return EmitResult::Continue;
 	}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 34565cb..4889c57 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -57,248 +57,6 @@
 	// Forward declarations.
 	class SpirvRoutine;
 
-	enum class OutOfBoundsBehavior
-	{
-		Nullify,             // Loads become zero, stores are elided.
-		RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
-		UndefinedValue,      // Only for load operations. Not secure. No program termination.
-		UndefinedBehavior,   // Program may terminate.
-	};
-
-	// SIMD contains types that represent multiple scalars packed into a single
-	// vector data type. Types in the SIMD namespace provide a semantic hint
-	// that the data should be treated as a per-execution-lane scalar instead of
-	// a typical euclidean-style vector type.
-	namespace SIMD
-	{
-		// Width is the number of per-lane scalars packed into each SIMD vector.
-		static constexpr int Width = 4;
-
-		using Float = rr::Float4;
-		using Int = rr::Int4;
-		using UInt = rr::UInt4;
-
-		struct Pointer
-		{
-			Pointer(rr::Pointer<Byte> base, rr::Int limit)
-				: base(base),
-				  dynamicLimit(limit), staticLimit(0),
-				  dynamicOffsets(0), staticOffsets{},
-				  hasDynamicLimit(true), hasDynamicOffsets(false) {}
-
-			Pointer(rr::Pointer<Byte> base, unsigned int limit)
-				: base(base),
-				  dynamicLimit(0), staticLimit(limit),
-				  dynamicOffsets(0), staticOffsets{},
-				  hasDynamicLimit(false), hasDynamicOffsets(false) {}
-
-			Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset)
-				: base(base),
-				  dynamicLimit(limit), staticLimit(0),
-				  dynamicOffsets(offset), staticOffsets{},
-				  hasDynamicLimit(true), hasDynamicOffsets(true) {}
-
-			Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset)
-				: base(base),
-				  dynamicLimit(0), staticLimit(limit),
-				  dynamicOffsets(offset), staticOffsets{},
-				  hasDynamicLimit(false), hasDynamicOffsets(true) {}
-
-			inline Pointer& operator += (Int i)
-			{
-				dynamicOffsets += i;
-				hasDynamicOffsets = true;
-				return *this;
-			}
-
-			inline Pointer& operator *= (Int i)
-			{
-				dynamicOffsets = offsets() * i;
-				staticOffsets = {};
-				hasDynamicOffsets = true;
-				return *this;
-			}
-
-			inline Pointer operator + (SIMD::Int i) { Pointer p = *this; p += i; return p; }
-			inline Pointer operator * (SIMD::Int i) { Pointer p = *this; p *= i; return p; }
-
-			inline Pointer& operator += (int i)
-			{
-				for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] += i; }
-				return *this;
-			}
-
-			inline Pointer& operator *= (int i)
-			{
-				for (int el = 0; el < SIMD::Width; el++) { staticOffsets[el] *= i; }
-				if (hasDynamicOffsets)
-				{
-					dynamicOffsets *= SIMD::Int(i);
-				}
-				return *this;
-			}
-
-			inline Pointer operator + (int i) { Pointer p = *this; p += i; return p; }
-			inline Pointer operator * (int i) { Pointer p = *this; p *= i; return p; }
-
-			inline SIMD::Int offsets() const
-			{
-				static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-				return dynamicOffsets + SIMD::Int(staticOffsets[0], staticOffsets[1], staticOffsets[2], staticOffsets[3]);
-			}
-
-			inline SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
-			{
-				ASSERT(accessSize > 0);
-
-				if (isStaticallyInBounds(accessSize, robustness))
-				{
-					return SIMD::Int(0xffffffff);
-				}
-
-				if (!hasDynamicOffsets && !hasDynamicLimit)
-				{
-					// Common fast paths.
-					static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-					return SIMD::Int(
-						(staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
-						(staticOffsets[1] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
-						(staticOffsets[2] + accessSize - 1 < staticLimit) ? 0xffffffff : 0,
-						(staticOffsets[3] + accessSize - 1 < staticLimit) ? 0xffffffff : 0);
-				}
-
-				return CmpLT(offsets() + SIMD::Int(accessSize - 1), SIMD::Int(limit()));
-			}
-
-			inline bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
-			{
-				if (hasDynamicOffsets)
-				{
-					return false;
-				}
-
-				if (hasDynamicLimit)
-				{
-					if (hasStaticEqualOffsets() || hasStaticSequentialOffsets(accessSize))
-					{
-						switch(robustness)
-						{
-						case OutOfBoundsBehavior::UndefinedBehavior:
-							// With this robustness setting the application/compiler guarantees in-bounds accesses on active lanes,
-							// but since it can't know in advance which branches are taken this must be true even for inactives lanes.
-							return true;
-						case OutOfBoundsBehavior::Nullify:
-						case OutOfBoundsBehavior::RobustBufferAccess:
-						case OutOfBoundsBehavior::UndefinedValue:
-							return false;
-						}
-					}
-				}
-
-				for (int i = 0; i < SIMD::Width; i++)
-				{
-					if (staticOffsets[i] + accessSize - 1 >= staticLimit)
-					{
-						return false;
-					}
-				}
-
-				return true;
-			}
-
-			inline Int limit() const
-			{
-				return dynamicLimit + staticLimit;
-			}
-
-			// Returns true if all offsets are sequential
-			// (N+0*step, N+1*step, N+2*step, N+3*step)
-			inline rr::Bool hasSequentialOffsets(unsigned int step) const
-			{
-				if (hasDynamicOffsets)
-				{
-					auto o = offsets();
-					static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-					return rr::SignMask(~CmpEQ(o.yzww, o + SIMD::Int(1*step, 2*step, 3*step, 0))) == 0;
-				}
-				return hasStaticSequentialOffsets(step);
-			}
-
-			// Returns true if all offsets are are compile-time static and
-			// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
-			inline bool hasStaticSequentialOffsets(unsigned int step) const
-			{
-				if (hasDynamicOffsets)
-				{
-					return false;
-				}
-				for (int i = 1; i < SIMD::Width; i++)
-				{
-					if (staticOffsets[i-1] + int32_t(step) != staticOffsets[i]) { return false; }
-				}
-				return true;
-			}
-
-			// Returns true if all offsets are equal (N, N, N, N)
-			inline rr::Bool hasEqualOffsets() const
-			{
-				if (hasDynamicOffsets)
-				{
-					auto o = offsets();
-					static_assert(SIMD::Width == 4, "Expects SIMD::Width to be 4");
-					return rr::SignMask(~CmpEQ(o, o.yzwx)) == 0;
-				}
-				return hasStaticEqualOffsets();
-			}
-
-			// Returns true if all offsets are compile-time static and are equal
-			// (N, N, N, N)
-			inline bool hasStaticEqualOffsets() const
-			{
-				if (hasDynamicOffsets)
-				{
-					return false;
-				}
-				for (int i = 1; i < SIMD::Width; i++)
-				{
-					if (staticOffsets[i-1] != staticOffsets[i]) { return false; }
-				}
-				return true;
-			}
-
-			// Base address for the pointer, common across all lanes.
-			rr::Pointer<rr::Byte> base;
-
-			// Upper (non-inclusive) limit for offsets from base.
-			rr::Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
-			unsigned int staticLimit;
-
-			// Per lane offsets from base.
-			SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
-			std::array<int32_t, SIMD::Width> staticOffsets;
-
-			bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
-			bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
-		};
-
-		template <typename T> struct Element {};
-		template <> struct Element<Float> { using type = rr::Float; };
-		template <> struct Element<Int>   { using type = rr::Int; };
-		template <> struct Element<UInt>  { using type = rr::UInt; };
-
-		template<typename T>
-		void Store(Pointer ptr, T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
-
-		template<typename T>
-		void Store(Pointer ptr, RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed)
-		{
-			Store(ptr, T(val), robustness, mask, atomic, order);
-		}
-
-		template<typename T>
-		T Load(Pointer ptr, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
-	}
-
 	// Incrementally constructed complex bundle of rvalues
 	// Effectively a restricted vector, supporting only:
 	// - allocation to a (runtime-known) fixed size
@@ -967,7 +725,9 @@
 		//
 		static bool IsStorageInterleavedByLane(spv::StorageClass storageClass);
 		static bool IsExplicitLayout(spv::StorageClass storageClass);
-	
+
+		static sw::SIMD::Pointer InterleaveByLane(sw::SIMD::Pointer p);
+
 		// Output storage buffers and images should not be affected by helper invocations
 		static bool StoresInHelperInvocation(spv::StorageClass storageClass);