Do not indent C++ namespace contents

This is a style change. Visual Studio defaults to indenting namespace
contents, and this was adopted for a long time, but with the new Vulkan
implementation this was abandoned. However, the legacy code borrowed from
the OpenGL ES implementation still used indentation so it was
inconsistent.

The justification for not indenting namespace contents is that
namespaces are merely a way to avoid name clashes with other projects
we don't control directly (and in rare cases internal subprojects when
we want to reuse the same names). Hence the vast majority of files have
a single namespace, and unlike indentation used for ease of discerning
control flow blocks, class contents, or function contents, which can
become highly nested, there is no such readability advantage to
indenting namespace contents.

This is also the Google style recommendation (though no justification or
discussion is provided):
https://google.github.io/styleguide/cppguide.html#Namespace_Formatting

One reasonable counter-argument is consistency with other blocks of
curly brackets, but considering that most namespaces span almost the
entire file, it's a substantial waste of line length.

Because there is no indentation, there's also no need to have the open
and closing brackets line up as a visual aid, like we prefer for other
uses of curly brackets. So we place the open bracket on the same line as
the namespace keyword.

A comment is added to the closing bracket to discern it from other
closing brackets. It also makes it easier to find the end of anonymous
namespaces which typically go at the top of the source file.

This change is made separately from applying clang-format because diff
tools mark all these unindented lines as changes and this makes it hard
to review the smaller style changes made by clang-format. The OpenGL ES
and Direct3D code is left untouched because it is in maintenance mode
and in case of regressions we want easy 'blame' tool usage.

Bug: b/144825072
Change-Id: Ie2925ebd697e1ffa7c4cbdc9a946531f11f4d934
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/39348
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index e95d4d0..8ac2cc2 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -23,507 +23,508 @@
 #include <array>
 #include <utility> // std::pair
 
-namespace sw
+namespace sw {
+
+using namespace rr;
+
+class Vector4s
 {
-	using namespace rr;
+public:
+	Vector4s();
+	Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+	Vector4s(const Vector4s &rhs);
 
-	class Vector4s
-	{
-	public:
-		Vector4s();
-		Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
-		Vector4s(const Vector4s &rhs);
+	Short4 &operator[](int i);
+	Vector4s &operator=(const Vector4s &rhs);
 
-		Short4 &operator[](int i);
-		Vector4s &operator=(const Vector4s &rhs);
+	Short4 x;
+	Short4 y;
+	Short4 z;
+	Short4 w;
+};
 
-		Short4 x;
-		Short4 y;
-		Short4 z;
-		Short4 w;
-	};
+class Vector4f
+{
+public:
+	Vector4f();
+	Vector4f(float x, float y, float z, float w);
+	Vector4f(const Vector4f &rhs);
 
-	class Vector4f
-	{
-	public:
-		Vector4f();
-		Vector4f(float x, float y, float z, float w);
-		Vector4f(const Vector4f &rhs);
+	Float4 &operator[](int i);
+	Vector4f &operator=(const Vector4f &rhs);
 
-		Float4 &operator[](int i);
-		Vector4f &operator=(const Vector4f &rhs);
+	Float4 x;
+	Float4 y;
+	Float4 z;
+	Float4 w;
+};
 
-		Float4 x;
-		Float4 y;
-		Float4 z;
-		Float4 w;
-	};
+enum class OutOfBoundsBehavior
+{
+	Nullify,             // Loads become zero, stores are elided.
+	RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
+	UndefinedValue,      // Only for load operations. Not secure. No program termination.
+	UndefinedBehavior,   // Program may terminate.
+};
 
-	enum class OutOfBoundsBehavior
-	{
-		Nullify,             // Loads become zero, stores are elided.
-		RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
-		UndefinedValue,      // Only for load operations. Not secure. No program termination.
-		UndefinedBehavior,   // Program may terminate.
-	};
+// SIMD contains types that represent multiple scalars packed into a single
+// vector data type. Types in the SIMD namespace provide a semantic hint
+// that the data should be treated as a per-execution-lane scalar instead of
+// a typical euclidean-style vector type.
+namespace SIMD {
 
-	// SIMD contains types that represent multiple scalars packed into a single
-	// vector data type. Types in the SIMD namespace provide a semantic hint
-	// that the data should be treated as a per-execution-lane scalar instead of
-	// a typical euclidean-style vector type.
-	namespace SIMD
-	{
-		// Width is the number of per-lane scalars packed into each SIMD vector.
-		static constexpr int Width = 4;
+// Width is the number of per-lane scalars packed into each SIMD vector.
+static constexpr int Width = 4;
 
-		using Float = rr::Float4;
-		using Int = rr::Int4;
-		using UInt = rr::UInt4;
+using Float = rr::Float4;
+using Int = rr::Int4;
+using UInt = rr::UInt4;
 
-		struct Pointer
-		{
-			Pointer(rr::Pointer<Byte> base, rr::Int limit);
-			Pointer(rr::Pointer<Byte> base, unsigned int limit);
-			Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
-			Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
+struct Pointer
+{
+	Pointer(rr::Pointer<Byte> base, rr::Int limit);
+	Pointer(rr::Pointer<Byte> base, unsigned int limit);
+	Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
+	Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
 
-			Pointer& operator += (Int i);
-			Pointer& operator *= (Int i);
+	Pointer& operator += (Int i);
+	Pointer& operator *= (Int i);
 
-			Pointer operator + (SIMD::Int i);
-			Pointer operator * (SIMD::Int i);
+	Pointer operator + (SIMD::Int i);
+	Pointer operator * (SIMD::Int i);
 
-			Pointer& operator += (int i);
-			Pointer& operator *= (int i);
+	Pointer& operator += (int i);
+	Pointer& operator *= (int i);
 
-			Pointer operator + (int i);
-			Pointer operator * (int i);
+	Pointer operator + (int i);
+	Pointer operator * (int i);
 
-			SIMD::Int offsets() const;
+	SIMD::Int offsets() const;
 
-			SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+	SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
 
-			bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+	bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
 
-			Int limit() const;
+	Int limit() const;
 
-			// Returns true if all offsets are sequential
-			// (N+0*step, N+1*step, N+2*step, N+3*step)
-			rr::Bool hasSequentialOffsets(unsigned int step) const;
+	// Returns true if all offsets are sequential
+	// (N+0*step, N+1*step, N+2*step, N+3*step)
+	rr::Bool hasSequentialOffsets(unsigned int step) const;
 
-			// Returns true if all offsets are are compile-time static and
-			// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
-			bool hasStaticSequentialOffsets(unsigned int step) const;
+	// Returns true if all offsets are are compile-time static and
+	// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
+	bool hasStaticSequentialOffsets(unsigned int step) const;
 
-			// Returns true if all offsets are equal (N, N, N, N)
-			rr::Bool hasEqualOffsets() const;
+	// Returns true if all offsets are equal (N, N, N, N)
+	rr::Bool hasEqualOffsets() const;
 
-			// Returns true if all offsets are compile-time static and are equal
-			// (N, N, N, N)
-			bool hasStaticEqualOffsets() const;
-
-			template<typename T>
-			inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
-
-			template<typename T>
-			inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
-
-			template<typename T>
-			inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
-
-			// Base address for the pointer, common across all lanes.
-			rr::Pointer<rr::Byte> base;
-
-			// Upper (non-inclusive) limit for offsets from base.
-			rr::Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
-			unsigned int staticLimit;
-
-			// Per lane offsets from base.
-			SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
-			std::array<int32_t, SIMD::Width> staticOffsets;
-
-			bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
-			bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
-		};
-
-		template <typename T> struct Element {};
-		template <> struct Element<Float> { using type = rr::Float; };
-		template <> struct Element<Int>   { using type = rr::Int; };
-		template <> struct Element<UInt>  { using type = rr::UInt; };
-
-	} // namespace SIMD
-
-	Float4 exponential2(RValue<Float4> x, bool pp = false);
-	Float4 logarithm2(RValue<Float4> x, bool pp = false);
-	Float4 exponential(RValue<Float4> x, bool pp = false);
-	Float4 logarithm(RValue<Float4> x, bool pp = false);
-	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
-	Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
-	Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
-	Float4 modulo(RValue<Float4> x, RValue<Float4> y);
-	Float4 sine_pi(RValue<Float4> x, bool pp = false);     // limited to [-pi, pi] range
-	Float4 cosine_pi(RValue<Float4> x, bool pp = false);   // limited to [-pi, pi] range
-	Float4 sine(RValue<Float4> x, bool pp = false);
-	Float4 cosine(RValue<Float4> x, bool pp = false);
-	Float4 tangent(RValue<Float4> x, bool pp = false);
-	Float4 arccos(RValue<Float4> x, bool pp = false);
-	Float4 arcsin(RValue<Float4> x, bool pp = false);
-	Float4 arctan(RValue<Float4> x, bool pp = false);
-	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
-	Float4 sineh(RValue<Float4> x, bool pp = false);
-	Float4 cosineh(RValue<Float4> x, bool pp = false);
-	Float4 tangenth(RValue<Float4> x, bool pp = false);
-	Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
-	Float4 arcsinh(RValue<Float4> x, bool pp = false);
-	Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range
-
-	Float4 dot2(const Vector4f &v0, const Vector4f &v1);
-	Float4 dot3(const Vector4f &v0, const Vector4f &v1);
-	Float4 dot4(const Vector4f &v0, const Vector4f &v1);
-
-	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
-	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
-	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
-	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
-
-	UInt4 halfToFloatBits(UInt4 halfBits);
-
-	rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);
-
-	rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints);
-
-	template <typename T>
-	inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);
-
-	template <typename T>
-	inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);
-
-	rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);
-
-	// Returns the <whole, frac> of val.
-	// Both whole and frac will have the same sign as val.
-	std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
-	Modf(rr::RValue<sw::SIMD::Float> const &val);
-
-	// Returns the number of 1s in bits, per lane.
-	sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);
-
-	// Returns 1 << bits.
-	// If the resulting bit overflows a 32 bit integer, 0 is returned.
-	rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);
-
-	// Returns bitCount number of of 1's starting from the LSB.
-	rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);
-
-	// Performs a fused-multiply add, returning a * b + c.
-	rr::RValue<sw::SIMD::Float> FMA(
-			rr::RValue<sw::SIMD::Float> const &a,
-			rr::RValue<sw::SIMD::Float> const &b,
-			rr::RValue<sw::SIMD::Float> const &c);
-
-	// Returns the exponent of the floating point number f.
-	// Assumes IEEE 754
-	rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);
-
-	// Returns y if y < x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
-
-	// Returns y if y > x; otherwise result is x.
-	// If one operand is a NaN, the other operand is the result.
-	// If both operands are NaN, the result is a NaN.
-	rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
-
-	// Returns the determinant of a 2x2 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
-
-	// Returns the determinant of a 3x3 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
-
-	// Returns the determinant of a 4x4 matrix.
-	rr::RValue<sw::SIMD::Float> Determinant(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
-
-	// Returns the inverse of a 2x2 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
-		rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
-
-	// Returns the inverse of a 3x3 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
-		rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
-		rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
-
-	// Returns the inverse of a 4x4 matrix.
-	std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
-		rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
-		rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
-		rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
-		rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
-
-	////////////////////////////////////////////////////////////////////////////
-	// Inline functions
-	////////////////////////////////////////////////////////////////////////////
+	// Returns true if all offsets are compile-time static and are equal
+	// (N, N, N, N)
+	bool hasStaticEqualOffsets() const;
 
 	template<typename T>
-	inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+	inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
+
+	template<typename T>
+	inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+
+	template<typename T>
+	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+
+	// Base address for the pointer, common across all lanes.
+	rr::Pointer<rr::Byte> base;
+
+	// Upper (non-inclusive) limit for offsets from base.
+	rr::Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
+	unsigned int staticLimit;
+
+	// Per lane offsets from base.
+	SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
+	std::array<int32_t, SIMD::Width> staticOffsets;
+
+	bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
+	bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
+};
+
+template <typename T> struct Element {};
+template <> struct Element<Float> { using type = rr::Float; };
+template <> struct Element<Int>   { using type = rr::Int; };
+template <> struct Element<UInt>  { using type = rr::UInt; };
+
+} // namespace SIMD
+
+Float4 exponential2(RValue<Float4> x, bool pp = false);
+Float4 logarithm2(RValue<Float4> x, bool pp = false);
+Float4 exponential(RValue<Float4> x, bool pp = false);
+Float4 logarithm(RValue<Float4> x, bool pp = false);
+Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
+Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
+Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
+Float4 modulo(RValue<Float4> x, RValue<Float4> y);
+Float4 sine_pi(RValue<Float4> x, bool pp = false);     // limited to [-pi, pi] range
+Float4 cosine_pi(RValue<Float4> x, bool pp = false);   // limited to [-pi, pi] range
+Float4 sine(RValue<Float4> x, bool pp = false);
+Float4 cosine(RValue<Float4> x, bool pp = false);
+Float4 tangent(RValue<Float4> x, bool pp = false);
+Float4 arccos(RValue<Float4> x, bool pp = false);
+Float4 arcsin(RValue<Float4> x, bool pp = false);
+Float4 arctan(RValue<Float4> x, bool pp = false);
+Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
+Float4 sineh(RValue<Float4> x, bool pp = false);
+Float4 cosineh(RValue<Float4> x, bool pp = false);
+Float4 tangenth(RValue<Float4> x, bool pp = false);
+Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
+Float4 arcsinh(RValue<Float4> x, bool pp = false);
+Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range
+
+Float4 dot2(const Vector4f &v0, const Vector4f &v1);
+Float4 dot3(const Vector4f &v0, const Vector4f &v1);
+Float4 dot4(const Vector4f &v0, const Vector4f &v1);
+
+void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
+
+UInt4 halfToFloatBits(UInt4 halfBits);
+
+rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);
+
+rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints);
+
+template <typename T>
+inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);
+
+template <typename T>
+inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);
+
+rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);
+
+// Returns the <whole, frac> of val.
+// Both whole and frac will have the same sign as val.
+std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
+Modf(rr::RValue<sw::SIMD::Float> const &val);
+
+// Returns the number of 1s in bits, per lane.
+sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);
+
+// Returns 1 << bits.
+// If the resulting bit overflows a 32 bit integer, 0 is returned.
+rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);
+
+// Returns bitCount number of of 1's starting from the LSB.
+rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);
+
+// Performs a fused-multiply add, returning a * b + c.
+rr::RValue<sw::SIMD::Float> FMA(
+		rr::RValue<sw::SIMD::Float> const &a,
+		rr::RValue<sw::SIMD::Float> const &b,
+		rr::RValue<sw::SIMD::Float> const &c);
+
+// Returns the exponent of the floating point number f.
+// Assumes IEEE 754
+rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);
+
+// Returns y if y < x; otherwise result is x.
+// If one operand is a NaN, the other operand is the result.
+// If both operands are NaN, the result is a NaN.
+rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
+
+// Returns y if y > x; otherwise result is x.
+// If one operand is a NaN, the other operand is the result.
+// If both operands are NaN, the result is a NaN.
+rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);
+
+// Returns the determinant of a 2x2 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+	rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
+
+// Returns the determinant of a 3x3 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+	rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+	rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
+
+// Returns the determinant of a 4x4 matrix.
+rr::RValue<sw::SIMD::Float> Determinant(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+	rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+	rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+	rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
+
+// Returns the inverse of a 2x2 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
+	rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);
+
+// Returns the inverse of a 3x3 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
+	rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
+	rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);
+
+// Returns the inverse of a 4x4 matrix.
+std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
+	rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
+	rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
+	rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
+	rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);
+
+////////////////////////////////////////////////////////////////////////////
+// Inline functions
+////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+{
+	using EL = typename Element<T>::type;
+
+	if (isStaticallyInBounds(sizeof(float), robustness))
 	{
-		using EL = typename Element<T>::type;
+		// All elements are statically known to be in-bounds.
+		// We can avoid costly conditional on masks.
 
-		if (isStaticallyInBounds(sizeof(float), robustness))
+		if (hasStaticSequentialOffsets(sizeof(float)))
 		{
-			// All elements are statically known to be in-bounds.
-			// We can avoid costly conditional on masks.
-
-			if (hasStaticSequentialOffsets(sizeof(float)))
-			{
-				// Offsets are sequential. Perform regular load.
-				return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
-			}
-			if (hasStaticEqualOffsets())
-			{
-				// Load one, replicate.
-				return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
-			}
+			// Offsets are sequential. Perform regular load.
+			return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
 		}
-		else
+		if (hasStaticEqualOffsets())
 		{
-			switch(robustness)
-			{
-			case OutOfBoundsBehavior::Nullify:
-			case OutOfBoundsBehavior::RobustBufferAccess:
-			case OutOfBoundsBehavior::UndefinedValue:
-				mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
-				break;
-			case OutOfBoundsBehavior::UndefinedBehavior:
-				// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
-				break;
-			}
-		}
-
-		auto offs = offsets();
-
-		if (!atomic && order == std::memory_order_relaxed)
-		{
-			if (hasStaticEqualOffsets())
-			{
-				// Load one, replicate.
-				// Be careful of the case where the post-bounds-check mask
-				// is 0, in which case we must not load.
-				T out = T(0);
-				If(AnyTrue(mask))
-				{
-					EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
-					out = T(el);
-				}
-				return out;
-			}
-
-			bool zeroMaskedLanes = true;
-			switch(robustness)
-			{
-			case OutOfBoundsBehavior::Nullify:
-			case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
-				zeroMaskedLanes = true;
-				break;
-			case OutOfBoundsBehavior::UndefinedValue:
-			case OutOfBoundsBehavior::UndefinedBehavior:
-				zeroMaskedLanes = false;
-				break;
-			}
-
-			if (hasStaticSequentialOffsets(sizeof(float)))
-			{
-				return rr::MaskedLoad(rr::Pointer<T>(base + staticOffsets[0]), mask, alignment, zeroMaskedLanes);
-			}
-
-			return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
-		}
-		else
-		{
-			T out;
-			auto anyLanesDisabled = AnyFalse(mask);
-			If(hasEqualOffsets() && !anyLanesDisabled)
-			{
-				// Load one, replicate.
-				auto offset = Extract(offs, 0);
-				out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
-			}
-			Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
-			{
-				// Load all elements in a single SIMD instruction.
-				auto offset = Extract(offs, 0);
-				out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
-			}
-			Else
-			{
-				// Divergent offsets or masked lanes.
-				out = T(0);
-				for (int i = 0; i < SIMD::Width; i++)
-				{
-					If(Extract(mask, i) != 0)
-					{
-						auto offset = Extract(offs, i);
-						auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
-						out = Insert(out, el, i);
-					}
-				}
-			}
-			return out;
+			// Load one, replicate.
+			return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
 		}
 	}
-
-	template<typename T>
-	inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+	else
 	{
-		using EL = typename Element<T>::type;
-		constexpr size_t alignment = sizeof(float);
-		auto offs = offsets();
-
 		switch(robustness)
 		{
 		case OutOfBoundsBehavior::Nullify:
-		case OutOfBoundsBehavior::RobustBufferAccess:  // TODO: Allows writing anywhere within bounds. Could be faster than masking.
-		case OutOfBoundsBehavior::UndefinedValue:  // Should not be used for store operations. Treat as robust buffer access.
-			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
+		case OutOfBoundsBehavior::RobustBufferAccess:
+		case OutOfBoundsBehavior::UndefinedValue:
+			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
 			break;
 		case OutOfBoundsBehavior::UndefinedBehavior:
 			// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
 			break;
 		}
+	}
 
-		if (!atomic && order == std::memory_order_relaxed)
+	auto offs = offsets();
+
+	if (!atomic && order == std::memory_order_relaxed)
+	{
+		if (hasStaticEqualOffsets())
 		{
-			if (hasStaticEqualOffsets())
+			// Load one, replicate.
+			// Be careful of the case where the post-bounds-check mask
+			// is 0, in which case we must not load.
+			T out = T(0);
+			If(AnyTrue(mask))
 			{
-				If (AnyTrue(mask))
+				EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
+				out = T(el);
+			}
+			return out;
+		}
+
+		bool zeroMaskedLanes = true;
+		switch(robustness)
+		{
+		case OutOfBoundsBehavior::Nullify:
+		case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
+			zeroMaskedLanes = true;
+			break;
+		case OutOfBoundsBehavior::UndefinedValue:
+		case OutOfBoundsBehavior::UndefinedBehavior:
+			zeroMaskedLanes = false;
+			break;
+		}
+
+		if (hasStaticSequentialOffsets(sizeof(float)))
+		{
+			return rr::MaskedLoad(rr::Pointer<T>(base + staticOffsets[0]), mask, alignment, zeroMaskedLanes);
+		}
+
+		return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
+	}
+	else
+	{
+		T out;
+		auto anyLanesDisabled = AnyFalse(mask);
+		If(hasEqualOffsets() && !anyLanesDisabled)
+		{
+			// Load one, replicate.
+			auto offset = Extract(offs, 0);
+			out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
+		}
+		Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+		{
+			// Load all elements in a single SIMD instruction.
+			auto offset = Extract(offs, 0);
+			out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
+		}
+		Else
+		{
+			// Divergent offsets or masked lanes.
+			out = T(0);
+			for (int i = 0; i < SIMD::Width; i++)
+			{
+				If(Extract(mask, i) != 0)
 				{
-					// All equal. One of these writes will win -- elect the winning lane.
-					auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-					auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
-					auto maskedVal = As<SIMD::Int>(val) & elect;
-					auto scalarVal = Extract(maskedVal, 0) |
-						Extract(maskedVal, 1) |
-						Extract(maskedVal, 2) |
-						Extract(maskedVal, 3);
-					*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
+					auto offset = Extract(offs, i);
+					auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
+					out = Insert(out, el, i);
 				}
 			}
-			else if (hasStaticSequentialOffsets(sizeof(float)))
+		}
+		return out;
+	}
+}
+
+template<typename T>
+inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+{  // Writes val to memory for the lanes enabled in mask, after the robustness handling below.
+	using EL = typename Element<T>::type;  // Element type used for the scalar and per-lane stores below.
+	constexpr size_t alignment = sizeof(float);  // Alignment passed to every store below.
+	auto offs = offsets();
+
+	switch(robustness)
+	{
+	case OutOfBoundsBehavior::Nullify:
+	case OutOfBoundsBehavior::RobustBufferAccess:  // TODO: Allows writing anywhere within bounds. Could be faster than masking.
+	case OutOfBoundsBehavior::UndefinedValue:  // Should not be used for store operations. Treat as robust buffer access.
+		mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
+		break;
+	case OutOfBoundsBehavior::UndefinedBehavior:
+		// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
+		break;
+	}
+
+	if (!atomic && order == std::memory_order_relaxed)  // Plain store: not atomic and relaxed ordering.
+	{
+		if (hasStaticEqualOffsets())  // A single scalar is written to base + staticOffsets[0].
+		{
+			If (AnyTrue(mask))
 			{
-				if (isStaticallyInBounds(sizeof(float), robustness))
-				{
-					// Pointer has no elements OOB, and the store is not atomic.
-					// Perform a RMW.
-					auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
-					auto prev = *p;
-					*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
-				}
-				else
-				{
-					rr::MaskedStore(rr::Pointer<T>(base + staticOffsets[0]), val, mask, alignment);
-				}
+				// All equal. One of these writes will win -- elect the winning lane.
+				auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+				auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));  // Clears from each lane the bits set in any lower lane of mask.
+				auto maskedVal = As<SIMD::Int>(val) & elect;
+				auto scalarVal = Extract(maskedVal, 0) |  // OR of all four lanes; non-elected lanes are expected to be zero.
+					Extract(maskedVal, 1) |
+					Extract(maskedVal, 2) |
+					Extract(maskedVal, 3);
+				*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
+			}
+		}
+		else if (hasStaticSequentialOffsets(sizeof(float)))  // Vector-wide store at base + staticOffsets[0].
+		{
+			if (isStaticallyInBounds(sizeof(float), robustness))
+			{
+				// Pointer has no elements OOB, and the store is not atomic.
+				// Perform a RMW.
+				auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
+				auto prev = *p;
+				*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
 			}
 			else
 			{
-				rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
+				rr::MaskedStore(rr::Pointer<T>(base + staticOffsets[0]), val, mask, alignment);
 			}
 		}
 		else
 		{
-			auto anyLanesDisabled = AnyFalse(mask);
-			If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+			rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);  // Neither equal nor sequential static offsets: scatter using the per-lane offsets.
+		}
+	}
+	else  // Atomic store, or a memory order other than relaxed.
+	{
+		auto anyLanesDisabled = AnyFalse(mask);
+		If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
+		{
+			// Store all elements in a single SIMD instruction.
+			auto offset = Extract(offs, 0);
+			rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
+		}
+		Else
+		{
+			// Divergent offsets or masked lanes.
+			for (int i = 0; i < SIMD::Width; i++)
 			{
-				// Store all elements in a single SIMD instruction.
-				auto offset = Extract(offs, 0);
-				rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
-			}
-			Else
-			{
-				// Divergent offsets or masked lanes.
-				for (int i = 0; i < SIMD::Width; i++)
+				If(Extract(mask, i) != 0)
 				{
-					If(Extract(mask, i) != 0)
-					{
-						auto offset = Extract(offs, i);
-						rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
-					}
+					auto offset = Extract(offs, i);
+					rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
 				}
 			}
 		}
 	}
+}
 
-	template<typename T>
-	inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
-	{
-		Store(T(val), robustness, mask, atomic, order);
-	}
+template<typename T>
+inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+{
+	Store(T(val), robustness, mask, atomic, order);  // Convert the RValue to a T and forward all other arguments unchanged.
+}
 
-	template <typename T>
-	inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
-	{
-		T v1 = mask;              // [x]    [y]    [z]    [w]
-		T v2 = v1.xzxz & v1.ywyw; // [xy]   [zw]   [xy]   [zw]
-		return v2.xxxx & v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
-	}
+template <typename T>
+inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
+{
+	T v1 = mask;              // [x]       [y]       [z]       [w]
+	T v2 = v1.xzxz & v1.ywyw; // [x&y]     [z&w]     [x&y]     [z&w]
+	return v2.xxxx & v2.yyyy; // [x&y&z&w] [x&y&z&w] [x&y&z&w] [x&y&z&w]
+}
 
-	template <typename T>
-	inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
-	{
-		T v1 = mask;              // [x]    [y]    [z]    [w]
-		T v2 = v1.xzxz | v1.ywyw; // [xy]   [zw]   [xy]   [zw]
-		return v2.xxxx | v2.yyyy; // [xyzw] [xyzw] [xyzw] [xyzw]
-	}
+template <typename T>
+inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
+{
+	T v1 = mask;              // [x]       [y]       [z]       [w]
+	T v2 = v1.xzxz | v1.ywyw; // [x|y]     [z|w]     [x|y]     [z|w]
+	return v2.xxxx | v2.yyyy; // [x|y|z|w] [x|y|z|w] [x|y|z|w] [x|y|z|w]
+}
 
 } // namespace sw
 
 #ifdef ENABLE_RR_PRINT
 namespace rr {
-	template <> struct PrintValue::Ty<sw::Vector4f>
+template <> struct PrintValue::Ty<sw::Vector4f>  // PrintValue specialization for sw::Vector4f.
+{
+	static std::string fmt(const sw::Vector4f& v)  // Builds "[x: .., y: .., z: .., w: ..]" from the per-component formats.
 	{
-		static std::string fmt(const sw::Vector4f& v)
-		{
-			return "[x: " + PrintValue::fmt(v.x) + ","
-			       " y: " + PrintValue::fmt(v.y) + ","
-			       " z: " + PrintValue::fmt(v.z) + ","
-			       " w: " + PrintValue::fmt(v.w) + "]";
-		}
+		return "[x: " + PrintValue::fmt(v.x) + ","
+		       " y: " + PrintValue::fmt(v.y) + ","
+		       " z: " + PrintValue::fmt(v.z) + ","
+		       " w: " + PrintValue::fmt(v.w) + "]";
+	}
 
-		static std::vector<rr::Value*> val(const sw::Vector4f& v)
-		{
-			return PrintValue::vals(v.x, v.y, v.z, v.w);
-		}
-	};
-	template <> struct PrintValue::Ty<sw::Vector4s>
+	static std::vector<rr::Value*> val(const sw::Vector4f& v)  // Returns PrintValue::vals over x, y, z, w, in that order.
 	{
-		static std::string fmt(const sw::Vector4s& v)
-		{
-			return "[x: " + PrintValue::fmt(v.x) + ","
-			       " y: " + PrintValue::fmt(v.y) + ","
-			       " z: " + PrintValue::fmt(v.z) + ","
-			       " w: " + PrintValue::fmt(v.w) + "]";
-		}
+		return PrintValue::vals(v.x, v.y, v.z, v.w);
+	}
+};
+template <> struct PrintValue::Ty<sw::Vector4s>  // PrintValue specialization for sw::Vector4s.
+{
+	static std::string fmt(const sw::Vector4s& v)  // Builds "[x: .., y: .., z: .., w: ..]" from the per-component formats.
+	{
+		return "[x: " + PrintValue::fmt(v.x) + ","
+		       " y: " + PrintValue::fmt(v.y) + ","
+		       " z: " + PrintValue::fmt(v.z) + ","
+		       " w: " + PrintValue::fmt(v.w) + "]";
+	}
 
-		static std::vector<rr::Value*> val(const sw::Vector4s& v)
-		{
-			return PrintValue::vals(v.x, v.y, v.z, v.w);
-		}
-	};
-}
+	static std::vector<rr::Value*> val(const sw::Vector4s& v)  // Returns PrintValue::vals over x, y, z, w, in that order.
+	{
+		return PrintValue::vals(v.x, v.y, v.z, v.w);
+	}
+};
+
+}  // namespace rr
 #endif // ENABLE_RR_PRINT
 
 #endif   // sw_ShaderCore_hpp