Do not indent C++ namespace contents This is a style change. Visual Studio defaults to indenting namespace contents, and this was adopted for a long time, but with the new Vulkan implementation this was abandoned. However the legacy code borrowed from the OpenGL ES implementation still used indentation so it was inconsistent. The justification for not indenting namespace contents is that namespaces are merely a way to avoid name clashes with other projects we don't control directly (and in rare cases internal subprojects when we want to reuse the same names). Hence the vast majority of files have a single namespace, and unlike indentation used for ease of discerning control flow blocks, class contents, or function contents, which can become highly nested, there is no such readability advantage to indenting namespace contents. This is also the Google style recommendation (though no justification or discussion is provided): https://google.github.io/styleguide/cppguide.html#Namespace_Formatting One reasonable counter-argument is consistency with other blocks of curly brackets, but considering that most namespaces span almost the entire file, it's a substantial waste of line length. Because there is no indentation, there's also no need to have the open and closing brackets line up as a visual aid, like we prefer for other uses of curly brackets. So we place the open bracket on the same line as the namespace keyword. A comment is added to the closing bracket to discern it from other closing brackets. It also makes it easier to find the end of anonymous namespaces which typically go at the top of the source file. This change is made separately from applying clang-format because diff tools mark all these unindented lines as changes and this makes it hard to review the smaller style changes made by clang-format. The OpenGL ES and Direct3D code is left untouched because it is in maintenance mode and in case of regressions we want easy 'blame' tool usage. 
Bug: b/144825072 Change-Id: Ie2925ebd697e1ffa7c4cbdc9a946531f11f4d934 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/39348 Presubmit-Ready: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Ben Clayton <bclayton@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com>

diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index e06601b..0b4bdab 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp

@@ -24,1435 +24,1462 @@
 
 #include <utility>
 
-namespace sw
+namespace sw {
+
+Blitter::Blitter() :
+	blitMutex(),
+	blitCache(1024),
+	cornerUpdateMutex(),
+	cornerUpdateCache(64) // We only need one of these per format
 {
-	Blitter::Blitter() :
-		blitMutex(),
-		blitCache(1024),
-		cornerUpdateMutex(),
-		cornerUpdateCache(64) // We only need one of these per format
+}
+
+Blitter::~Blitter()
+{
+}
+
+void Blitter::clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
+{
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
+	vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
+	if(dstFormat == VK_FORMAT_UNDEFINED)
 	{
+		return;
 	}
 
-	Blitter::~Blitter()
+	float *pPixel = static_cast<float *>(pixel);
+	if (viewFormat.isUnsignedNormalized())
 	{
+		pPixel[0] = sw::clamp(pPixel[0], 0.0f, 1.0f);
+		pPixel[1] = sw::clamp(pPixel[1], 0.0f, 1.0f);
+		pPixel[2] = sw::clamp(pPixel[2], 0.0f, 1.0f);
+		pPixel[3] = sw::clamp(pPixel[3], 0.0f, 1.0f);
+	}
+	else if (viewFormat.isSignedNormalized())
+	{
+		pPixel[0] = sw::clamp(pPixel[0], -1.0f, 1.0f);
+		pPixel[1] = sw::clamp(pPixel[1], -1.0f, 1.0f);
+		pPixel[2] = sw::clamp(pPixel[2], -1.0f, 1.0f);
+		pPixel[3] = sw::clamp(pPixel[3], -1.0f, 1.0f);
 	}
 
-	void Blitter::clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
+	if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
 	{
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
-		vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
-		if(dstFormat == VK_FORMAT_UNDEFINED)
+		return;
+	}
+
+	State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), Options{ 0xF });
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
+	{
+		return;
+	}
+
+	VkImageSubresourceLayers subresLayers =
+	{
+		subresourceRange.aspectMask,
+		subresourceRange.baseMipLevel,
+		subresourceRange.baseArrayLayer,
+		1
+	};
+
+	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
+	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
+
+	VkRect2D area = { { 0, 0 }, { 0, 0 } };
+	if(renderArea)
+	{
+		ASSERT(subresourceRange.levelCount == 1);
+		area = *renderArea;
+	}
+
+	for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
+	{
+		VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
+		if(!renderArea)
 		{
-			return;
+			area.extent.width = extent.width;
+			area.extent.height = extent.height;
 		}
 
-		float *pPixel = static_cast<float *>(pixel);
-		if (viewFormat.isUnsignedNormalized())
+		BlitData data =
 		{
-			pPixel[0] = sw::clamp(pPixel[0], 0.0f, 1.0f);
-			pPixel[1] = sw::clamp(pPixel[1], 0.0f, 1.0f);
-			pPixel[2] = sw::clamp(pPixel[2], 0.0f, 1.0f);
-			pPixel[3] = sw::clamp(pPixel[3], 0.0f, 1.0f);
-		}
-		else if (viewFormat.isSignedNormalized())
-		{
-			pPixel[0] = sw::clamp(pPixel[0], -1.0f, 1.0f);
-			pPixel[1] = sw::clamp(pPixel[1], -1.0f, 1.0f);
-			pPixel[2] = sw::clamp(pPixel[2], -1.0f, 1.0f);
-			pPixel[3] = sw::clamp(pPixel[3], -1.0f, 1.0f);
-		}
+			pixel, nullptr, // source, dest
 
-		if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
-		{
-			return;
-		}
+			format.bytes(),                                       // sPitchB
+			dest->rowPitchBytes(aspect, subresLayers.mipLevel),   // dPitchB
+			0,                                                    // sSliceB (unused in clear operations)
+			dest->slicePitchBytes(aspect, subresLayers.mipLevel), // dSliceB
 
-		State state(format, dstFormat, 1, dest->getSampleCountFlagBits(), Options{ 0xF });
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
+			0.5f, 0.5f, 0.0f, 0.0f, // x0, y0, w, h
 
-		VkImageSubresourceLayers subresLayers =
-		{
-			subresourceRange.aspectMask,
-			subresourceRange.baseMipLevel,
-			subresourceRange.baseArrayLayer,
-			1
+			area.offset.y, static_cast<int>(area.offset.y + area.extent.height), // y0d, y1d
+			area.offset.x, static_cast<int>(area.offset.x + area.extent.width),  // x0d, x1d
+
+			0, 0, // sWidth, sHeight
 		};
 
-		uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
-		uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
-
-		VkRect2D area = { { 0, 0 }, { 0, 0 } };
-		if(renderArea)
+		if (renderArea && dest->is3DSlice())
 		{
-			ASSERT(subresourceRange.levelCount == 1);
-			area = *renderArea;
-		}
-
-		for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
-		{
-			VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
-			if(!renderArea)
+			// Reinterpret layers as depth slices
+			subresLayers.baseArrayLayer = 0;
+			subresLayers.layerCount = 1;
+			for (uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
 			{
-				area.extent.width = extent.width;
-				area.extent.height = extent.height;
-			}
-
-			BlitData data =
-			{
-				pixel, nullptr, // source, dest
-
-				format.bytes(),                                       // sPitchB
-				dest->rowPitchBytes(aspect, subresLayers.mipLevel),   // dPitchB
-				0,                                                    // sSliceB (unused in clear operations)
-				dest->slicePitchBytes(aspect, subresLayers.mipLevel), // dSliceB
-
-				0.5f, 0.5f, 0.0f, 0.0f, // x0, y0, w, h
-
-				area.offset.y, static_cast<int>(area.offset.y + area.extent.height), // y0d, y1d
-				area.offset.x, static_cast<int>(area.offset.x + area.extent.width),  // x0d, x1d
-
-				0, 0, // sWidth, sHeight
-			};
-
-			if (renderArea && dest->is3DSlice())
-			{
-				// Reinterpret layers as depth slices
-				subresLayers.baseArrayLayer = 0;
-				subresLayers.layerCount = 1;
-				for (uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
-				{
-					data.dest = dest->getTexelPointer({0, 0, static_cast<int32_t>(depth)}, subresLayers);
-					blitRoutine(&data);
-				}
-			}
-			else
-			{
-				for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
-				{
-					for(uint32_t depth = 0; depth < extent.depth; depth++)
-					{
-						data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subresLayers);
-
-						blitRoutine(&data);
-					}
-				}
-			}
-		}
-	}
-
-	bool Blitter::fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
-	{
-		if(format != VK_FORMAT_R32G32B32A32_SFLOAT)
-		{
-			return false;
-		}
-
-		float *color = (float*)pixel;
-		float r = color[0];
-		float g = color[1];
-		float b = color[2];
-		float a = color[3];
-
-		uint32_t packed;
-
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
-		switch(viewFormat)
-		{
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			packed = ((uint16_t)(31 * b + 0.5f) << 0) |
-			         ((uint16_t)(63 * g + 0.5f) << 5) |
-			         ((uint16_t)(31 * r + 0.5f) << 11);
-			break;
-		case VK_FORMAT_B5G6R5_UNORM_PACK16:
-			packed = ((uint16_t)(31 * r + 0.5f) << 0) |
-			         ((uint16_t)(63 * g + 0.5f) << 5) |
-			         ((uint16_t)(31 * b + 0.5f) << 11);
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_UNORM:
-			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
-			         ((uint32_t)(255 * b + 0.5f) << 16) |
-			         ((uint32_t)(255 * g + 0.5f) << 8) |
-			         ((uint32_t)(255 * r + 0.5f) << 0);
-			break;
-		case VK_FORMAT_B8G8R8A8_UNORM:
-			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
-			         ((uint32_t)(255 * r + 0.5f) << 16) |
-			         ((uint32_t)(255 * g + 0.5f) << 8) |
-			         ((uint32_t)(255 * b + 0.5f) << 0);
-			break;
-		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			packed = R11G11B10F(color);
-			break;
-		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			packed = RGB9E5(color);
-			break;
-		default:
-			return false;
-		}
-
-		VkImageSubresourceLayers subresLayers =
-		{
-			subresourceRange.aspectMask,
-			subresourceRange.baseMipLevel,
-			subresourceRange.baseArrayLayer,
-			1
-		};
-		uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
-		uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
-
-		VkRect2D area = { { 0, 0 }, { 0, 0 } };
-		if(renderArea)
-		{
-			ASSERT(subresourceRange.levelCount == 1);
-			area = *renderArea;
-		}
-
-		for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
-		{
-			int rowPitchBytes = dest->rowPitchBytes(aspect, subresLayers.mipLevel);
-			int slicePitchBytes = dest->slicePitchBytes(aspect, subresLayers.mipLevel);
-			VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
-			if(!renderArea)
-			{
-				area.extent.width = extent.width;
-				area.extent.height = extent.height;
-			}
-			if(dest->is3DSlice())
-			{
-				extent.depth = 1; // The 3D image is instead interpreted as a 2D image with layers
-			}
-
-			for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
-			{
-				for(uint32_t depth = 0; depth < extent.depth; depth++)
-				{
-					uint8_t *slice = (uint8_t*)dest->getTexelPointer(
-						{ area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subresLayers);
-
-					for(int j = 0; j < dest->getSampleCountFlagBits(); j++)
-					{
-						uint8_t *d = slice;
-
-						switch(viewFormat.bytes())
-						{
-						case 2:
-							for(uint32_t i = 0; i < area.extent.height; i++)
-							{
-								ASSERT(d < dest->end());
-								sw::clear((uint16_t*)d, static_cast<uint16_t>(packed), area.extent.width);
-								d += rowPitchBytes;
-							}
-							break;
-						case 4:
-							for(uint32_t i = 0; i < area.extent.height; i++)
-							{
-								ASSERT(d < dest->end());
-								sw::clear((uint32_t*)d, packed, area.extent.width);
-								d += rowPitchBytes;
-							}
-							break;
-						default:
-							assert(false);
-						}
-
-						slice += slicePitchBytes;
-					}
-				}
-			}
-		}
-
-		return true;
-	}
-
-	Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
-	{
-		Float4 c(0.0f, 0.0f, 0.0f, 1.0f);
-
-		switch(state.sourceFormat)
-		{
-		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
-			c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
-			c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
-			c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
-			c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
-			break;
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_SNORM:
-			c.x = Float(Int(*Pointer<SByte>(element)));
-			c.w = float(0x7F);
-			break;
-		case VK_FORMAT_R8_UNORM:
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8_SRGB:
-			c.x = Float(Int(*Pointer<Byte>(element)));
-			c.w = float(0xFF);
-			break;
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_SNORM:
-			c.x = Float(Int(*Pointer<Short>(element)));
-			c.w = float(0x7FFF);
-			break;
-		case VK_FORMAT_R16_UNORM:
-		case VK_FORMAT_R16_UINT:
-			c.x = Float(Int(*Pointer<UShort>(element)));
-			c.w = float(0xFFFF);
-			break;
-		case VK_FORMAT_R32_SINT:
-			c.x = Float(*Pointer<Int>(element));
-			c.w = float(0x7FFFFFFF);
-			break;
-		case VK_FORMAT_R32_UINT:
-			c.x = Float(*Pointer<UInt>(element));
-			c.w = float(0xFFFFFFFF);
-			break;
-		case VK_FORMAT_B8G8R8A8_SRGB:
-		case VK_FORMAT_B8G8R8A8_UNORM:
-			c = Float4(*Pointer<Byte4>(element)).zyxw;
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_SNORM:
-			c = Float4(*Pointer<SByte4>(element));
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-			c = Float4(*Pointer<Byte4>(element));
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-			c = Float4(*Pointer<Short4>(element));
-			break;
-		case VK_FORMAT_R16G16B16A16_UNORM:
-		case VK_FORMAT_R16G16B16A16_UINT:
-			c = Float4(*Pointer<UShort4>(element));
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-			c = Float4(*Pointer<Int4>(element));
-			break;
-		case VK_FORMAT_R32G32B32A32_UINT:
-			c = Float4(*Pointer<UInt4>(element));
-			break;
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_SNORM:
-			c.x = Float(Int(*Pointer<SByte>(element + 0)));
-			c.y = Float(Int(*Pointer<SByte>(element + 1)));
-			c.w = float(0x7F);
-			break;
-		case VK_FORMAT_R8G8_UNORM:
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8G8_SRGB:
-			c.x = Float(Int(*Pointer<Byte>(element + 0)));
-			c.y = Float(Int(*Pointer<Byte>(element + 1)));
-			c.w = float(0xFF);
-			break;
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_SNORM:
-			c.x = Float(Int(*Pointer<Short>(element + 0)));
-			c.y = Float(Int(*Pointer<Short>(element + 2)));
-			c.w = float(0x7FFF);
-			break;
-		case VK_FORMAT_R16G16_UNORM:
-		case VK_FORMAT_R16G16_UINT:
-			c.x = Float(Int(*Pointer<UShort>(element + 0)));
-			c.y = Float(Int(*Pointer<UShort>(element + 2)));
-			c.w = float(0xFFFF);
-			break;
-		case VK_FORMAT_R32G32_SINT:
-			c.x = Float(*Pointer<Int>(element + 0));
-			c.y = Float(*Pointer<Int>(element + 4));
-			c.w = float(0x7FFFFFFF);
-			break;
-		case VK_FORMAT_R32G32_UINT:
-			c.x = Float(*Pointer<UInt>(element + 0));
-			c.y = Float(*Pointer<UInt>(element + 4));
-			c.w = float(0xFFFFFFFF);
-			break;
-		case VK_FORMAT_R32G32B32A32_SFLOAT:
-			c = *Pointer<Float4>(element);
-			break;
-		case VK_FORMAT_R32G32_SFLOAT:
-			c.x = *Pointer<Float>(element + 0);
-			c.y = *Pointer<Float>(element + 4);
-			break;
-		case VK_FORMAT_R32_SFLOAT:
-			c.x = *Pointer<Float>(element);
-			break;
-		case VK_FORMAT_R16G16B16A16_SFLOAT:
-			c.w = Float(*Pointer<Half>(element + 6));
-		case VK_FORMAT_R16G16B16_SFLOAT:
-			c.z = Float(*Pointer<Half>(element + 4));
-		case VK_FORMAT_R16G16_SFLOAT:
-			c.y = Float(*Pointer<Half>(element + 2));
-		case VK_FORMAT_R16_SFLOAT:
-			c.x = Float(*Pointer<Half>(element));
-			break;
-		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
-			// Since the Half float format also has a 5 bit exponent, we can convert these formats to half by
-			// copy/pasting the bits so the the exponent bits and top mantissa bits are aligned to the half format.
-			// In this case, we have:
-			//              B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R
-			// 1st Short:                                  |xxxxxxxxxx---------------------|
-			// 2nd Short:                  |xxxx---------------------xxxxxx|
-			// 3rd Short: |--------------------xxxxxxxxxxxx|
-			// These memory reads overlap, but each of them contains an entire channel, so we can read this without
-			// any int -> short conversion.
-			c.x = Float(As<Half>((*Pointer<UShort>(element + 0) & UShort(0x07FF)) << UShort(4)));
-			c.y = Float(As<Half>((*Pointer<UShort>(element + 1) & UShort(0x3FF8)) << UShort(1)));
-			c.z = Float(As<Half>((*Pointer<UShort>(element + 2) & UShort(0xFFC0)) >> UShort(1)));
-			break;
-		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			// This type contains a common 5 bit exponent (E) and a 9 bit the mantissa for R, G and B.
-			c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));         // R's mantissa (bits 0-8)
-			c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);  // G's mantissa (bits 9-17)
-			c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18); // B's mantissa (bits 18-26)
-			c *= Float4(
-				// 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
-				Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
-				// Since the 9 bit mantissa values currently stored in RGB were converted straight
-				// from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
-				// are (1 << 9) times too high.
-				// Also, the exponent has 5 bits and we compute the exponent bias of floating point
-				// formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
-				// Exponent bias (15) + number of mantissa bits per component (9) = 24
-				Float(1.0f / (1 << 24)));
-			c.w = 1.0f;
-			break;
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
-			c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
-			c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
-			break;
-		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
-			c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
-			c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
-			c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
-			break;
-		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
-			c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
-			c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
-			c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
-			break;
-		case VK_FORMAT_D16_UNORM:
-			c.x = Float(Int((*Pointer<UShort>(element))));
-			break;
-		case VK_FORMAT_X8_D24_UNORM_PACK32:
-			c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
-			break;
-		case VK_FORMAT_D32_SFLOAT:
-			c.x = *Pointer<Float>(element);
-			break;
-		case VK_FORMAT_S8_UINT:
-			c.x = Float(Int(*Pointer<Byte>(element)));
-			break;
-		default:
-			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
-		}
-
-		return c;
-	}
-
-	void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
-	{
-		bool writeR = state.writeRed;
-		bool writeG = state.writeGreen;
-		bool writeB = state.writeBlue;
-		bool writeA = state.writeAlpha;
-		bool writeRGBA = writeR && writeG && writeB && writeA;
-
-		switch(state.destFormat)
-		{
-		case VK_FORMAT_R4G4_UNORM_PACK8:
-			if(writeR | writeG)
-			{
-				if(!writeR)
-				{
-					*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
-				                              (*Pointer<Byte>(element) & Byte(0xF0));
-				}
-				else if(!writeG)
-				{
-					*Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
-				                              (Byte(RoundInt(Float(c.x))) << Byte(4));
-				}
-				else
-				{
-					*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
-				                              (Byte(RoundInt(Float(c.x))) << Byte(4));
-				}
-			}
-			break;
-		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
-			if(writeR || writeG || writeB || writeA)
-			{
-				*Pointer<UShort>(element) = (writeR ? ((UShort(RoundInt(Float(c.x))) & UShort(0xF)) << UShort(12)) :
-				                                      (*Pointer<UShort>(element) & UShort(0x000F))) |
-				                            (writeG ? ((UShort(RoundInt(Float(c.y))) & UShort(0xF)) << UShort(8)) :
-				                                      (*Pointer<UShort>(element) & UShort(0x00F0))) |
-				                            (writeB ? ((UShort(RoundInt(Float(c.z))) & UShort(0xF)) << UShort(4)) :
-			                                          (*Pointer<UShort>(element) & UShort(0x0F00))) |
-			                                (writeA ? (UShort(RoundInt(Float(c.w))) & UShort(0xF)) :
-			                                          (*Pointer<UShort>(element) & UShort(0xF000)));
-			}
-			break;
-		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) & Int(0xF)) |
-				                            UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
-				                            UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
-				                            UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12);
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x000F : 0x0000) |
-				                      (writeR ? 0x00F0 : 0x0000) |
-				                      (writeG ? 0x0F00 : 0x0000) |
-				                      (writeB ? 0xF000 : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            ((UShort(RoundInt(Float(c.w)) & Int(0xF)) |
-				                              UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
-				                              UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
-				                              UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12)) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_B8G8R8A8_SRGB:
-		case VK_FORMAT_B8G8R8A8_UNORM:
-			if(writeRGBA)
-			{
-				Short4 c0 = RoundShort4(c.zyxw);
-				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
-			}
-			else
-			{
-				if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
-				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-				if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
-				if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_B8G8R8_SNORM:
-			if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
-			if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_B8G8R8_UNORM:
-		case VK_FORMAT_B8G8R8_SRGB:
-			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_UNORM:
-		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-		case VK_FORMAT_R8G8B8A8_SRGB:
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_R8G8B8A8_USCALED:
-		case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
-			if(writeRGBA)
-			{
-				Short4 c0 = RoundShort4(c);
-				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
-			}
-			else
-			{
-				if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32A32_SFLOAT:
-			if(writeRGBA)
-			{
-				*Pointer<Float4>(element) = c;
-			}
-			else
-			{
-				if(writeR) { *Pointer<Float>(element) = c.x; }
-				if(writeG) { *Pointer<Float>(element + 4) = c.y; }
-				if(writeB) { *Pointer<Float>(element + 8) = c.z; }
-				if(writeA) { *Pointer<Float>(element + 12) = c.w; }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_SFLOAT:
-			if(writeR) { *Pointer<Float>(element) = c.x; }
-			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
-			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
-			break;
-		case VK_FORMAT_R32G32_SFLOAT:
-			if(writeR && writeG)
-			{
-				*Pointer<Float2>(element) = Float2(c);
-			}
-			else
-			{
-				if(writeR) { *Pointer<Float>(element) = c.x; }
-				if(writeG) { *Pointer<Float>(element + 4) = c.y; }
-			}
-			break;
-		case VK_FORMAT_R32_SFLOAT:
-			if(writeR) { *Pointer<Float>(element) = c.x; }
-			break;
-		case VK_FORMAT_R16G16B16A16_SFLOAT:
-			if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
-		case VK_FORMAT_R16G16B16_SFLOAT:
-			if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
-		case VK_FORMAT_R16G16_SFLOAT:
-			if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
-		case VK_FORMAT_R16_SFLOAT:
-			if(writeR) { *Pointer<Half>(element) = Half(c.x); }
-			break;
-		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			{
-				// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
-				// Since the 16-bit half-precision float format also has a 5 bit exponent, we can extract these minifloats from them.
-
-				// FIXME(b/138944025): Handle negative values, Inf, and NaN.
-				// FIXME(b/138944025): Perform rounding before truncating the mantissa.
-				UInt r = (UInt(As<UShort>(Half(c.x))) & 0x00007FF0) >> 4;
-				UInt g = (UInt(As<UShort>(Half(c.y))) & 0x00007FF0) << 7;
-				UInt b = (UInt(As<UShort>(Half(c.z))) & 0x00007FE0) << 17;
-
-				UInt rgb = r | g | b;
-
-				UInt old = *Pointer<UInt>(element);
-
-				unsigned int mask = (writeR ? 0x000007FF : 0) |
-				                    (writeG ? 0x003FF800 : 0) |
-				                    (writeB ? 0xFFC00000 : 0);
-
-				*Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
-			}
-			break;
-		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			{
-				ASSERT(writeRGBA);  // Can't sensibly write just part of this format.
-
-				// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
-
-				constexpr int N = 9;       // number of mantissa bits per component
-				constexpr int B = 15;      // exponent bias
-				constexpr int E_max = 31;  // maximum possible biased exponent value
-
-				// Maximum representable value.
-				constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
-
-				// Clamp components to valid range. NaN becomes 0.
-				Float red_c =   Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
-				Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
-				Float blue_c =  Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
-
-				// We're reducing the mantissa to 9 bits, so we must round up if the next
-				// bit is 1. In other words add 0.5 to the new mantissa's position and
-				// allow overflow into the exponent so we can scale correctly.
-				constexpr int half = 1 << (23 - N);
-				Float red_r = As<Float>(As<Int>(red_c) + half);
-				Float green_r = As<Float>(As<Int>(green_c) + half);
-				Float blue_r = As<Float>(As<Int>(blue_c) + half);
-
-				// The largest component determines the shared exponent. It can't be lower
-				// than 0 (after bias subtraction) so also limit to the mimimum representable.
-				constexpr float min_s = 0.5f / (1 << B);
-				Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
-
-				// Obtain the reciprocal of the shared exponent by inverting the bits,
-				// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
-				// format has an implicit leading 1, but this shared component format does not.
-				Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
-
-				UInt R9 = RoundInt(red_c * scale);
-				UInt G9 = UInt(RoundInt(green_c * scale));
-				UInt B9 = UInt(RoundInt(blue_c * scale));
-				UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
-
-				UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
-
-				*Pointer<UInt>(element) = E5B9G9R9;
-			}
-			break;
-		case VK_FORMAT_B8G8R8A8_SNORM:
-			if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
-			if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
-		case VK_FORMAT_R8G8B8A8_SNORM:
-		case VK_FORMAT_R8G8B8A8_SSCALED:
-		case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
-		case VK_FORMAT_R8G8B8_SINT:
-		case VK_FORMAT_R8G8B8_SNORM:
-		case VK_FORMAT_R8G8B8_SSCALED:
-			if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_SNORM:
-		case VK_FORMAT_R8G8_SSCALED:
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_SNORM:
-		case VK_FORMAT_R8_SSCALED:
-			if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R8G8B8_UINT:
-		case VK_FORMAT_R8G8B8_UNORM:
-		case VK_FORMAT_R8G8B8_USCALED:
-		case VK_FORMAT_R8G8B8_SRGB:
-			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8G8_UNORM:
-		case VK_FORMAT_R8G8_USCALED:
-		case VK_FORMAT_R8G8_SRGB:
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8_UNORM:
-		case VK_FORMAT_R8_USCALED:
-		case VK_FORMAT_R8_SRGB:
-			if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16A16_SNORM:
-		case VK_FORMAT_R16G16B16A16_SSCALED:
-			if(writeRGBA)
-			{
-				*Pointer<Short4>(element) = Short4(RoundInt(c));
-			}
-			else
-			{
-				if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R16G16B16_SINT:
-		case VK_FORMAT_R16G16B16_SNORM:
-		case VK_FORMAT_R16G16B16_SSCALED:
-			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
-			if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
-			break;
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_SNORM:
-		case VK_FORMAT_R16G16_SSCALED:
-			if(writeR && writeG)
-			{
-				*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
-			}
-			else
-			{
-				if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
-			}
-			break;
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_SNORM:
-		case VK_FORMAT_R16_SSCALED:
-			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-		case VK_FORMAT_R16G16B16A16_UNORM:
-		case VK_FORMAT_R16G16B16A16_USCALED:
-			if(writeRGBA)
-			{
-				*Pointer<UShort4>(element) = UShort4(RoundInt(c));
-			}
-			else
-			{
-				if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R16G16B16_UINT:
-		case VK_FORMAT_R16G16B16_UNORM:
-		case VK_FORMAT_R16G16B16_USCALED:
-			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
-			if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
-			break;
-		case VK_FORMAT_R16G16_UINT:
-		case VK_FORMAT_R16G16_UNORM:
-		case VK_FORMAT_R16G16_USCALED:
-			if(writeR && writeG)
-			{
-				*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
-			}
-			else
-			{
-				if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
-			}
-			break;
-		case VK_FORMAT_R16_UINT:
-		case VK_FORMAT_R16_UNORM:
-		case VK_FORMAT_R16_USCALED:
-			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-			if(writeRGBA)
-			{
-				*Pointer<Int4>(element) = RoundInt(c);
-			}
-			else
-			{
-				if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
-				if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
-				if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
-				if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_SINT:
-			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
-		case VK_FORMAT_R32G32_SINT:
-			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
-		case VK_FORMAT_R32_SINT:
-			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
-			break;
-		case VK_FORMAT_R32G32B32A32_UINT:
-			if(writeRGBA)
-			{
-				*Pointer<UInt4>(element) = UInt4(RoundInt(c));
-			}
-			else
-			{
-				if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
-				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
-				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
-				if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_UINT:
-			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
-		case VK_FORMAT_R32G32_UINT:
-			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
-		case VK_FORMAT_R32_UINT:
-			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
-			break;
-		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			if(writeR && writeG && writeB)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
-				                                  (RoundInt(Float(c.y)) << Int(5)) |
-				                                  (RoundInt(Float(c.x)) << Int(11)));
-			}
-			else
-			{
-				unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.z)) |
-				                                   (RoundInt(Float(c.y)) << Int(5)) |
-				                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
-				                                  (RoundInt(Float(c.z)) << Int(1)) |
-				                                  (RoundInt(Float(c.y)) << Int(6)) |
-				                                  (RoundInt(Float(c.x)) << Int(11)));
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x8000 : 0x0000) |
-				                      (writeR ? 0x7C00 : 0x0000) |
-				                      (writeG ? 0x03E0 : 0x0000) |
-				                      (writeB ? 0x001F : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.w)) |
-				                                   (RoundInt(Float(c.z)) << Int(1)) |
-				                                   (RoundInt(Float(c.y)) << Int(6)) |
-				                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
-				                                  (RoundInt(Float(c.x)) << Int(1)) |
-				                                  (RoundInt(Float(c.y)) << Int(6)) |
-				                                  (RoundInt(Float(c.z)) << Int(11)));
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x8000 : 0x0000) |
-				                      (writeR ? 0x7C00 : 0x0000) |
-				                      (writeG ? 0x03E0 : 0x0000) |
-				                      (writeB ? 0x001F : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.w)) |
-				                                   (RoundInt(Float(c.x)) << Int(1)) |
-				                                   (RoundInt(Float(c.y)) << Int(6)) |
-				                                   (RoundInt(Float(c.z)) << Int(11))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
-			if(writeRGBA)
-			{
-				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
-				                                  (RoundInt(Float(c.y)) << Int(5)) |
-				                                  (RoundInt(Float(c.x)) << Int(10)) |
-				                                  (RoundInt(Float(c.w)) << Int(15)));
-			}
-			else
-			{
-				unsigned short mask = (writeA ? 0x8000 : 0x0000) |
-				                      (writeR ? 0x7C00 : 0x0000) |
-				                      (writeG ? 0x03E0 : 0x0000) |
-				                      (writeB ? 0x001F : 0x0000);
-				unsigned short unmask = ~mask;
-				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
-				                            (UShort(RoundInt(Float(c.z)) |
-				                                   (RoundInt(Float(c.y)) << Int(5)) |
-				                                   (RoundInt(Float(c.x)) << Int(10)) |
-				                                   (RoundInt(Float(c.w)) << Int(15))) & UShort(mask));
-			}
-			break;
-		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-		case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) |
-				                              (RoundInt(Float(c.y)) << 10) |
-				                              (RoundInt(Float(c.z)) << 20) |
-				                              (RoundInt(Float(c.w)) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeB ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeR ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                            (UInt(RoundInt(Float(c.x)) |
-				                                 (RoundInt(Float(c.y)) << 10) |
-				                                 (RoundInt(Float(c.z)) << 20) |
-				                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
-		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
-		case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) = UInt(RoundInt(Float(c.z)) |
-				                              (RoundInt(Float(c.y)) << 10) |
-				                              (RoundInt(Float(c.x)) << 20) |
-				                              (RoundInt(Float(c.w)) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeR ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeB ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-				                            (UInt(RoundInt(Float(c.z)) |
-				                                 (RoundInt(Float(c.y)) << 10) |
-				                                 (RoundInt(Float(c.x)) << 20) |
-				                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_D16_UNORM:
-			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
-			break;
-		case VK_FORMAT_X8_D24_UNORM_PACK32:
-			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
-			break;
-		case VK_FORMAT_D32_SFLOAT:
-			*Pointer<Float>(element) = c.x;
-			break;
-		case VK_FORMAT_S8_UINT:
-			*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
-			break;
-		default:
-			UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
-			break;
-		}
-	}
-
-	Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
-	{
-		Int4 c(0, 0, 0, 1);
-
-		switch(state.sourceFormat)
-		{
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-			c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
-			c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
-		case VK_FORMAT_R8G8_SINT:
-			c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
-		case VK_FORMAT_R8_SINT:
-			c = Insert(c, Int(*Pointer<SByte>(element)), 0);
-			break;
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
-			c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-			c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
-			c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
-		case VK_FORMAT_R8G8_UINT:
-			c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_S8_UINT:
-			c = Insert(c, Int(*Pointer<Byte>(element)), 0);
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-			c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
-			c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
-		case VK_FORMAT_R16G16_SINT:
-			c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
-		case VK_FORMAT_R16_SINT:
-			c = Insert(c, Int(*Pointer<Short>(element)), 0);
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-			c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
-			c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
-		case VK_FORMAT_R16G16_UINT:
-			c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
-		case VK_FORMAT_R16_UINT:
-			c = Insert(c, Int(*Pointer<UShort>(element)), 0);
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-		case VK_FORMAT_R32G32B32A32_UINT:
-			c = *Pointer<Int4>(element);
-			break;
-		case VK_FORMAT_R32G32_SINT:
-		case VK_FORMAT_R32G32_UINT:
-			c = Insert(c, *Pointer<Int>(element + 4), 1);
-		case VK_FORMAT_R32_SINT:
-		case VK_FORMAT_R32_UINT:
-			c = Insert(c, *Pointer<Int>(element), 0);
-			break;
-		default:
-			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
-		}
-
-		return c;
-	}
-
-	void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
-	{
-		bool writeR = state.writeRed;
-		bool writeG = state.writeGreen;
-		bool writeB = state.writeBlue;
-		bool writeA = state.writeAlpha;
-		bool writeRGBA = writeR && writeG && writeB && writeA;
-
-		switch(state.destFormat)
-		{
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-			c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_R8G8B8_UINT:
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8G8B8A8_USCALED:
-		case VK_FORMAT_R8G8B8_USCALED:
-		case VK_FORMAT_R8G8_USCALED:
-		case VK_FORMAT_R8_USCALED:
-		case VK_FORMAT_S8_UINT:
-			c = Min(As<UInt4>(c), UInt4(0xFF));
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-		case VK_FORMAT_R16G16B16_UINT:
-		case VK_FORMAT_R16G16_UINT:
-		case VK_FORMAT_R16_UINT:
-		case VK_FORMAT_R16G16B16A16_USCALED:
-		case VK_FORMAT_R16G16B16_USCALED:
-		case VK_FORMAT_R16G16_USCALED:
-		case VK_FORMAT_R16_USCALED:
-			c = Min(As<UInt4>(c), UInt4(0xFFFF));
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8G8B8A8_SSCALED:
-		case VK_FORMAT_R8G8B8_SSCALED:
-		case VK_FORMAT_R8G8_SSCALED:
-		case VK_FORMAT_R8_SSCALED:
-			c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16_SINT:
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16G16B16A16_SSCALED:
-		case VK_FORMAT_R16G16B16_SSCALED:
-		case VK_FORMAT_R16G16_SSCALED:
-		case VK_FORMAT_R16_SSCALED:
-			c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
-			break;
-		default:
-			break;
-		}
-
-		switch(state.destFormat)
-		{
-		case VK_FORMAT_B8G8R8A8_SINT:
-		case VK_FORMAT_B8G8R8A8_SSCALED:
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
-		case VK_FORMAT_B8G8R8_SINT:
-		case VK_FORMAT_B8G8R8_SSCALED:
-			if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
-			if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_SINT:
-		case VK_FORMAT_R8G8B8A8_SSCALED:
-		case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
-			if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
-		case VK_FORMAT_R8G8B8_SINT:
-		case VK_FORMAT_R8G8B8_SSCALED:
-			if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
-		case VK_FORMAT_R8G8_SINT:
-		case VK_FORMAT_R8G8_SSCALED:
-			if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
-		case VK_FORMAT_R8_SINT:
-		case VK_FORMAT_R8_SSCALED:
-			if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
-		case VK_FORMAT_A2B10G10R10_SINT_PACK32:
-		case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
-		case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) =
-					UInt((Extract(c, 0)) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeB ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeR ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-					(UInt(Extract(c, 0) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
-		case VK_FORMAT_A2R10G10B10_SINT_PACK32:
-		case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
-		case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
-			if(writeRGBA)
-			{
-				*Pointer<UInt>(element) =
-					UInt((Extract(c, 2)) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30));
-			}
-			else
-			{
-				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
-				                    (writeR ? 0x3FF00000 : 0x0000) |
-				                    (writeG ? 0x000FFC00 : 0x0000) |
-				                    (writeB ? 0x000003FF : 0x0000);
-				unsigned int unmask = ~mask;
-				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
-					(UInt(Extract(c, 2) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
-			}
-			break;
-		case VK_FORMAT_B8G8R8A8_UINT:
-		case VK_FORMAT_B8G8R8A8_USCALED:
-			if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
-		case VK_FORMAT_B8G8R8_UINT:
-		case VK_FORMAT_B8G8R8_USCALED:
-		case VK_FORMAT_B8G8R8_SRGB:
-			if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
-			if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		case VK_FORMAT_R8G8B8A8_UINT:
-		case VK_FORMAT_R8G8B8A8_USCALED:
-		case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
-			if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
-		case VK_FORMAT_R8G8B8_UINT:
-		case VK_FORMAT_R8G8B8_USCALED:
-			if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
-		case VK_FORMAT_R8G8_UINT:
-		case VK_FORMAT_R8G8_USCALED:
-			if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
-		case VK_FORMAT_R8_UINT:
-		case VK_FORMAT_R8_USCALED:
-		case VK_FORMAT_S8_UINT:
-			if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_R16G16B16A16_SINT:
-		case VK_FORMAT_R16G16B16A16_SSCALED:
-			if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
-		case VK_FORMAT_R16G16B16_SINT:
-		case VK_FORMAT_R16G16B16_SSCALED:
-			if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
-		case VK_FORMAT_R16G16_SINT:
-		case VK_FORMAT_R16G16_SSCALED:
-			if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
-		case VK_FORMAT_R16_SINT:
-		case VK_FORMAT_R16_SSCALED:
-			if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_R16G16B16A16_UINT:
-		case VK_FORMAT_R16G16B16A16_USCALED:
-			if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
-		case VK_FORMAT_R16G16B16_UINT:
-		case VK_FORMAT_R16G16B16_USCALED:
-			if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
-		case VK_FORMAT_R16G16_UINT:
-		case VK_FORMAT_R16G16_USCALED:
-			if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
-		case VK_FORMAT_R16_UINT:
-		case VK_FORMAT_R16_USCALED:
-			if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
-			break;
-		case VK_FORMAT_R32G32B32A32_SINT:
-			if(writeRGBA)
-			{
-				*Pointer<Int4>(element) = c;
-			}
-			else
-			{
-				if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-				if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
-				if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
-				if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_SINT:
-			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
-			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
-			break;
-		case VK_FORMAT_R32G32_SINT:
-			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
-			break;
-		case VK_FORMAT_R32_SINT:
-			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
-			break;
-		case VK_FORMAT_R32G32B32A32_UINT:
-			if(writeRGBA)
-			{
-				*Pointer<UInt4>(element) = As<UInt4>(c);
-			}
-			else
-			{
-				if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
-				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
-				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
-				if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
-			}
-			break;
-		case VK_FORMAT_R32G32B32_UINT:
-			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
-		case VK_FORMAT_R32G32_UINT:
-			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
-		case VK_FORMAT_R32_UINT:
-			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
-			break;
-		default:
-			UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
-		}
-	}
-
-	void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
-	{
-		float4 scale{}, unscale{};
-
-		if(state.clearOperation &&
-		   state.sourceFormat.isNonNormalizedInteger() &&
-		   !state.destFormat.isNonNormalizedInteger())
-		{
-			// If we're clearing a buffer from an int or uint color into a normalized color,
-			// then the whole range of the int or uint color must be scaled between 0 and 1.
-			switch(state.sourceFormat)
-			{
-			case VK_FORMAT_R32G32B32A32_SINT:
-				unscale = replicate(static_cast<float>(0x7FFFFFFF));
-				break;
-			case VK_FORMAT_R32G32B32A32_UINT:
-				unscale = replicate(static_cast<float>(0xFFFFFFFF));
-				break;
-			default:
-				UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
+				data.dest = dest->getTexelPointer({0, 0, static_cast<int32_t>(depth)}, subresLayers);
+				blitRoutine(&data);
 			}
 		}
 		else
 		{
-			unscale = state.sourceFormat.getScale();
-		}
-
-		scale = state.destFormat.getScale();
-
-		bool srcSRGB = state.sourceFormat.isSRGBformat();
-		bool dstSRGB = state.destFormat.isSRGBformat();
-
-		if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))   // One of the formats is sRGB encoded.
-		{
-			value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) : // Unapply scale
-			                     Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w); // Apply unscale
-			value = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : LinearToSRGB(value);
-			value *= Float4(scale.x, scale.y, scale.z, scale.w); // Apply scale
-		}
-		else if(unscale != scale)
-		{
-			value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
-		}
-
-		if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
-		{
-			value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));
-
-			value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
-			                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
-			                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
-			                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
-		}
-	}
-
-	Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
-	{
-		return y * pitchB + x * bytes;
-	}
-
-	Float4 Blitter::LinearToSRGB(Float4 &c)
-	{
-		Float4 lc = Min(c, Float4(0.0031308f)) * Float4(12.92f);
-		Float4 ec = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f);
-
-		Float4 s = c;
-		s.xyz = Max(lc, ec);
-
-		return s;
-	}
-
-	Float4 Blitter::sRGBtoLinear(Float4 &c)
-	{
-		Float4 lc = c * Float4(1.0f / 12.92f);
-		Float4 ec = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f));
-
-		Int4 linear = CmpLT(c, Float4(0.04045f));
-
-		Float4 s = c;
-		s.xyz = As<Float4>((linear & As<Int4>(lc)) | (~linear & As<Int4>(ec)));   // TODO: IfThenElse()
-
-		return s;
-	}
-
-	Blitter::BlitRoutineType Blitter::generate(const State &state)
-	{
-		BlitFunction function;
-		{
-			Pointer<Byte> blit(function.Arg<0>());
-
-			Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,source));
-			Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,dest));
-			Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData,sPitchB));
-			Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData,dPitchB));
-
-			Float x0 = *Pointer<Float>(blit + OFFSET(BlitData,x0));
-			Float y0 = *Pointer<Float>(blit + OFFSET(BlitData,y0));
-			Float w = *Pointer<Float>(blit + OFFSET(BlitData,w));
-			Float h = *Pointer<Float>(blit + OFFSET(BlitData,h));
-
-			Int x0d = *Pointer<Int>(blit + OFFSET(BlitData,x0d));
-			Int x1d = *Pointer<Int>(blit + OFFSET(BlitData,x1d));
-			Int y0d = *Pointer<Int>(blit + OFFSET(BlitData,y0d));
-			Int y1d = *Pointer<Int>(blit + OFFSET(BlitData,y1d));
-
-			Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData,sWidth));
-			Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData,sHeight));
-
-			bool intSrc = state.sourceFormat.isNonNormalizedInteger();
-			bool intDst = state.destFormat.isNonNormalizedInteger();
-			bool intBoth = intSrc && intDst;
-			int srcBytes = state.sourceFormat.bytes();
-			int dstBytes = state.destFormat.bytes();
-
-			bool hasConstantColorI = false;
-			Int4 constantColorI;
-			bool hasConstantColorF = false;
-			Float4 constantColorF;
-			if(state.clearOperation)
+			for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
 			{
-				if(intBoth) // Integer types
+				for(uint32_t depth = 0; depth < extent.depth; depth++)
 				{
-					constantColorI = readInt4(source, state);
-					hasConstantColorI = true;
+					data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subresLayers);
+
+					blitRoutine(&data);
+				}
+			}
+		}
+	}
+}
+
+bool Blitter::fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea)
+{
+	if(format != VK_FORMAT_R32G32B32A32_SFLOAT)
+	{
+		return false;
+	}
+
+	float *color = (float*)pixel;
+	float r = color[0];
+	float g = color[1];
+	float b = color[2];
+	float a = color[3];
+
+	uint32_t packed;
+
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
+	switch(viewFormat)
+	{
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		packed = ((uint16_t)(31 * b + 0.5f) << 0) |
+			        ((uint16_t)(63 * g + 0.5f) << 5) |
+			        ((uint16_t)(31 * r + 0.5f) << 11);
+		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		packed = ((uint16_t)(31 * r + 0.5f) << 0) |
+			        ((uint16_t)(63 * g + 0.5f) << 5) |
+			        ((uint16_t)(31 * b + 0.5f) << 11);
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+		packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+		         ((uint32_t)(255 * b + 0.5f) << 16) |
+		         ((uint32_t)(255 * g + 0.5f) << 8) |
+		         ((uint32_t)(255 * r + 0.5f) << 0);
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+		packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+		         ((uint32_t)(255 * r + 0.5f) << 16) |
+		         ((uint32_t)(255 * g + 0.5f) << 8) |
+		         ((uint32_t)(255 * b + 0.5f) << 0);
+		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		packed = R11G11B10F(color);
+		break;
+	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+		packed = RGB9E5(color);
+		break;
+	default:
+		return false;
+	}
+
+	VkImageSubresourceLayers subresLayers =
+	{
+		subresourceRange.aspectMask,
+		subresourceRange.baseMipLevel,
+		subresourceRange.baseArrayLayer,
+		1
+	};
+	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
+	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
+
+	VkRect2D area = { { 0, 0 }, { 0, 0 } };
+	if(renderArea)
+	{
+		ASSERT(subresourceRange.levelCount == 1);
+		area = *renderArea;
+	}
+
+	for(; subresLayers.mipLevel <= lastMipLevel; subresLayers.mipLevel++)
+	{
+		int rowPitchBytes = dest->rowPitchBytes(aspect, subresLayers.mipLevel);
+		int slicePitchBytes = dest->slicePitchBytes(aspect, subresLayers.mipLevel);
+		VkExtent3D extent = dest->getMipLevelExtent(aspect, subresLayers.mipLevel);
+		if(!renderArea)
+		{
+			area.extent.width = extent.width;
+			area.extent.height = extent.height;
+		}
+		if(dest->is3DSlice())
+		{
+			extent.depth = 1; // The 3D image is instead interpreted as a 2D image with layers
+		}
+
+		for(subresLayers.baseArrayLayer = subresourceRange.baseArrayLayer; subresLayers.baseArrayLayer <= lastLayer; subresLayers.baseArrayLayer++)
+		{
+			for(uint32_t depth = 0; depth < extent.depth; depth++)
+			{
+				uint8_t *slice = (uint8_t*)dest->getTexelPointer(
+					{ area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subresLayers);
+
+				for(int j = 0; j < dest->getSampleCountFlagBits(); j++)
+				{
+					uint8_t *d = slice;
+
+					switch(viewFormat.bytes())
+					{
+					case 2:
+						for(uint32_t i = 0; i < area.extent.height; i++)
+						{
+							ASSERT(d < dest->end());
+							sw::clear((uint16_t*)d, static_cast<uint16_t>(packed), area.extent.width);
+							d += rowPitchBytes;
+						}
+						break;
+					case 4:
+						for(uint32_t i = 0; i < area.extent.height; i++)
+						{
+							ASSERT(d < dest->end());
+							sw::clear((uint32_t*)d, packed, area.extent.width);
+							d += rowPitchBytes;
+						}
+						break;
+					default:
+						assert(false);
+					}
+
+					slice += slicePitchBytes;
+				}
+			}
+		}
+	}
+
+	return true;
+}
+
+// Loads one source element and returns its channels converted to float.
+//
+// Integer channels are converted as-is; no normalization is applied here.
+// Channels that state.sourceFormat does not provide keep the (0, 0, 0, 1)
+// defaults, except that the one- and two-channel integer cases overwrite w
+// with the largest value their channel type can hold.
+// A format without a case is reported through UNSUPPORTED().
+Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
+{
+	Float4 color(0.0f, 0.0f, 0.0f, 1.0f);
+
+	switch(state.sourceFormat)
+	{
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		// Byte 0 holds A (low nibble) and R (high nibble),
+		// byte 1 holds G (low nibble) and B (high nibble).
+		color.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
+		color.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
+		color.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
+		color.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
+		break;
+	case VK_FORMAT_R8_SNORM:
+	case VK_FORMAT_R8_SINT:
+		color.x = Float(Int(*Pointer<SByte>(element)));
+		color.w = static_cast<float>(0x7F);
+		break;
+	case VK_FORMAT_R8_SRGB:
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8_UNORM:
+		color.x = Float(Int(*Pointer<Byte>(element)));
+		color.w = static_cast<float>(0xFF);
+		break;
+	case VK_FORMAT_R16_SNORM:
+	case VK_FORMAT_R16_SINT:
+		color.x = Float(Int(*Pointer<Short>(element)));
+		color.w = static_cast<float>(0x7FFF);
+		break;
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_UNORM:
+		color.x = Float(Int(*Pointer<UShort>(element)));
+		color.w = static_cast<float>(0xFFFF);
+		break;
+	case VK_FORMAT_R32_SINT:
+		color.x = Float(*Pointer<Int>(element));
+		color.w = static_cast<float>(0x7FFFFFFF);
+		break;
+	case VK_FORMAT_R32_UINT:
+		color.x = Float(*Pointer<UInt>(element));
+		color.w = static_cast<float>(0xFFFFFFFF);
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		// Stored as B, G, R, A; swizzle into R, G, B, A order.
+		color = Float4(*Pointer<Byte4>(element)).zyxw;
+		break;
+	case VK_FORMAT_R8G8B8A8_SNORM:
+	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+		color = Float4(*Pointer<SByte4>(element));
+		break;
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+		color = Float4(*Pointer<Byte4>(element));
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+		color = Float4(*Pointer<Short4>(element));
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_UNORM:
+		color = Float4(*Pointer<UShort4>(element));
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+		color = Float4(*Pointer<Int4>(element));
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+		color = Float4(*Pointer<UInt4>(element));
+		break;
+	case VK_FORMAT_R8G8_SNORM:
+	case VK_FORMAT_R8G8_SINT:
+		color.x = Float(Int(*Pointer<SByte>(element + 0)));
+		color.y = Float(Int(*Pointer<SByte>(element + 1)));
+		color.w = static_cast<float>(0x7F);
+		break;
+	case VK_FORMAT_R8G8_SRGB:
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8G8_UNORM:
+		color.x = Float(Int(*Pointer<Byte>(element + 0)));
+		color.y = Float(Int(*Pointer<Byte>(element + 1)));
+		color.w = static_cast<float>(0xFF);
+		break;
+	case VK_FORMAT_R16G16_SNORM:
+	case VK_FORMAT_R16G16_SINT:
+		color.x = Float(Int(*Pointer<Short>(element + 0)));
+		color.y = Float(Int(*Pointer<Short>(element + 2)));
+		color.w = static_cast<float>(0x7FFF);
+		break;
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_UNORM:
+		color.x = Float(Int(*Pointer<UShort>(element + 0)));
+		color.y = Float(Int(*Pointer<UShort>(element + 2)));
+		color.w = static_cast<float>(0xFFFF);
+		break;
+	case VK_FORMAT_R32G32_SINT:
+		color.x = Float(*Pointer<Int>(element + 0));
+		color.y = Float(*Pointer<Int>(element + 4));
+		color.w = static_cast<float>(0x7FFFFFFF);
+		break;
+	case VK_FORMAT_R32G32_UINT:
+		color.x = Float(*Pointer<UInt>(element + 0));
+		color.y = Float(*Pointer<UInt>(element + 4));
+		color.w = static_cast<float>(0xFFFFFFFF);
+		break;
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+		color = *Pointer<Float4>(element);
+		break;
+	case VK_FORMAT_R32G32_SFLOAT:
+		color.x = *Pointer<Float>(element + 0);
+		color.y = *Pointer<Float>(element + 4);
+		break;
+	case VK_FORMAT_R32_SFLOAT:
+		color.x = *Pointer<Float>(element);
+		break;
+	// The half-float cases cascade: each one reads its last channel and then
+	// falls through to the case with one channel fewer.
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		color.w = Float(*Pointer<Half>(element + 6));
+		// fallthrough
+	case VK_FORMAT_R16G16B16_SFLOAT:
+		color.z = Float(*Pointer<Half>(element + 4));
+		// fallthrough
+	case VK_FORMAT_R16G16_SFLOAT:
+		color.y = Float(*Pointer<Half>(element + 2));
+		// fallthrough
+	case VK_FORMAT_R16_SFLOAT:
+		color.x = Float(*Pointer<Half>(element));
+		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		// The 10 and 11 bit channels are unsigned floats with a 5 bit exponent and
+		// a 5 or 6 bit mantissa. Half floats also have a 5 bit exponent, so each
+		// channel becomes a half by masking it out and shifting it until its
+		// exponent and top mantissa bits line up with the half layout.
+		// Three overlapping 16-bit reads are used, each containing one whole channel:
+		//              B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R
+		// 1st Short:                                  |xxxxxxxxxx---------------------|
+		// 2nd Short:                  |xxxx---------------------xxxxxx|
+		// 3rd Short: |--------------------xxxxxxxxxxxx|
+		// Because every read holds a complete channel, no int -> short conversion
+		// is required.
+		color.x = Float(As<Half>((*Pointer<UShort>(element + 0) & UShort(0x07FF)) << UShort(4)));
+		color.y = Float(As<Half>((*Pointer<UShort>(element + 1) & UShort(0x3FF8)) << UShort(1)));
+		color.z = Float(As<Half>((*Pointer<UShort>(element + 2) & UShort(0xFFC0)) >> UShort(1)));
+		break;
+	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+		// One 5 bit exponent (E) is shared by the 9 bit mantissas of R, G and B.
+		color.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));          // R mantissa, bits 0-8
+		color.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);   // G mantissa, bits 9-17
+		color.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18);  // B mantissa, bits 18-26
+		// The scale factor is 2^E * 2^-24:
+		//  - 2^E comes from bits 27-31, read as an unsigned integer.
+		//  - The mantissas above were converted as plain integers, so they are
+		//    2^9 too large compared to a [0, 1) fraction.
+		//  - The exponent bias of a 5 bit exponent is 2^(5-1) - 1 = 15.
+		//  Bias (15) + mantissa bits (9) = 24.
+		color *= Float4(
+			Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
+			Float(1.0f / (1 << 24)));
+		// The multiplication also scaled w; restore it.
+		color.w = 1.0f;
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		// R: bits 11-15, G: bits 5-10, B: bits 0-4.
+		color.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
+		color.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
+		color.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		// A: bit 15, R: bits 10-14, G: bits 5-9, B: bits 0-4.
+		color.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
+		color.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
+		color.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
+		color.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
+		break;
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		// R: bits 0-9, G: bits 10-19, B: bits 20-29, A: bits 30-31.
+		color.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
+		color.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
+		color.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
+		color.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
+		break;
+	case VK_FORMAT_D16_UNORM:
+		color.x = Float(Int((*Pointer<UShort>(element))));
+		break;
+	case VK_FORMAT_X8_D24_UNORM_PACK32:
+		// The depth value is taken from the upper 24 bits.
+		color.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
+		break;
+	case VK_FORMAT_D32_SFLOAT:
+		color.x = *Pointer<Float>(element);
+		break;
+	case VK_FORMAT_S8_UINT:
+		color.x = Float(Int(*Pointer<Byte>(element)));
+		break;
+	default:
+		UNSUPPORTED("Blitter source format %d", static_cast<int>(state.sourceFormat));
+		break;
+	}
+
+	return color;
+}
+
+// Stores the color 'c' into the destination element at 'element', packed
+// according to state.destFormat.
+//
+// Only channels enabled through state.writeRed/Green/Blue/Alpha are modified.
+// For byte/short/int-per-channel formats a disabled channel is simply not
+// stored; for bit-packed formats the bits of disabled channels are re-read
+// from memory and preserved. The depth and stencil formats ignore the channel
+// flags and always store c.x.
+//
+// Integer-valued formats are only rounded (RoundInt) and packed here; nothing
+// in this function scales or clamps the channels.
+// NOTE(review): this presumably relies on the caller having already scaled and
+// clamped 'c' to the destination range - confirm against the callers.
+//
+// A format without a case is reported through UNSUPPORTED().
+void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
+{
+	bool writeR = state.writeRed;
+	bool writeG = state.writeGreen;
+	bool writeB = state.writeBlue;
+	bool writeA = state.writeAlpha;
+	bool writeRGBA = writeR && writeG && writeB && writeA;
+
+	switch(state.destFormat)
+	{
+	case VK_FORMAT_R4G4_UNORM_PACK8:
+		// R: bits 4-7, G: bits 0-3.
+		if(writeR || writeG)
+		{
+			if(!writeR)
+			{
+				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
+				                          (*Pointer<Byte>(element) & Byte(0xF0));
+			}
+			else if(!writeG)
+			{
+				*Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
+				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
+			}
+			else
+			{
+				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
+				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
+			}
+		}
+		break;
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		// R: bits 12-15, G: bits 8-11, B: bits 4-7, A: bits 0-3.
+		// A channel that is not written keeps the bits of its own nibble.
+		if(writeR || writeG || writeB || writeA)
+		{
+			*Pointer<UShort>(element) = (writeR ? ((UShort(RoundInt(Float(c.x))) & UShort(0xF)) << UShort(12)) :
+			                                      (*Pointer<UShort>(element) & UShort(0xF000))) |
+			                            (writeG ? ((UShort(RoundInt(Float(c.y))) & UShort(0xF)) << UShort(8)) :
+			                                      (*Pointer<UShort>(element) & UShort(0x0F00))) |
+			                            (writeB ? ((UShort(RoundInt(Float(c.z))) & UShort(0xF)) << UShort(4)) :
+			                                      (*Pointer<UShort>(element) & UShort(0x00F0))) |
+			                            (writeA ? (UShort(RoundInt(Float(c.w))) & UShort(0xF)) :
+			                                      (*Pointer<UShort>(element) & UShort(0x000F)));
+		}
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		// B: bits 12-15, G: bits 8-11, R: bits 4-7, A: bits 0-3.
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) & Int(0xF)) |
+			                            UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
+			                            UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
+			                            UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12);
+		}
+		else
+		{
+			unsigned short mask = (writeA ? 0x000F : 0x0000) |
+			                      (writeR ? 0x00F0 : 0x0000) |
+			                      (writeG ? 0x0F00 : 0x0000) |
+			                      (writeB ? 0xF000 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            ((UShort(RoundInt(Float(c.w)) & Int(0xF)) |
+			                              UShort((RoundInt(Float(c.x)) & Int(0xF)) << 4) |
+			                              UShort((RoundInt(Float(c.y)) & Int(0xF)) << 8) |
+			                              UShort((RoundInt(Float(c.z)) & Int(0xF)) << 12)) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_SRGB:
+	case VK_FORMAT_B8G8R8A8_UNORM:
+		if(writeRGBA)
+		{
+			Short4 c0 = RoundShort4(c.zyxw);
+			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+		}
+		else
+		{
+			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_B8G8R8_SNORM:
+		if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_B8G8R8_UNORM:
+	case VK_FORMAT_B8G8R8_SRGB:
+		if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+		if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
+		if(writeRGBA)
+		{
+			Short4 c0 = RoundShort4(c);
+			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+		}
+		else
+		{
+			if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32A32_SFLOAT:
+		if(writeRGBA)
+		{
+			*Pointer<Float4>(element) = c;
+		}
+		else
+		{
+			if(writeR) { *Pointer<Float>(element) = c.x; }
+			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+			if(writeA) { *Pointer<Float>(element + 12) = c.w; }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_SFLOAT:
+		if(writeR) { *Pointer<Float>(element) = c.x; }
+		if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+		if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+		break;
+	case VK_FORMAT_R32G32_SFLOAT:
+		if(writeR && writeG)
+		{
+			*Pointer<Float2>(element) = Float2(c);
+		}
+		else
+		{
+			if(writeR) { *Pointer<Float>(element) = c.x; }
+			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+		}
+		break;
+	case VK_FORMAT_R32_SFLOAT:
+		if(writeR) { *Pointer<Float>(element) = c.x; }
+		break;
+	// The half-float cases cascade: each one stores its last channel and then
+	// falls through to the case with one channel fewer.
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
+		// fallthrough
+	case VK_FORMAT_R16G16B16_SFLOAT:
+		if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
+		// fallthrough
+	case VK_FORMAT_R16G16_SFLOAT:
+		if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
+		// fallthrough
+	case VK_FORMAT_R16_SFLOAT:
+		if(writeR) { *Pointer<Half>(element) = Half(c.x); }
+		break;
+	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		{
+			// 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
+			// Since the 16-bit half-precision float format also has a 5 bit exponent, we can extract these minifloats from them.
+
+			// FIXME(b/138944025): Handle negative values, Inf, and NaN.
+			// FIXME(b/138944025): Perform rounding before truncating the mantissa.
+			// Drop the sign bit and the low mantissa bits of each half, then move
+			// the channel to its position: R bits 0-10, G bits 11-21, B bits 22-31.
+			UInt r = (UInt(As<UShort>(Half(c.x))) & 0x00007FF0) >> 4;
+			UInt g = (UInt(As<UShort>(Half(c.y))) & 0x00007FF0) << 7;
+			UInt b = (UInt(As<UShort>(Half(c.z))) & 0x00007FE0) << 17;
+
+			UInt rgb = r | g | b;
+
+			UInt old = *Pointer<UInt>(element);
+
+			unsigned int mask = (writeR ? 0x000007FF : 0) |
+			                    (writeG ? 0x003FF800 : 0) |
+			                    (writeB ? 0xFFC00000 : 0);
+
+			*Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
+		}
+		break;
+	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
+		{
+			ASSERT(writeRGBA);  // Can't sensibly write just part of this format.
+
+			// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
+
+			constexpr int N = 9;       // number of mantissa bits per component
+			constexpr int B = 15;      // exponent bias
+			constexpr int E_max = 31;  // maximum possible biased exponent value
+
+			// Maximum representable value.
+			constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
+
+			// Clamp components to valid range. NaN becomes 0.
+			Float red_c =   Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
+			Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
+			Float blue_c =  Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
+
+			// We're reducing the mantissa to 9 bits, so we must round up if the next
+			// bit is 1. In other words add 0.5 to the new mantissa's position and
+			// allow overflow into the exponent so we can scale correctly.
+			constexpr int half = 1 << (23 - N);
+			Float red_r = As<Float>(As<Int>(red_c) + half);
+			Float green_r = As<Float>(As<Int>(green_c) + half);
+			Float blue_r = As<Float>(As<Int>(blue_c) + half);
+
+			// The largest component determines the shared exponent. It can't be lower
+			// than 0 (after bias subtraction) so also limit to the minimum representable.
+			constexpr float min_s = 0.5f / (1 << B);
+			Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
+
+			// Obtain the reciprocal of the shared exponent by inverting the bits,
+			// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
+			// format has an implicit leading 1, but this shared component format does not.
+			Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
+
+			UInt R9 = RoundInt(red_c * scale);
+			UInt G9 = UInt(RoundInt(green_c * scale));
+			UInt B9 = UInt(RoundInt(blue_c * scale));
+			UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
+
+			UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
+
+			*Pointer<UInt>(element) = E5B9G9R9;
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_SNORM:
+		if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
+		break;
+	// Signed 8-bit cases cascade from four channels down to one.
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
+	case VK_FORMAT_R8G8B8A8_SNORM:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
+	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
+		// fallthrough
+	case VK_FORMAT_R8G8B8_SINT:
+	case VK_FORMAT_R8G8B8_SNORM:
+	case VK_FORMAT_R8G8B8_SSCALED:
+		if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
+		// fallthrough
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8G8_SNORM:
+	case VK_FORMAT_R8G8_SSCALED:
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+		// fallthrough
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8_SNORM:
+	case VK_FORMAT_R8_SSCALED:
+		if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
+		break;
+	// Unsigned 8-bit cases cascade from three channels down to one.
+	case VK_FORMAT_R8G8B8_UINT:
+	case VK_FORMAT_R8G8B8_UNORM:
+	case VK_FORMAT_R8G8B8_USCALED:
+	case VK_FORMAT_R8G8B8_SRGB:
+		if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+		// fallthrough
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R8G8_USCALED:
+	case VK_FORMAT_R8G8_SRGB:
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+		// fallthrough
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_R8_SRGB:
+		if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16A16_SNORM:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
+		if(writeRGBA)
+		{
+			*Pointer<Short4>(element) = Short4(RoundInt(c));
+		}
+		else
+		{
+			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_R16G16B16_SINT:
+	case VK_FORMAT_R16G16B16_SNORM:
+	case VK_FORMAT_R16G16B16_SSCALED:
+		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+		if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+		if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+		break;
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16_SNORM:
+	case VK_FORMAT_R16G16_SSCALED:
+		if(writeR && writeG)
+		{
+			*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
+		}
+		else
+		{
+			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+		}
+		break;
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16_SNORM:
+	case VK_FORMAT_R16_SSCALED:
+		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_UNORM:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+		if(writeRGBA)
+		{
+			*Pointer<UShort4>(element) = UShort4(RoundInt(c));
+		}
+		else
+		{
+			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
+		}
+		break;
+	case VK_FORMAT_R16G16B16_UINT:
+	case VK_FORMAT_R16G16B16_UNORM:
+	case VK_FORMAT_R16G16B16_USCALED:
+		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+		if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+		if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+		break;
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_UNORM:
+	case VK_FORMAT_R16G16_USCALED:
+		if(writeR && writeG)
+		{
+			*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
+		}
+		else
+		{
+			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+		}
+		break;
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_UNORM:
+	case VK_FORMAT_R16_USCALED:
+		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+		if(writeRGBA)
+		{
+			*Pointer<Int4>(element) = RoundInt(c);
+		}
+		else
+		{
+			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+			if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
+		}
+		break;
+	// Signed 32-bit cases cascade from three channels down to one.
+	case VK_FORMAT_R32G32B32_SINT:
+		if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+		// fallthrough
+	case VK_FORMAT_R32G32_SINT:
+		if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+		// fallthrough
+	case VK_FORMAT_R32_SINT:
+		if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+		if(writeRGBA)
+		{
+			*Pointer<UInt4>(element) = UInt4(RoundInt(c));
+		}
+		else
+		{
+			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
+		}
+		break;
+	// Unsigned 32-bit cases cascade from three channels down to one.
+	case VK_FORMAT_R32G32B32_UINT:
+		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+		// fallthrough
+	case VK_FORMAT_R32G32_UINT:
+		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+		// fallthrough
+	case VK_FORMAT_R32_UINT:
+		if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		// R: bits 11-15, G: bits 5-10, B: bits 0-4.
+		if(writeR && writeG && writeB)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
+			                                  (RoundInt(Float(c.y)) << Int(5)) |
+			                                  (RoundInt(Float(c.x)) << Int(11)));
+		}
+		else
+		{
+			unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.z)) |
+			                                   (RoundInt(Float(c.y)) << Int(5)) |
+			                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		// R: bits 11-15, G: bits 6-10, B: bits 1-5, A: bit 0.
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
+			                                  (RoundInt(Float(c.z)) << Int(1)) |
+			                                  (RoundInt(Float(c.y)) << Int(6)) |
+			                                  (RoundInt(Float(c.x)) << Int(11)));
+		}
+		else
+		{
+			// The masks must match the shifts used for packing below.
+			unsigned short mask = (writeA ? 0x0001 : 0x0000) |
+			                      (writeB ? 0x003E : 0x0000) |
+			                      (writeG ? 0x07C0 : 0x0000) |
+			                      (writeR ? 0xF800 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.w)) |
+			                                   (RoundInt(Float(c.z)) << Int(1)) |
+			                                   (RoundInt(Float(c.y)) << Int(6)) |
+			                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		// B: bits 11-15, G: bits 6-10, R: bits 1-5, A: bit 0.
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.w)) |
+			                                  (RoundInt(Float(c.x)) << Int(1)) |
+			                                  (RoundInt(Float(c.y)) << Int(6)) |
+			                                  (RoundInt(Float(c.z)) << Int(11)));
+		}
+		else
+		{
+			// The masks must match the shifts used for packing below.
+			unsigned short mask = (writeA ? 0x0001 : 0x0000) |
+			                      (writeR ? 0x003E : 0x0000) |
+			                      (writeG ? 0x07C0 : 0x0000) |
+			                      (writeB ? 0xF800 : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.w)) |
+			                                   (RoundInt(Float(c.x)) << Int(1)) |
+			                                   (RoundInt(Float(c.y)) << Int(6)) |
+			                                   (RoundInt(Float(c.z)) << Int(11))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		// A: bit 15, R: bits 10-14, G: bits 5-9, B: bits 0-4.
+		if(writeRGBA)
+		{
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
+			                                  (RoundInt(Float(c.y)) << Int(5)) |
+			                                  (RoundInt(Float(c.x)) << Int(10)) |
+			                                  (RoundInt(Float(c.w)) << Int(15)));
+		}
+		else
+		{
+			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
+			                      (writeR ? 0x7C00 : 0x0000) |
+			                      (writeG ? 0x03E0 : 0x0000) |
+			                      (writeB ? 0x001F : 0x0000);
+			unsigned short unmask = ~mask;
+			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+			                            (UShort(RoundInt(Float(c.z)) |
+			                                   (RoundInt(Float(c.y)) << Int(5)) |
+			                                   (RoundInt(Float(c.x)) << Int(10)) |
+			                                   (RoundInt(Float(c.w)) << Int(15))) & UShort(mask));
+		}
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
+		// A: bits 30-31, B: bits 20-29, G: bits 10-19, R: bits 0-9.
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) |
+			                              (RoundInt(Float(c.y)) << 10) |
+			                              (RoundInt(Float(c.z)) << 20) |
+			                              (RoundInt(Float(c.w)) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeB ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeR ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+			                            (UInt(RoundInt(Float(c.x)) |
+			                                 (RoundInt(Float(c.y)) << 10) |
+			                                 (RoundInt(Float(c.z)) << 20) |
+			                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
+	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
+		// A: bits 30-31, R: bits 20-29, G: bits 10-19, B: bits 0-9.
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.z)) |
+			                              (RoundInt(Float(c.y)) << 10) |
+			                              (RoundInt(Float(c.x)) << 20) |
+			                              (RoundInt(Float(c.w)) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeR ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeB ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+			                            (UInt(RoundInt(Float(c.z)) |
+			                                 (RoundInt(Float(c.y)) << 10) |
+			                                 (RoundInt(Float(c.x)) << 20) |
+			                                 (RoundInt(Float(c.w)) << 30)) & UInt(mask));
+		}
+		break;
+	// Depth and stencil formats store c.x regardless of the channel write flags.
+	case VK_FORMAT_D16_UNORM:
+		*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
+		break;
+	case VK_FORMAT_X8_D24_UNORM_PACK32:
+		// The depth value goes into the upper 24 bits; the low 8 bits are written as zero.
+		*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
+		break;
+	case VK_FORMAT_D32_SFLOAT:
+		*Pointer<Float>(element) = c.x;
+		break;
+	case VK_FORMAT_S8_UINT:
+		*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
+		break;
+	default:
+		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
+		break;
+	}
+}
+
+// Loads one source element of an integer format and returns its channels as
+// 32-bit integers, without any scaling.
+//
+// Signed channel types are sign-extended and unsigned ones zero-extended by
+// going through SByte/Short or Byte/UShort respectively. Channels that
+// state.sourceFormat does not provide keep the (0, 0, 0, 1) defaults.
+// A format without a case is reported through UNSUPPORTED().
+Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
+{
+	Int4 texel(0, 0, 0, 1);
+
+	// Several groups below cascade: the widest format reads its upper channels
+	// and then falls through to the narrower formats for the remaining ones.
+	switch(state.sourceFormat)
+	{
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+		texel = Insert(texel, Int(*Pointer<SByte>(element + 3)), 3);
+		texel = Insert(texel, Int(*Pointer<SByte>(element + 2)), 2);
+		// fallthrough
+	case VK_FORMAT_R8G8_SINT:
+		texel = Insert(texel, Int(*Pointer<SByte>(element + 1)), 1);
+		// fallthrough
+	case VK_FORMAT_R8_SINT:
+		texel = Insert(texel, Int(*Pointer<SByte>(element)), 0);
+		break;
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		// R: bits 0-9, G: bits 10-19, B: bits 20-29, A: bits 30-31.
+		texel = Insert(texel, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
+		texel = Insert(texel, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
+		texel = Insert(texel, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
+		texel = Insert(texel, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
+		break;
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+		texel = Insert(texel, Int(*Pointer<Byte>(element + 3)), 3);
+		texel = Insert(texel, Int(*Pointer<Byte>(element + 2)), 2);
+		// fallthrough
+	case VK_FORMAT_R8G8_UINT:
+		texel = Insert(texel, Int(*Pointer<Byte>(element + 1)), 1);
+		// fallthrough
+	case VK_FORMAT_S8_UINT:
+	case VK_FORMAT_R8_UINT:
+		texel = Insert(texel, Int(*Pointer<Byte>(element)), 0);
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+		texel = Insert(texel, Int(*Pointer<Short>(element + 6)), 3);
+		texel = Insert(texel, Int(*Pointer<Short>(element + 4)), 2);
+		// fallthrough
+	case VK_FORMAT_R16G16_SINT:
+		texel = Insert(texel, Int(*Pointer<Short>(element + 2)), 1);
+		// fallthrough
+	case VK_FORMAT_R16_SINT:
+		texel = Insert(texel, Int(*Pointer<Short>(element)), 0);
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+		texel = Insert(texel, Int(*Pointer<UShort>(element + 6)), 3);
+		texel = Insert(texel, Int(*Pointer<UShort>(element + 4)), 2);
+		// fallthrough
+	case VK_FORMAT_R16G16_UINT:
+		texel = Insert(texel, Int(*Pointer<UShort>(element + 2)), 1);
+		// fallthrough
+	case VK_FORMAT_R16_UINT:
+		texel = Insert(texel, Int(*Pointer<UShort>(element)), 0);
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+	case VK_FORMAT_R32G32B32A32_SINT:
+		// Signed and unsigned share the same 128-bit load; the bits are kept as-is.
+		texel = *Pointer<Int4>(element);
+		break;
+	case VK_FORMAT_R32G32_UINT:
+	case VK_FORMAT_R32G32_SINT:
+		texel = Insert(texel, *Pointer<Int>(element + 4), 1);
+		// fallthrough
+	case VK_FORMAT_R32_UINT:
+	case VK_FORMAT_R32_SINT:
+		texel = Insert(texel, *Pointer<Int>(element), 0);
+		break;
+	default:
+		UNSUPPORTED("Blitter source format %d", static_cast<int>(state.sourceFormat));
+		break;
+	}
+
+	return texel;
+}
+
+void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
+{
+	bool writeR = state.writeRed;
+	bool writeG = state.writeGreen;
+	bool writeB = state.writeBlue;
+	bool writeA = state.writeAlpha;
+	bool writeRGBA = writeR && writeG && writeB && writeA;
+
+	switch(state.destFormat)
+	{
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_R8G8B8_UINT:
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_R8G8B8_USCALED:
+	case VK_FORMAT_R8G8_USCALED:
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_S8_UINT:
+		c = Min(As<UInt4>(c), UInt4(0xFF));
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16_UINT:
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+	case VK_FORMAT_R16G16B16_USCALED:
+	case VK_FORMAT_R16G16_USCALED:
+	case VK_FORMAT_R16_USCALED:
+		c = Min(As<UInt4>(c), UInt4(0xFFFF));
+		break;
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
+	case VK_FORMAT_R8G8B8_SSCALED:
+	case VK_FORMAT_R8G8_SSCALED:
+	case VK_FORMAT_R8_SSCALED:
+		c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16_SINT:
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
+	case VK_FORMAT_R16G16B16_SSCALED:
+	case VK_FORMAT_R16G16_SSCALED:
+	case VK_FORMAT_R16_SSCALED:
+		c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
+		break;
+	default:
+		break;
+	}
+
+	switch(state.destFormat)
+	{
+	case VK_FORMAT_B8G8R8A8_SINT:
+	case VK_FORMAT_B8G8R8A8_SSCALED:
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
+	case VK_FORMAT_B8G8R8_SINT:
+	case VK_FORMAT_B8G8R8_SSCALED:
+		if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
+		if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_SINT:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
+	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
+		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
+	case VK_FORMAT_R8G8B8_SINT:
+	case VK_FORMAT_R8G8B8_SSCALED:
+		if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
+	case VK_FORMAT_R8G8_SINT:
+	case VK_FORMAT_R8G8_SSCALED:
+		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
+	case VK_FORMAT_R8_SINT:
+	case VK_FORMAT_R8_SSCALED:
+		if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
+	case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
+	case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) =
+				UInt((Extract(c, 0)) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeB ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeR ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+				(UInt(Extract(c, 0) | (Extract(c, 1) << 10) | (Extract(c, 2) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
+	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
+	case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
+	case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
+		if(writeRGBA)
+		{
+			*Pointer<UInt>(element) =
+				UInt((Extract(c, 2)) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30));
+		}
+		else
+		{
+			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+			                    (writeR ? 0x3FF00000 : 0x0000) |
+			                    (writeG ? 0x000FFC00 : 0x0000) |
+			                    (writeB ? 0x000003FF : 0x0000);
+			unsigned int unmask = ~mask;
+			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+				(UInt(Extract(c, 2) | (Extract(c, 1) << 10) | (Extract(c, 0) << 20) | (Extract(c, 3) << 30)) & UInt(mask));
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_UINT:
+	case VK_FORMAT_B8G8R8A8_USCALED:
+		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+	case VK_FORMAT_B8G8R8_UINT:
+	case VK_FORMAT_B8G8R8_USCALED:
+	case VK_FORMAT_B8G8R8_SRGB:
+		if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
+		if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+	case VK_FORMAT_R8G8B8A8_UINT:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
+		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+	case VK_FORMAT_R8G8B8_UINT:
+	case VK_FORMAT_R8G8B8_USCALED:
+		if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
+	case VK_FORMAT_R8G8_UINT:
+	case VK_FORMAT_R8G8_USCALED:
+		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
+	case VK_FORMAT_R8_UINT:
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_S8_UINT:
+		if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_R16G16B16A16_SINT:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
+		if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
+	case VK_FORMAT_R16G16B16_SINT:
+	case VK_FORMAT_R16G16B16_SSCALED:
+		if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16_SSCALED:
+		if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16_SSCALED:
+		if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_R16G16B16A16_UINT:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+		if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
+	case VK_FORMAT_R16G16B16_UINT:
+	case VK_FORMAT_R16G16B16_USCALED:
+		if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
+	case VK_FORMAT_R16G16_UINT:
+	case VK_FORMAT_R16G16_USCALED:
+		if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
+	case VK_FORMAT_R16_UINT:
+	case VK_FORMAT_R16_USCALED:
+		if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
+		break;
+	case VK_FORMAT_R32G32B32A32_SINT:
+		if(writeRGBA)
+		{
+			*Pointer<Int4>(element) = c;
+		}
+		else
+		{
+			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+			if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_SINT:
+		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+		if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+		break;
+	case VK_FORMAT_R32G32_SINT:
+		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+		break;
+	case VK_FORMAT_R32_SINT:
+		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+		break;
+	case VK_FORMAT_R32G32B32A32_UINT:
+		if(writeRGBA)
+		{
+			*Pointer<UInt4>(element) = As<UInt4>(c);
+		}
+		else
+		{
+			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
+		}
+		break;
+	case VK_FORMAT_R32G32B32_UINT:
+		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+	case VK_FORMAT_R32G32_UINT:
+		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+	case VK_FORMAT_R32_UINT:
+		if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+		break;
+	default:
+		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
+	}
+}
+
+void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled) // Rescales 'value' in place from the source format's scale to the destination format's scale, with optional sRGB conversion and clamping.
+{
+	float4 scale{}, unscale{}; // 'unscale' is derived from the source format, 'scale' from the destination format.
+
+	if(state.clearOperation &&
+	   state.sourceFormat.isNonNormalizedInteger() &&
+	   !state.destFormat.isNonNormalizedInteger())
+	{
+		// If we're clearing a buffer from an int or uint color into a normalized color,
+		// then the whole range of the int or uint color must be scaled between 0 and 1.
+		switch(state.sourceFormat)
+		{
+		case VK_FORMAT_R32G32B32A32_SINT:
+			unscale = replicate(static_cast<float>(0x7FFFFFFF));
+			break;
+		case VK_FORMAT_R32G32B32A32_UINT:
+			unscale = replicate(static_cast<float>(0xFFFFFFFF));
+			break;
+		default:
+			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
+		}
+	}
+	else
+	{
+		unscale = state.sourceFormat.getScale();
+	}
+
+	scale = state.destFormat.getScale();
+
+	bool srcSRGB = state.sourceFormat.isSRGBformat();
+	bool dstSRGB = state.destFormat.isSRGBformat();
+
+	if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))   // One of the formats is sRGB encoded.
+	{
+		value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) : // Unapply scale
+		                     Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w); // Apply unscale
+		value = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : LinearToSRGB(value); // Decode if the source is sRGB and not yet converted; otherwise the destination is sRGB, so encode.
+		value *= Float4(scale.x, scale.y, scale.z, scale.w); // Apply scale
+	}
+	else if(unscale != scale) // No sRGB step: divide out 'unscale' and multiply in 'scale' at once. NOTE(review): preScaled is not tested here; if a caller passes preScaled = true with a non-sRGB destination and scale != unscale, this ratio is applied to an already-scaled value - confirm.
+	{
+		value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
+	}
+
+	if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat()) // Float source written to a non-float destination: clamp each component.
+	{
+		value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w)); // Upper bound is 'scale'.
+
+		value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x, // Lower bound is 0 for unsigned components, -scale otherwise.
+		                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
+		                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
+		                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
+	}
+}
+
+Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes) // Offset of element (x, y) relative to the start of a 2D slice.
+{
+	return y * pitchB + x * bytes; // y rows of 'pitchB' each, plus x elements of 'bytes' each.
+}
+
+Float4 Blitter::LinearToSRGB(Float4 &c) // Returns 'c' with x, y and z sRGB-encoded; w is copied through.
+{
+	Float4 lc = Min(c, Float4(0.0031308f)) * Float4(12.92f); // Linear segment: 12.92 * c, with the input capped at 0.0031308.
+	Float4 ec = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f); // Power segment: 1.055 * c^(1/2.4) - 0.055.
+
+	Float4 s = c; // Start from 'c' so that w keeps its input value.
+	s.xyz = Max(lc, ec); // Per-component maximum of the two segments.
+
+	return s;
+}
+
+Float4 Blitter::sRGBtoLinear(Float4 &c) // Returns 'c' with x, y and z sRGB-decoded; w is copied through.
+{
+	Float4 lc = c * Float4(1.0f / 12.92f); // Linear segment: c / 12.92.
+	Float4 ec = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f)); // Power segment: ((c + 0.055) / 1.055)^2.4.
+
+	Int4 linear = CmpLT(c, Float4(0.04045f)); // Per-component mask used below to pick 'lc' where c < 0.04045 and 'ec' elsewhere.
+
+	Float4 s = c; // Start from 'c' so that w keeps its input value.
+	s.xyz = As<Float4>((linear & As<Int4>(lc)) | (~linear & As<Int4>(ec)));   // TODO: IfThenElse()
+
+	return s;
+}
+
+Blitter::BlitRoutineType Blitter::generate(const State &state) // Generates the routine that performs the blit or clear described by 'state'.
+{
+	BlitFunction function;
+	{
+		Pointer<Byte> blit(function.Arg<0>()); // Argument 0 is the base address for the BlitData field loads below.
+
+		Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,source));
+		Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,dest));
+		Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData,sPitchB));
+		Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData,dPitchB));
+
+		Float x0 = *Pointer<Float>(blit + OFFSET(BlitData,x0));
+		Float y0 = *Pointer<Float>(blit + OFFSET(BlitData,y0));
+		Float w = *Pointer<Float>(blit + OFFSET(BlitData,w));
+		Float h = *Pointer<Float>(blit + OFFSET(BlitData,h));
+
+		Int x0d = *Pointer<Int>(blit + OFFSET(BlitData,x0d));
+		Int x1d = *Pointer<Int>(blit + OFFSET(BlitData,x1d));
+		Int y0d = *Pointer<Int>(blit + OFFSET(BlitData,y0d));
+		Int y1d = *Pointer<Int>(blit + OFFSET(BlitData,y1d));
+
+		Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData,sWidth));
+		Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData,sHeight));
+
+		bool intSrc = state.sourceFormat.isNonNormalizedInteger();
+		bool intDst = state.destFormat.isNonNormalizedInteger();
+		bool intBoth = intSrc && intDst;
+		int srcBytes = state.sourceFormat.bytes();
+		int dstBytes = state.destFormat.bytes();
+
+		bool hasConstantColorI = false;
+		Int4 constantColorI;
+		bool hasConstantColorF = false;
+		Float4 constantColorF;
+		if(state.clearOperation) // Clear: read the single source color once, before the loops.
+		{
+			if(intBoth) // Integer types
+			{
+				constantColorI = readInt4(source, state);
+				hasConstantColorI = true;
+			}
+			else
+			{
+				constantColorF = readFloat4(source, state);
+				hasConstantColorF = true;
+
+				ApplyScaleAndClamp(constantColorF, state);
+			}
+		}
+
+		For(Int j = y0d, j < y1d, j++) // Destination rows j in [y0d, y1d).
+		{
+			Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h; // Source y: fixed at y0 for clears, otherwise y0 + j * h.
+			Pointer<Byte> destLine = dest + j * dPitchB;
+
+			For(Int i = x0d, i < x1d, i++) // Destination columns i in [x0d, x1d).
+			{
+				Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w; // Source x: fixed at x0 for clears, otherwise x0 + i * w.
+				Pointer<Byte> d = destLine + i * dstBytes;
+
+				if(hasConstantColorI)
+				{
+					for(int s = 0; s < state.destSamples; s++) // One write per destination sample, advancing by dSliceB each time.
+					{
+						write(constantColorI, d, state);
+
+						d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+					}
+				}
+				else if(hasConstantColorF)
+				{
+					for(int s = 0; s < state.destSamples; s++)
+					{
+						write(constantColorF, d, state);
+
+						d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+					}
+				}
+				else if(intBoth) // Integer types do not support filtering
+				{
+					Int X = Int(x);
+					Int Y = Int(y);
+
+					if(state.clampToEdge)
+					{
+						X = Clamp(X, 0, sWidth - 1);
+						Y = Clamp(Y, 0, sHeight - 1);
+					}
+
+					Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes);
+
+					// When both formats are true integer types, we don't go to float to avoid losing precision
+					Int4 color = readInt4(s, state);
+					for(int s = 0; s < state.destSamples; s++)
+					{
+						write(color, d, state);
+
+						d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
+					}
 				}
 				else
 				{
-					constantColorF = readFloat4(source, state);
-					hasConstantColorF = true;
+					Float4 color;
 
-					ApplyScaleAndClamp(constantColorF, state);
-				}
-			}
-
-			For(Int j = y0d, j < y1d, j++)
-			{
-				Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
-				Pointer<Byte> destLine = dest + j * dPitchB;
-
-				For(Int i = x0d, i < x1d, i++)
-				{
-					Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
-					Pointer<Byte> d = destLine + i * dstBytes;
-
-					if(hasConstantColorI)
-					{
-						for(int s = 0; s < state.destSamples; s++)
-						{
-							write(constantColorI, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
-						}
-					}
-					else if(hasConstantColorF)
-					{
-						for(int s = 0; s < state.destSamples; s++)
-						{
-							write(constantColorF, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
-						}
-					}
-					else if(intBoth) // Integer types do not support filtering
+					bool preScaled = false; // Set once ApplyScaleAndClamp has already been applied to the inputs below.
+					if(!state.filter || intSrc) // Read at the integer-converted coordinate; no interpolation between texels.
 					{
 						Int X = Int(x);
 						Int Y = Int(y);
@@ -1465,611 +1492,585 @@
 
 						Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes);
 
-						// When both formats are true integer types, we don't go to float to avoid losing precision
-						Int4 color = readInt4(s, state);
-						for(int s = 0; s < state.destSamples; s++)
+						color = readFloat4(s, state);
+
+						if(state.srcSamples > 1) // Resolve multisampled source
 						{
-							write(color, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
-						}
-					}
-					else
-					{
-						Float4 color;
-
-						bool preScaled = false;
-						if(!state.filter || intSrc)
-						{
-							Int X = Int(x);
-							Int Y = Int(y);
-
-							if(state.clampToEdge)
+							if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
 							{
-								X = Clamp(X, 0, sWidth - 1);
-								Y = Clamp(Y, 0, sHeight - 1);
+								ApplyScaleAndClamp(color, state);
+								preScaled = true;
 							}
-
-							Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes);
-
-							color = readFloat4(s, state);
-
-							if(state.srcSamples > 1) // Resolve multisampled source
+							Float4 accum = color; // Running sum, seeded with sample 0.
+							for(int sample = 1; sample < state.srcSamples; sample++)
 							{
+								s += *Pointer<Int>(blit + OFFSET(BlitData, sSliceB)); // Consecutive source samples are sSliceB apart.
+								color = readFloat4(s, state);
+
 								if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
 								{
 									ApplyScaleAndClamp(color, state);
 									preScaled = true;
 								}
-								Float4 accum = color;
-								for(int sample = 1; sample < state.srcSamples; sample++)
-								{
-									s += *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
-									color = readFloat4(s, state);
-
-									if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
-									{
-										ApplyScaleAndClamp(color, state);
-										preScaled = true;
-									}
-									accum += color;
-								}
-								color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
+								accum += color;
 							}
+							color = accum * Float4(1.0f / static_cast<float>(state.srcSamples)); // Average of all source samples.
 						}
-						else   // Bilinear filtering
+					}
+					else   // Bilinear filtering
+					{
+						Float X = x;
+						Float Y = y;
+
+						if(state.clampToEdge)
 						{
-							Float X = x;
-							Float Y = y;
-
-							if(state.clampToEdge)
-							{
-								X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
-								Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
-							}
-
-							Float x0 = X - 0.5f;
-							Float y0 = Y - 0.5f;
-
-							Int X0 = Max(Int(x0), 0);
-							Int Y0 = Max(Int(y0), 0);
-
-							Int X1 = X0 + 1;
-							Int Y1 = Y0 + 1;
-							X1 = IfThenElse(X1 >= sWidth, X0, X1);
-							Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
-
-							Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, sPitchB, srcBytes);
-							Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, sPitchB, srcBytes);
-							Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, sPitchB, srcBytes);
-							Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, sPitchB, srcBytes);
-
-							Float4 c00 = readFloat4(s00, state);
-							Float4 c01 = readFloat4(s01, state);
-							Float4 c10 = readFloat4(s10, state);
-							Float4 c11 = readFloat4(s11, state);
-
-							if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
-							{
-								ApplyScaleAndClamp(c00, state);
-								ApplyScaleAndClamp(c01, state);
-								ApplyScaleAndClamp(c10, state);
-								ApplyScaleAndClamp(c11, state);
-								preScaled = true;
-							}
-
-							Float4 fx = Float4(x0 - Float(X0));
-							Float4 fy = Float4(y0 - Float(Y0));
-							Float4 ix = Float4(1.0f) - fx;
-							Float4 iy = Float4(1.0f) - fy;
-
-							color = (c00 * ix + c01 * fx) * iy +
-							        (c10 * ix + c11 * fx) * fy;
+							X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
+							Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
 						}
 
-						ApplyScaleAndClamp(color, state, preScaled);
+						Float x0 = X - 0.5f; // Shadows the outer x0 loaded from BlitData.
+						Float y0 = Y - 0.5f; // Shadows the outer y0 loaded from BlitData.
 
-						for(int s = 0; s < state.destSamples; s++)
+						Int X0 = Max(Int(x0), 0);
+						Int Y0 = Max(Int(y0), 0);
+
+						Int X1 = X0 + 1;
+						Int Y1 = Y0 + 1;
+						X1 = IfThenElse(X1 >= sWidth, X0, X1); // Reuse X0 when X0 + 1 would fall outside the source width.
+						Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1); // Reuse Y0 when Y0 + 1 would fall outside the source height.
+
+						Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, sPitchB, srcBytes);
+						Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, sPitchB, srcBytes);
+						Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, sPitchB, srcBytes);
+						Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, sPitchB, srcBytes);
+
+						Float4 c00 = readFloat4(s00, state);
+						Float4 c01 = readFloat4(s01, state);
+						Float4 c10 = readFloat4(s10, state);
+						Float4 c11 = readFloat4(s11, state);
+
+						if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
 						{
-							write(color, d, state);
-
-							d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
+							ApplyScaleAndClamp(c00, state);
+							ApplyScaleAndClamp(c01, state);
+							ApplyScaleAndClamp(c10, state);
+							ApplyScaleAndClamp(c11, state);
+							preScaled = true;
 						}
+
+						Float4 fx = Float4(x0 - Float(X0)); // Horizontal weight of the X1 column.
+						Float4 fy = Float4(y0 - Float(Y0)); // Vertical weight of the Y1 row.
+						Float4 ix = Float4(1.0f) - fx;
+						Float4 iy = Float4(1.0f) - fy;
+
+						color = (c00 * ix + c01 * fx) * iy + // Weighted sum of the four texels.
+						        (c10 * ix + c11 * fx) * fy;
+					}
+
+					ApplyScaleAndClamp(color, state, preScaled); // Final conversion; preScaled is true if the inputs already went through ApplyScaleAndClamp above.
+
+					for(int s = 0; s < state.destSamples; s++)
+					{
+						write(color, d, state);
+
+						d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
 					}
 				}
 			}
 		}
-
-		return function("BlitRoutine");
 	}
 
-	Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
+	return function("BlitRoutine");
+}
+
+Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state) // Returns the blit routine for 'state', generating and caching it on first use.
+{
+	std::unique_lock<std::mutex> lock(blitMutex); // Held until return, so it covers the query, generate() and the cache insertion.
+	auto blitRoutine = blitCache.query(state);
+
+	if(!blitRoutine) // Nothing cached for this state yet.
 	{
-		std::unique_lock<std::mutex> lock(blitMutex);
-		auto blitRoutine = blitCache.query(state);
-
-		if(!blitRoutine)
-		{
-			blitRoutine = generate(state);
-			blitCache.add(state, blitRoutine);
-		}
-
-		return blitRoutine;
+		blitRoutine = generate(state);
+		blitCache.add(state, blitRoutine);
 	}
 
-	Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
+	return blitRoutine;
+}
+
+Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state) // Returns the corner update routine for 'state', generating and caching it on first use.
+{
+	std::unique_lock<std::mutex> lock(cornerUpdateMutex); // Held until return, so it covers the query, generateCornerUpdate() and the cache insertion.
+	auto cornerUpdateRoutine = cornerUpdateCache.query(state);
+
+	if(!cornerUpdateRoutine) // Nothing cached for this state yet.
 	{
-		std::unique_lock<std::mutex> lock(cornerUpdateMutex);
-		auto cornerUpdateRoutine = cornerUpdateCache.query(state);
-
-		if(!cornerUpdateRoutine)
-		{
-			cornerUpdateRoutine = generateCornerUpdate(state);
-			cornerUpdateCache.add(state, cornerUpdateRoutine);
-		}
-
-		return cornerUpdateRoutine;
+		cornerUpdateRoutine = generateCornerUpdate(state);
+		cornerUpdateCache.add(state, cornerUpdateRoutine);
 	}
 
-	void Blitter::blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch)
+	return cornerUpdateRoutine;
+}
+
+void Blitter::blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch)
+{
+	auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
+	auto format = src->getFormat(aspect);
+	State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});
+
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
 	{
-		auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
-		auto format = src->getFormat(aspect);
-		State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});
-
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
-
-		BlitData data =
-		{
-			nullptr, // source
-			dst, // dest
-			src->rowPitchBytes(aspect, subresource.mipLevel),   // sPitchB
-			bufferRowPitch,   // dPitchB
-			src->slicePitchBytes(aspect, subresource.mipLevel), // sSliceB
-			bufferSlicePitch, // dSliceB
-
-			0, 0, 1, 1,
-
-			0, // y0d
-			static_cast<int>(extent.height), // y1d
-			0, // x0d
-			static_cast<int>(extent.width), // x1d
-
-			static_cast<int>(extent.width), // sWidth
-			static_cast<int>(extent.height) // sHeight;
-		};
-
-		VkOffset3D srcOffset = { 0, 0, offset.z };
-
-		VkImageSubresourceLayers srcSubresLayers = subresource;
-		srcSubresLayers.layerCount = 1;
-
-		VkImageSubresourceRange srcSubresRange =
-		{
-			subresource.aspectMask,
-			subresource.mipLevel,
-			1,
-			subresource.baseArrayLayer,
-			subresource.layerCount
-		};
-
-		uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
-
-		for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++)
-		{
-			srcOffset.z = offset.z;
-
-			for(auto i = 0u; i < extent.depth; i++)
-			{
-				data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
-				ASSERT(data.source < src->end());
-				blitRoutine(&data);
-				srcOffset.z++;
-				data.dest = (dst += bufferSlicePitch);
-			}
-		}
+		return;
 	}
 
-	void Blitter::blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch)
+	BlitData data =
 	{
-		auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
-		auto format = dst->getFormat(aspect);
-		State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});
+		nullptr, // source
+		dst, // dest
+		src->rowPitchBytes(aspect, subresource.mipLevel),   // sPitchB
+		bufferRowPitch,   // dPitchB
+		src->slicePitchBytes(aspect, subresource.mipLevel), // sSliceB
+		bufferSlicePitch, // dSliceB
 
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
+		0, 0, 1, 1,
 
-		BlitData data =
-		{
-			src, // source
-			nullptr, // dest
-			bufferRowPitch,   // sPitchB
-			dst->rowPitchBytes(aspect, subresource.mipLevel),   // dPitchB
-			bufferSlicePitch, // sSliceB
-			dst->slicePitchBytes(aspect, subresource.mipLevel), // dSliceB
+		0, // y0d
+		static_cast<int>(extent.height), // y1d
+		0, // x0d
+		static_cast<int>(extent.width), // x1d
 
-			static_cast<float>(-offset.x), // x0
-			static_cast<float>(-offset.y), // y0
-			1.0f, // w
-			1.0f, // h
+		static_cast<int>(extent.width), // sWidth
+		static_cast<int>(extent.height) // sHeight;
+	};
 
-			offset.y, // y0d
-			static_cast<int>(offset.y + extent.height), // y1d
-			offset.x, // x0d
-			static_cast<int>(offset.x + extent.width), // x1d
+	VkOffset3D srcOffset = { 0, 0, offset.z };
 
-			static_cast<int>(extent.width), // sWidth
-			static_cast<int>(extent.height) // sHeight;
-		};
+	VkImageSubresourceLayers srcSubresLayers = subresource;
+	srcSubresLayers.layerCount = 1;
 
-		VkOffset3D dstOffset = { 0, 0, offset.z };
-
-		VkImageSubresourceLayers dstSubresLayers = subresource;
-		dstSubresLayers.layerCount = 1;
-
-		VkImageSubresourceRange dstSubresRange =
-		{
-			subresource.aspectMask,
-			subresource.mipLevel,
-			1,
-			subresource.baseArrayLayer,
-			subresource.layerCount
-		};
-
-		uint32_t lastLayer = dst->getLastLayerIndex(dstSubresRange);
-
-		for(; dstSubresLayers.baseArrayLayer <= lastLayer; dstSubresLayers.baseArrayLayer++)
-		{
-			dstOffset.z = offset.z;
-
-			for(auto i = 0u; i < extent.depth; i++)
-			{
-				data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
-				ASSERT(data.dest < dst->end());
-				blitRoutine(&data);
-				dstOffset.z++;
-				data.source = (src += bufferSlicePitch);
-			}
-		}
-	}
-
-	void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter)
+	VkImageSubresourceRange srcSubresRange =
 	{
-		if(dst->getFormat() == VK_FORMAT_UNDEFINED)
-		{
-			return;
-		}
+		subresource.aspectMask,
+		subresource.mipLevel,
+		1,
+		subresource.baseArrayLayer,
+		subresource.layerCount
+	};
 
-		if((region.srcSubresource.layerCount != region.dstSubresource.layerCount) ||
-		   (region.srcSubresource.aspectMask != region.dstSubresource.aspectMask))
-		{
-			UNIMPLEMENTED("region");
-		}
+	uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
 
-		if(region.dstOffsets[0].x > region.dstOffsets[1].x)
-		{
-			std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
-			std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
-		}
-
-		if(region.dstOffsets[0].y > region.dstOffsets[1].y)
-		{
-			std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
-			std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
-		}
-
-		VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
-		VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
-		VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);
-
-		int32_t numSlices = (region.srcOffsets[1].z - region.srcOffsets[0].z);
-		ASSERT(numSlices == (region.dstOffsets[1].z - region.dstOffsets[0].z));
-
-		float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
-		                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
-		float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
-		                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
-		float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
-		float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
-
-		auto srcFormat = src->getFormat(srcAspect);
-		auto dstFormat = dst->getFormat(dstAspect);
-
-		bool doFilter = (filter != VK_FILTER_NEAREST);
-		bool allowSRGBConversion =
-			doFilter ||
-			(src->getSampleCountFlagBits() > 1) ||
-			(srcFormat.isSRGBformat() != dstFormat.isSRGBformat());
-
-		State state(src->getFormat(srcAspect), dst->getFormat(dstAspect), src->getSampleCountFlagBits(), dst->getSampleCountFlagBits(),
-		            Options{ doFilter, allowSRGBConversion });
-		state.clampToEdge = (region.srcOffsets[0].x < 0) ||
-		                    (region.srcOffsets[0].y < 0) ||
-		                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
-		                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
-		                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
-
-		auto blitRoutine = getBlitRoutine(state);
-		if(!blitRoutine)
-		{
-			return;
-		}
-
-		BlitData data =
-		{
-			nullptr, // source
-			nullptr, // dest
-			src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel),   // sPitchB
-			dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel),   // dPitchB
-			src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel), // sSliceB
-			dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel), // dSliceB
-
-			x0,
-			y0,
-			widthRatio,
-			heightRatio,
-
-			region.dstOffsets[0].y, // y0d
-			region.dstOffsets[1].y, // y1d
-			region.dstOffsets[0].x, // x0d
-			region.dstOffsets[1].x, // x1d
-
-			static_cast<int>(srcExtent.width), // sWidth
-			static_cast<int>(srcExtent.height) // sHeight;
-		};
-
-		VkOffset3D srcOffset = { 0, 0, region.srcOffsets[0].z };
-		VkOffset3D dstOffset = { 0, 0, region.dstOffsets[0].z };
-
-		VkImageSubresourceLayers srcSubresLayers =
-		{
-			region.srcSubresource.aspectMask,
-			region.srcSubresource.mipLevel,
-			region.srcSubresource.baseArrayLayer,
-			1
-		};
-
-		VkImageSubresourceLayers dstSubresLayers =
-		{
-			region.dstSubresource.aspectMask,
-			region.dstSubresource.mipLevel,
-			region.dstSubresource.baseArrayLayer,
-			1
-		};
-
-		VkImageSubresourceRange srcSubresRange =
-		{
-			region.srcSubresource.aspectMask,
-			region.srcSubresource.mipLevel,
-			1,
-			region.srcSubresource.baseArrayLayer,
-			region.srcSubresource.layerCount
-		};
-
-		uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
-
-		for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++, dstSubresLayers.baseArrayLayer++)
-		{
-			srcOffset.z = region.srcOffsets[0].z;
-			dstOffset.z = region.dstOffsets[0].z;
-
-			for(int i = 0; i < numSlices; i++)
-			{
-				data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
-				data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
-
-				ASSERT(data.source < src->end());
-				ASSERT(data.dest < dst->end());
-
-				blitRoutine(&data);
-				srcOffset.z++;
-				dstOffset.z++;
-			}
-		}
-	}
-
-	void Blitter::computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state)
+	for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++)
 	{
-		int bytes = state.sourceFormat.bytes();
+		srcOffset.z = offset.z;
 
-		Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
-		           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
-		           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);
-
-		c *= Float4(1.0f / 3.0f);
-
-		write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
-	}
-
-	Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State& state)
-	{
-		// Reading and writing from/to the same image
-		ASSERT(state.sourceFormat == state.destFormat);
-		ASSERT(state.srcSamples == state.destSamples);
-
-		if(state.srcSamples != 1)
+		for(auto i = 0u; i < extent.depth; i++)
 		{
-			UNIMPLEMENTED("state.srcSamples %d", state.srcSamples);
-		}
-
-		CornerUpdateFunction function;
-		{
-			Pointer<Byte> blit(function.Arg<0>());
-
-			Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
-			Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
-			UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
-			UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));
-
-			// Low Border, Low Pixel, High Border, High Pixel
-			Int LB(-1), LP(0), HB(dim), HP(dim-1);
-
-			for(int face = 0; face < 6; face++)
-			{
-				computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
-				computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
-				computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
-				computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
-				layers = layers + layerSize;
-			}
-		}
-
-		return function("BlitRoutine");
-	}
-
-	void Blitter::updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers)
-	{
-		if(image->getArrayLayers() < (subresourceLayers.baseArrayLayer + 6))
-		{
-			UNIMPLEMENTED("image->getArrayLayers() %d, baseArrayLayer %d",
-			              image->getArrayLayers(), subresourceLayers.baseArrayLayer);
-		}
-
-		// From Vulkan 1.1 spec, section 11.5. Image Views:
-		// "For cube and cube array image views, the layers of the image view starting
-		//  at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
-		VkImageSubresourceLayers posX = subresourceLayers;
-		posX.layerCount = 1;
-		VkImageSubresourceLayers negX = posX;
-		negX.baseArrayLayer++;
-		VkImageSubresourceLayers posY = negX;
-		posY.baseArrayLayer++;
-		VkImageSubresourceLayers negY = posY;
-		negY.baseArrayLayer++;
-		VkImageSubresourceLayers posZ = negY;
-		posZ.baseArrayLayer++;
-		VkImageSubresourceLayers negZ = posZ;
-		negZ.baseArrayLayer++;
-
-		// Copy top / bottom
-		copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
-		copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
-		copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
-		copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
-		copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
-		copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);
-
-		copyCubeEdge(image, posX, TOP, posY, RIGHT);
-		copyCubeEdge(image, posY, TOP, negZ, TOP);
-		copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
-		copyCubeEdge(image, negX, TOP, posY, LEFT);
-		copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
-		copyCubeEdge(image, negZ, TOP, posY, TOP);
-
-		// Copy left / right
-		copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
-		copyCubeEdge(image, posY, RIGHT, posX, TOP);
-		copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
-		copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
-		copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
-		copyCubeEdge(image, negZ, RIGHT, negX, LEFT);
-
-		copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
-		copyCubeEdge(image, posY, LEFT, negX, TOP);
-		copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
-		copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
-		copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
-		copyCubeEdge(image, negZ, LEFT, posX, RIGHT);
-
-		// Compute corner colors
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceLayers.aspectMask);
-		vk::Format format = image->getFormat(aspect);
-		VkSampleCountFlagBits samples = image->getSampleCountFlagBits();
-		State state(format, format, samples, samples, Options{ 0xF });
-
-		if(samples != VK_SAMPLE_COUNT_1_BIT)
-		{
-			UNIMPLEMENTED("Multi-sampled cube: %d samples", static_cast<int>(samples));
-		}
-
-		auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
-		if(!cornerUpdateRoutine)
-		{
-			return;
-		}
-
-		VkExtent3D extent = image->getMipLevelExtent(aspect, subresourceLayers.mipLevel);
-		CubeBorderData data =
-		{
-			image->getTexelPointer({ 0, 0, 0 }, posX),
-			image->rowPitchBytes(aspect, subresourceLayers.mipLevel),
-			static_cast<uint32_t>(image->getLayerSize(aspect)),
-			extent.width
-		};
-		cornerUpdateRoutine(&data);
-	}
-
-	void Blitter::copyCubeEdge(vk::Image* image,
-	                           const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
-	                           const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge)
-	{
-		ASSERT(srcSubresourceLayers.aspectMask == dstSubresourceLayers.aspectMask);
-		ASSERT(srcSubresourceLayers.mipLevel == dstSubresourceLayers.mipLevel);
-		ASSERT(srcSubresourceLayers.baseArrayLayer != dstSubresourceLayers.baseArrayLayer);
-		ASSERT(srcSubresourceLayers.layerCount == 1);
-		ASSERT(dstSubresourceLayers.layerCount == 1);
-
-		// Figure out if the edges to be copied in reverse order respectively from one another
-		// The copy should be reversed whenever the same edges are contiguous or if we're
-		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
-		//
-		//      | +y |
-		// | -x | +z | +x | -z |
-		//      | -y |
-
-		bool reverse = (srcEdge == dstEdge) ||
-		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
-		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
-		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
-		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
-
-		VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresourceLayers.aspectMask);
-		int bytes = image->getFormat(aspect).bytes();
-		int pitchB = image->rowPitchBytes(aspect, srcSubresourceLayers.mipLevel);
-
-		VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresourceLayers.mipLevel);
-		int w = extent.width;
-		int h = extent.height;
-		if(w != h)
-		{
-			UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
-		}
-
-		// Src is expressed in the regular [0, width-1], [0, height-1] space
-		bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
-		int srcDelta = srcHorizontal ? bytes : pitchB;
-		VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };
-
-		// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
-		bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
-		int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
-		VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };
-
-		// Don't write in the corners
-		if(dstHorizontal)
-		{
-			dstOffset.x += reverse ? w : 1;
-		}
-		else
-		{
-			dstOffset.y += reverse ? h : 1;
-		}
-
-		const uint8_t* src = static_cast<const uint8_t*>(image->getTexelPointer(srcOffset, srcSubresourceLayers));
-		uint8_t *dst = static_cast<uint8_t*>(image->getTexelPointer(dstOffset, dstSubresourceLayers));
-		ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
-		ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));
-
-		for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
-		{
-			memcpy(dst, src, bytes);
+			data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
+			ASSERT(data.source < src->end());
+			blitRoutine(&data);
+			srcOffset.z++;
+			data.dest = (dst += bufferSlicePitch);
 		}
 	}
 }
+
+void Blitter::blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch)
+{  // Copies texels from the buffer `src` into image `dst`: one blit routine call per depth slice of each array layer.
+	auto aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
+	auto format = dst->getFormat(aspect);
+	State state(format, format, VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_1_BIT, Options{false, false});  // same format on both sides; filter and allowSRGBConversion are off
+
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
+	{
+		return;  // no routine for this state: nothing is copied
+	}
+
+	BlitData data =
+	{
+		src, // source
+		nullptr, // dest (set per slice in the loop below)
+		bufferRowPitch,   // sPitchB
+		dst->rowPitchBytes(aspect, subresource.mipLevel),   // dPitchB
+		bufferSlicePitch, // sSliceB
+		dst->slicePitchBytes(aspect, subresource.mipLevel), // dSliceB
+
+		static_cast<float>(-offset.x), // x0 (presumably maps destination x == offset.x to buffer column 0 -- confirm in generate())
+		static_cast<float>(-offset.y), // y0 (same, for rows)
+		1.0f, // w (1:1, no scaling)
+		1.0f, // h
+
+		offset.y, // y0d
+		static_cast<int>(offset.y + extent.height), // y1d
+		offset.x, // x0d
+		static_cast<int>(offset.x + extent.width), // x1d
+
+		static_cast<int>(extent.width), // sWidth
+		static_cast<int>(extent.height) // sHeight
+	};
+
+	VkOffset3D dstOffset = { 0, 0, offset.z };  // x/y stay 0; the x/y region is carried by x0d..y1d above
+
+	VkImageSubresourceLayers dstSubresLayers = subresource;
+	dstSubresLayers.layerCount = 1;  // texel pointers are requested one layer at a time
+
+	VkImageSubresourceRange dstSubresRange =
+	{
+		subresource.aspectMask,  // aspectMask
+		subresource.mipLevel,  // baseMipLevel
+		1,  // levelCount
+		subresource.baseArrayLayer,  // baseArrayLayer
+		subresource.layerCount  // layerCount
+	};
+
+	uint32_t lastLayer = dst->getLastLayerIndex(dstSubresRange);
+
+	for(; dstSubresLayers.baseArrayLayer <= lastLayer; dstSubresLayers.baseArrayLayer++)
+	{
+		dstOffset.z = offset.z;  // restart at the first requested slice for every layer
+
+		for(auto i = 0u; i < extent.depth; i++)
+		{
+			data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
+			ASSERT(data.dest < dst->end());
+			blitRoutine(&data);
+			dstOffset.z++;
+			data.source = (src += bufferSlicePitch);  // next buffer slice; src is never rewound, so layers follow each other in the buffer
+		}
+	}
+}
+
+void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter)
+{  // Blits `region` from `src` to `dst` (scaling, optional filtering), one blit routine call per slice of each layer.
+	if(dst->getFormat() == VK_FORMAT_UNDEFINED)
+	{
+		return;  // destination has no defined format: nothing is written
+	}
+
+	if((region.srcSubresource.layerCount != region.dstSubresource.layerCount) ||
+	   (region.srcSubresource.aspectMask != region.dstSubresource.aspectMask))
+	{
+		UNIMPLEMENTED("region");  // differing layer counts or aspects are not handled
+	}
+
+	if(region.dstOffsets[0].x > region.dstOffsets[1].x)  // make the destination x range ascending (region is a local copy)
+	{
+		std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);  // swap the source pair too so the src<->dst pairing is unchanged
+		std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
+	}
+
+	if(region.dstOffsets[0].y > region.dstOffsets[1].y)  // same normalization for y
+	{
+		std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
+		std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
+	}
+
+	VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
+	VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
+	VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);
+
+	int32_t numSlices = (region.srcOffsets[1].z - region.srcOffsets[0].z);  // depth slices to copy; no scaling along z
+	ASSERT(numSlices == (region.dstOffsets[1].z - region.dstOffsets[0].z));
+
+	float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /  // source texels per destination texel
+	                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
+	float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
+	                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
+	float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;  // source x for destination column 0, with a half-texel center offset (presumably used as x0 + x * w -- confirm in generate())
+	float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;  // same, for rows
+
+	auto srcFormat = src->getFormat(srcAspect);
+	auto dstFormat = dst->getFormat(dstAspect);
+
+	bool doFilter = (filter != VK_FILTER_NEAREST);
+	bool allowSRGBConversion =  // enabled when filtering, when the source is multisampled, or when only one side is sRGB
+		doFilter ||
+		(src->getSampleCountFlagBits() > 1) ||
+		(srcFormat.isSRGBformat() != dstFormat.isSRGBformat());
+
+	State state(src->getFormat(srcAspect), dst->getFormat(dstAspect), src->getSampleCountFlagBits(), dst->getSampleCountFlagBits(),
+	            Options{ doFilter, allowSRGBConversion });
+	state.clampToEdge = (region.srcOffsets[0].x < 0) ||  // set when the source region leaves the mip level, or when filtering with x0/y0 below 0.5
+	                    (region.srcOffsets[0].y < 0) ||
+	                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
+	                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
+	                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
+
+	auto blitRoutine = getBlitRoutine(state);
+	if(!blitRoutine)
+	{
+		return;  // no routine for this state: nothing is copied
+	}
+
+	BlitData data =
+	{
+		nullptr, // source (set per slice in the loop below)
+		nullptr, // dest (set per slice in the loop below)
+		src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel),   // sPitchB
+		dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel),   // dPitchB
+		src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel), // sSliceB
+		dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel), // dSliceB
+
+		x0,  // x0
+		y0,  // y0
+		widthRatio,  // w
+		heightRatio,  // h
+
+		region.dstOffsets[0].y, // y0d
+		region.dstOffsets[1].y, // y1d
+		region.dstOffsets[0].x, // x0d
+		region.dstOffsets[1].x, // x1d
+
+		static_cast<int>(srcExtent.width), // sWidth
+		static_cast<int>(srcExtent.height) // sHeight
+	};
+
+	VkOffset3D srcOffset = { 0, 0, region.srcOffsets[0].z };  // x/y stay 0; the x/y mapping is carried by BlitData
+	VkOffset3D dstOffset = { 0, 0, region.dstOffsets[0].z };
+
+	VkImageSubresourceLayers srcSubresLayers =
+	{
+		region.srcSubresource.aspectMask,
+		region.srcSubresource.mipLevel,
+		region.srcSubresource.baseArrayLayer,
+		1  // one layer at a time
+	};
+
+	VkImageSubresourceLayers dstSubresLayers =
+	{
+		region.dstSubresource.aspectMask,
+		region.dstSubresource.mipLevel,
+		region.dstSubresource.baseArrayLayer,
+		1  // one layer at a time
+	};
+
+	VkImageSubresourceRange srcSubresRange =
+	{
+		region.srcSubresource.aspectMask,  // aspectMask
+		region.srcSubresource.mipLevel,  // baseMipLevel
+		1,  // levelCount
+		region.srcSubresource.baseArrayLayer,  // baseArrayLayer
+		region.srcSubresource.layerCount  // layerCount
+	};
+
+	uint32_t lastLayer = src->getLastLayerIndex(srcSubresRange);
+
+	for(; srcSubresLayers.baseArrayLayer <= lastLayer; srcSubresLayers.baseArrayLayer++, dstSubresLayers.baseArrayLayer++)  // source and destination layers advance together
+	{
+		srcOffset.z = region.srcOffsets[0].z;  // restart at the first slice for every layer
+		dstOffset.z = region.dstOffsets[0].z;
+
+		for(int i = 0; i < numSlices; i++)
+		{
+			data.source = src->getTexelPointer(srcOffset, srcSubresLayers);
+			data.dest = dst->getTexelPointer(dstOffset, dstSubresLayers);
+
+			ASSERT(data.source < src->end());
+			ASSERT(data.dest < dst->end());
+
+			blitRoutine(&data);
+			srcOffset.z++;
+			dstOffset.z++;
+		}
+	}
+}
+
+void Blitter::computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state)
+{  // Sets texel (x0, y0) of `layer` to the average of texels (x0, y1), (x1, y0) and (x1, y1).
+	int bytes = state.sourceFormat.bytes();  // texel size passed to ComputeOffset
+
+	Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
+	           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
+	           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);
+
+	c *= Float4(1.0f / 3.0f);  // sum of three texels -> average
+
+	write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
+}
+
+Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State& state)
+{  // Builds the routine that fills the four border corner texels of each of six consecutive layers.
+	// Reading and writing from/to the same image
+	ASSERT(state.sourceFormat == state.destFormat);
+	ASSERT(state.srcSamples == state.destSamples);
+
+	if(state.srcSamples != 1)
+	{
+		UNIMPLEMENTED("state.srcSamples %d", state.srcSamples);
+	}
+
+	CornerUpdateFunction function;
+	{
+		Pointer<Byte> blit(function.Arg<0>());  // the routine's CubeBorderData argument
+
+		Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
+		Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
+		UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));  // NOTE(review): field is uint32_t but is loaded through Pointer<Int>
+		UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));  // NOTE(review): same uint32_t-via-Pointer<Int> load
+
+		// Low Border, Low Pixel, High Border, High Pixel
+		Int LB(-1), LP(0), HB(dim), HP(dim-1);
+
+		for(int face = 0; face < 6; face++)
+		{
+			computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);  // corner (-1, -1)
+			computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);  // corner (-1, dim)
+			computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);  // corner (dim, -1)
+			computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);  // corner (dim, dim)
+			layers = layers + layerSize;  // step to the next layer
+		}
+	}
+
+	return function("BlitRoutine");  // NOTE(review): named "BlitRoutine" although this is the corner update routine -- confirm intended
+}
+
+void Blitter::updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers)
+{  // Fills the border texels of six consecutive cube face layers from their neighbouring faces, then computes the corner texels.
+	if(image->getArrayLayers() < (subresourceLayers.baseArrayLayer + 6))  // six layers are needed starting at baseArrayLayer
+	{
+		UNIMPLEMENTED("image->getArrayLayers() %d, baseArrayLayer %d",
+		              image->getArrayLayers(), subresourceLayers.baseArrayLayer);
+	}
+
+	// From Vulkan 1.1 spec, section 11.5. Image Views:
+	// "For cube and cube array image views, the layers of the image view starting
+	//  at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
+	VkImageSubresourceLayers posX = subresourceLayers;
+	posX.layerCount = 1;  // each face is addressed as a single layer
+	VkImageSubresourceLayers negX = posX;
+	negX.baseArrayLayer++;
+	VkImageSubresourceLayers posY = negX;
+	posY.baseArrayLayer++;
+	VkImageSubresourceLayers negY = posY;
+	negY.baseArrayLayer++;
+	VkImageSubresourceLayers posZ = negY;
+	posZ.baseArrayLayer++;
+	VkImageSubresourceLayers negZ = posZ;
+	negZ.baseArrayLayer++;
+
+	// Copy top / bottom. Argument order: copyCubeEdge(image, dstFace, dstEdge, srcFace, srcEdge)
+	copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
+	copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
+	copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
+	copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
+	copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
+	copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);
+
+	copyCubeEdge(image, posX, TOP, posY, RIGHT);
+	copyCubeEdge(image, posY, TOP, negZ, TOP);
+	copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
+	copyCubeEdge(image, negX, TOP, posY, LEFT);
+	copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
+	copyCubeEdge(image, negZ, TOP, posY, TOP);
+
+	// Copy left / right (same argument order as above)
+	copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
+	copyCubeEdge(image, posY, RIGHT, posX, TOP);
+	copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
+	copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
+	copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
+	copyCubeEdge(image, negZ, RIGHT, negX, LEFT);
+
+	copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
+	copyCubeEdge(image, posY, LEFT, negX, TOP);
+	copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
+	copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
+	copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
+	copyCubeEdge(image, negZ, LEFT, posX, RIGHT);
+
+	// Compute corner colors
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceLayers.aspectMask);
+	vk::Format format = image->getFormat(aspect);
+	VkSampleCountFlagBits samples = image->getSampleCountFlagBits();
+	State state(format, format, samples, samples, Options{ 0xF });  // Options(writeMask) constructor: writeMask 0xF; it also sets clearOperation
+
+	if(samples != VK_SAMPLE_COUNT_1_BIT)
+	{
+		UNIMPLEMENTED("Multi-sampled cube: %d samples", static_cast<int>(samples));
+	}
+
+	auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
+	if(!cornerUpdateRoutine)
+	{
+		return;  // no routine for this state: corners are left as they are
+	}
+
+	VkExtent3D extent = image->getMipLevelExtent(aspect, subresourceLayers.mipLevel);
+	CubeBorderData data =
+	{
+		image->getTexelPointer({ 0, 0, 0 }, posX),  // layers: texel (0, 0, 0) of the first face
+		image->rowPitchBytes(aspect, subresourceLayers.mipLevel),  // pitchB
+		static_cast<uint32_t>(image->getLayerSize(aspect)),  // layerSize
+		extent.width  // dim
+	};
+	cornerUpdateRoutine(&data);
+}
+
+void Blitter::copyCubeEdge(vk::Image* image,
+                           const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
+                           const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge)
+{  // Copies the w texels along srcEdge of the source layer into the border row/column just outside dstEdge of the destination layer.
+	ASSERT(srcSubresourceLayers.aspectMask == dstSubresourceLayers.aspectMask);
+	ASSERT(srcSubresourceLayers.mipLevel == dstSubresourceLayers.mipLevel);
+	ASSERT(srcSubresourceLayers.baseArrayLayer != dstSubresourceLayers.baseArrayLayer);
+	ASSERT(srcSubresourceLayers.layerCount == 1);
+	ASSERT(dstSubresourceLayers.layerCount == 1);
+
+	// Figure out whether the source edge must be copied in reverse order relative to the
+	// destination edge. The copy is reversed whenever the same edges are contiguous or if we're
+	// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
+	//
+	//      | +y |
+	// | -x | +z | +x | -z |
+	//      | -y |
+
+	bool reverse = (srcEdge == dstEdge) ||
+	               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
+	               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
+	               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
+	               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
+
+	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresourceLayers.aspectMask);
+	int bytes = image->getFormat(aspect).bytes();  // size of one texel; also the memcpy size below
+	int pitchB = image->rowPitchBytes(aspect, srcSubresourceLayers.mipLevel);
+
+	VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresourceLayers.mipLevel);
+	int w = extent.width;
+	int h = extent.height;
+	if(w != h)
+	{
+		UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
+	}
+
+	// Src is expressed in the regular [0, width-1], [0, height-1] space
+	bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
+	int srcDelta = srcHorizontal ? bytes : pitchB;  // byte step between consecutive texels along the source edge
+	VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };
+
+	// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
+	bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
+	int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);  // negative step when writing in reverse
+	VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };
+
+	// Don't write in the corners
+	if(dstHorizontal)
+	{
+		dstOffset.x += reverse ? w : 1;  // x is -1 here, so this gives w - 1 (walking backwards) or 0
+	}
+	else
+	{
+		dstOffset.y += reverse ? h : 1;  // y is -1 here, so this gives h - 1 (walking backwards) or 0
+	}
+
+	const uint8_t* src = static_cast<const uint8_t*>(image->getTexelPointer(srcOffset, srcSubresourceLayers));
+	uint8_t *dst = static_cast<uint8_t*>(image->getTexelPointer(dstOffset, dstSubresourceLayers));
+	ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
+	ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));
+
+	for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)  // one texel per iteration, w texels in total
+	{
+		memcpy(dst, src, bytes);
+	}
+}
+
+}  // namespace sw

diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
index 0157e88..317fdcc 100644
--- a/src/Device/Blitter.hpp
+++ b/src/Device/Blitter.hpp

@@ -23,139 +23,141 @@
 #include <mutex>
 #include <cstring>
 
-namespace vk
-{
-	class Image;
-	class Buffer;
-}
+namespace vk {
 
-namespace sw
+class Image;
+class Buffer;
+
+}  // namespace vk
+
+namespace sw {
+
+class Blitter
 {
-	class Blitter
+	struct Options
 	{
-		struct Options
+		explicit Options() = default;
+		explicit Options(bool filter, bool allowSRGBConversion)
+			: writeMask(0xF), clearOperation(false), filter(filter), allowSRGBConversion(allowSRGBConversion), clampToEdge(false) {}
+		explicit Options(unsigned int writeMask)
+			: writeMask(writeMask), clearOperation(true), filter(false), allowSRGBConversion(true), clampToEdge(false) {}
+
+		union
 		{
-			explicit Options() = default;
-			explicit Options(bool filter, bool allowSRGBConversion)
-				: writeMask(0xF), clearOperation(false), filter(filter), allowSRGBConversion(allowSRGBConversion), clampToEdge(false) {}
-			explicit Options(unsigned int writeMask)
-				: writeMask(writeMask), clearOperation(true), filter(false), allowSRGBConversion(true), clampToEdge(false) {}
-
-			union
+			struct
 			{
-				struct
-				{
-					bool writeRed : 1;
-					bool writeGreen : 1;
-					bool writeBlue : 1;
-					bool writeAlpha : 1;
-				};
-
-				unsigned char writeMask;
+				bool writeRed : 1;
+				bool writeGreen : 1;
+				bool writeBlue : 1;
+				bool writeAlpha : 1;
 			};
 
-			bool clearOperation : 1;
-			bool filter : 1;
-			bool allowSRGBConversion : 1;
-			bool clampToEdge : 1;
+			unsigned char writeMask;
 		};
 
-		struct State : Memset<State>, Options
-		{
-			State() : Memset(this, 0) {}
-			State(const Options &options) : Memset(this, 0), Options(options) {}
-			State(vk::Format sourceFormat, vk::Format destFormat, int srcSamples, int destSamples, const Options &options) :
-				Memset(this, 0), Options(options), sourceFormat(sourceFormat), destFormat(destFormat), srcSamples(srcSamples), destSamples(destSamples) {}
-
-			bool operator==(const State &state) const
-			{
-				static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
-				return memcmp(this, &state, sizeof(State)) == 0;
-			}
-
-			vk::Format sourceFormat;
-			vk::Format destFormat;
-			int srcSamples = 0;
-			int destSamples = 0;
-		};
-
-		struct BlitData
-		{
-			void *source;
-			void *dest;
-			int sPitchB;
-			int dPitchB;
-			int sSliceB;
-			int dSliceB;
-
-			float x0;
-			float y0;
-			float w;
-			float h;
-
-			int y0d;
-			int y1d;
-			int x0d;
-			int x1d;
-
-			int sWidth;
-			int sHeight;
-		};
-
-		struct CubeBorderData
-		{
-			void *layers;
-			int pitchB;
-			uint32_t layerSize;
-			uint32_t dim;
-		};
-
-	public:
-		Blitter();
-		virtual ~Blitter();
-
-		void clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea = nullptr);
-
-		void blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter);
-		void blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch);
-		void blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch);
-
-		void updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers);
-
-	private:
-		enum Edge { TOP, BOTTOM, RIGHT, LEFT };
-
-		bool fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea);
-
-		Float4 readFloat4(Pointer<Byte> element, const State &state);
-		void write(Float4 &color, Pointer<Byte> element, const State &state);
-		Int4 readInt4(Pointer<Byte> element, const State &state);
-		void write(Int4 &color, Pointer<Byte> element, const State &state);
-		static void ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled = false);
-		static Int ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes);
-		static Float4 LinearToSRGB(Float4 &color);
-		static Float4 sRGBtoLinear(Float4 &color);
-
-		using BlitFunction = FunctionT<void(const BlitData*)>;
-		using BlitRoutineType = BlitFunction::RoutineType;
-		BlitRoutineType getBlitRoutine(const State &state);
-		BlitRoutineType generate(const State &state);
-
-		using CornerUpdateFunction = FunctionT<void(const CubeBorderData*)>;
-		using CornerUpdateRoutineType = CornerUpdateFunction::RoutineType;
-		CornerUpdateRoutineType getCornerUpdateRoutine(const State &state);
-		CornerUpdateRoutineType generateCornerUpdate(const State& state);
-		void computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state);
-
-		void copyCubeEdge(vk::Image* image,
-	                      const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
-	                      const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge);
-
-		std::mutex blitMutex;
-		RoutineCacheT<State, BlitFunction::CFunctionType> blitCache; // guarded by blitMutex
-		std::mutex cornerUpdateMutex;
-		RoutineCacheT<State, CornerUpdateFunction::CFunctionType> cornerUpdateCache; // guarded by cornerUpdateMutex
+		bool clearOperation : 1;
+		bool filter : 1;
+		bool allowSRGBConversion : 1;
+		bool clampToEdge : 1;
 	};
-}
+
+	struct State : Memset<State>, Options
+	{
+		State() : Memset(this, 0) {}
+		State(const Options &options) : Memset(this, 0), Options(options) {}
+		State(vk::Format sourceFormat, vk::Format destFormat, int srcSamples, int destSamples, const Options &options) :
+			Memset(this, 0), Options(options), sourceFormat(sourceFormat), destFormat(destFormat), srcSamples(srcSamples), destSamples(destSamples) {}
+
+		bool operator==(const State &state) const
+		{
+			static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
+			return memcmp(this, &state, sizeof(State)) == 0;
+		}
+
+		vk::Format sourceFormat;
+		vk::Format destFormat;
+		int srcSamples = 0;
+		int destSamples = 0;
+	};
+
+	struct BlitData
+	{
+		void *source;
+		void *dest;
+		int sPitchB;
+		int dPitchB;
+		int sSliceB;
+		int dSliceB;
+
+		float x0;
+		float y0;
+		float w;
+		float h;
+
+		int y0d;
+		int y1d;
+		int x0d;
+		int x1d;
+
+		int sWidth;
+		int sHeight;
+	};
+
+	struct CubeBorderData
+	{
+		void *layers;
+		int pitchB;
+		uint32_t layerSize;
+		uint32_t dim;
+	};
+
+public:
+	Blitter();
+	virtual ~Blitter();
+
+	void clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea = nullptr);
+
+	void blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter);
+	void blitToBuffer(const vk::Image *src, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *dst, int bufferRowPitch, int bufferSlicePitch);
+	void blitFromBuffer(const vk::Image *dst, VkImageSubresourceLayers subresource, VkOffset3D offset, VkExtent3D extent, uint8_t *src, int bufferRowPitch, int bufferSlicePitch);
+
+	void updateBorders(vk::Image* image, const VkImageSubresourceLayers& subresourceLayers);
+
+private:
+	enum Edge { TOP, BOTTOM, RIGHT, LEFT };
+
+	bool fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format& viewFormat, const VkImageSubresourceRange& subresourceRange, const VkRect2D* renderArea);
+
+	Float4 readFloat4(Pointer<Byte> element, const State &state);
+	void write(Float4 &color, Pointer<Byte> element, const State &state);
+	Int4 readInt4(Pointer<Byte> element, const State &state);
+	void write(Int4 &color, Pointer<Byte> element, const State &state);
+	static void ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled = false);
+	static Int ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes);
+	static Float4 LinearToSRGB(Float4 &color);
+	static Float4 sRGBtoLinear(Float4 &color);
+
+	using BlitFunction = FunctionT<void(const BlitData*)>;
+	using BlitRoutineType = BlitFunction::RoutineType;
+	BlitRoutineType getBlitRoutine(const State &state);
+	BlitRoutineType generate(const State &state);
+
+	using CornerUpdateFunction = FunctionT<void(const CubeBorderData*)>;
+	using CornerUpdateRoutineType = CornerUpdateFunction::RoutineType;
+	CornerUpdateRoutineType getCornerUpdateRoutine(const State &state);
+	CornerUpdateRoutineType generateCornerUpdate(const State& state);
+	void computeCubeCorner(Pointer<Byte>& layer, Int& x0, Int& x1, Int& y0, Int& y1, Int& pitchB, const State& state);
+
+	void copyCubeEdge(vk::Image* image,
+                      const VkImageSubresourceLayers& dstSubresourceLayers, Edge dstEdge,
+                      const VkImageSubresourceLayers& srcSubresourceLayers, Edge srcEdge);
+
+	std::mutex blitMutex;
+	RoutineCacheT<State, BlitFunction::CFunctionType> blitCache; // guarded by blitMutex
+	std::mutex cornerUpdateMutex;
+	RoutineCacheT<State, CornerUpdateFunction::CFunctionType> cornerUpdateCache; // guarded by cornerUpdateMutex
+};
+
+}  // namespace sw
 
 #endif   // sw_Blitter_hpp

diff --git a/src/Device/Clipper.cpp b/src/Device/Clipper.cpp
index 43fa72b..d36de8c 100644
--- a/src/Device/Clipper.cpp
+++ b/src/Device/Clipper.cpp

@@ -17,278 +17,280 @@
 #include "Polygon.hpp"
 #include "Renderer.hpp"
 
-namespace
+namespace {
+
+inline void clipEdge(sw::float4 &Vo, const sw::float4 &Vi, const sw::float4 &Vj, float di, float dj)
 {
-	inline void clipEdge(sw::float4 &Vo, const sw::float4 &Vi, const sw::float4 &Vj, float di, float dj)
-	{
-		float D = 1.0f / (dj - di);
+	float D = 1.0f / (dj - di);
 
-		Vo.x = (dj * Vi.x - di * Vj.x) * D;
-		Vo.y = (dj * Vi.y - di * Vj.y) * D;
-		Vo.z = (dj * Vi.z - di * Vj.z) * D;
-		Vo.w = (dj * Vi.w - di * Vj.w) * D;
-	}
-
-	void clipNear(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->z;
-			float dj = V[j]->z;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipFar(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w - V[i]->z;
-			float dj = V[j]->w - V[j]->z;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipLeft(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w + V[i]->x;
-			float dj = V[j]->w + V[j]->x;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipRight(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w - V[i]->x;
-			float dj = V[j]->w - V[j]->x;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipTop(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w - V[i]->y;
-			float dj = V[j]->w - V[j]->y;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
-
-	void clipBottom(sw::Polygon &polygon)
-	{
-		const sw::float4 **V = polygon.P[polygon.i];
-		const sw::float4 **T = polygon.P[polygon.i + 1];
-
-		int t = 0;
-
-		for(int i = 0; i < polygon.n; i++)
-		{
-			int j = i == polygon.n - 1 ? 0 : i + 1;
-
-			float di = V[i]->w + V[i]->y;
-			float dj = V[j]->w + V[j]->y;
-
-			if(di >= 0)
-			{
-				T[t++] = V[i];
-
-				if(dj < 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-			else
-			{
-				if(dj > 0)
-				{
-					clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
-					T[t++] = &polygon.B[polygon.b++];
-				}
-			}
-		}
-
-		polygon.n = t;
-		polygon.i += 1;
-	}
+	Vo.x = (dj * Vi.x - di * Vj.x) * D;
+	Vo.y = (dj * Vi.y - di * Vj.y) * D;
+	Vo.z = (dj * Vi.z - di * Vj.z) * D;
+	Vo.w = (dj * Vi.w - di * Vj.w) * D;
 }
 
-namespace sw
+void clipNear(sw::Polygon &polygon)
 {
-	unsigned int Clipper::ComputeClipFlags(const float4 &v)
-	{
-		return ((v.x > v.w)     ? CLIP_RIGHT  : 0) |
-		       ((v.y > v.w)     ? CLIP_TOP    : 0) |
-		       ((v.z > v.w)     ? CLIP_FAR    : 0) |
-		       ((v.x < -v.w)    ? CLIP_LEFT   : 0) |
-		       ((v.y < -v.w)    ? CLIP_BOTTOM : 0) |
-		       ((v.z < 0)       ? CLIP_NEAR   : 0) |
-		       Clipper::CLIP_FINITE;   // FIXME: xyz finite
-	}
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
 
-	bool Clipper::Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw)
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
 	{
-		if(clipFlagsOr & CLIP_FRUSTUM)
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->z;
+		float dj = V[j]->z;
+
+		if(di >= 0)
 		{
-			if(clipFlagsOr & CLIP_NEAR)   clipNear(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_FAR)    clipFar(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_LEFT)   clipLeft(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_RIGHT)  clipRight(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_TOP)    clipTop(polygon);
-			if(polygon.n >= 3) {
-			if(clipFlagsOr & CLIP_BOTTOM) clipBottom(polygon);
-			}}}}}
-		}
+			T[t++] = V[i];
 
-		return polygon.n >= 3;
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
 	}
+
+	polygon.n = t;
+	polygon.i += 1;
 }
+
+void clipFar(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w - V[i]->z;
+		float dj = V[j]->w - V[j]->z;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipLeft(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w + V[i]->x;
+		float dj = V[j]->w + V[j]->x;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipRight(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w - V[i]->x;
+		float dj = V[j]->w - V[j]->x;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipTop(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w - V[i]->y;
+		float dj = V[j]->w - V[j]->y;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+void clipBottom(sw::Polygon &polygon)
+{
+	const sw::float4 **V = polygon.P[polygon.i];
+	const sw::float4 **T = polygon.P[polygon.i + 1];
+
+	int t = 0;
+
+	for(int i = 0; i < polygon.n; i++)
+	{
+		int j = i == polygon.n - 1 ? 0 : i + 1;
+
+		float di = V[i]->w + V[i]->y;
+		float dj = V[j]->w + V[j]->y;
+
+		if(di >= 0)
+		{
+			T[t++] = V[i];
+
+			if(dj < 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[i], *V[j], di, dj);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+		else
+		{
+			if(dj > 0)
+			{
+				clipEdge(polygon.B[polygon.b], *V[j], *V[i], dj, di);
+				T[t++] = &polygon.B[polygon.b++];
+			}
+		}
+	}
+
+	polygon.n = t;
+	polygon.i += 1;
+}
+
+}  // anonymous namespace
+
+namespace sw {
+
+unsigned int Clipper::ComputeClipFlags(const float4 &v)
+{
+	return ((v.x > v.w)     ? CLIP_RIGHT  : 0) |
+	       ((v.y > v.w)     ? CLIP_TOP    : 0) |
+	       ((v.z > v.w)     ? CLIP_FAR    : 0) |
+	       ((v.x < -v.w)    ? CLIP_LEFT   : 0) |
+	       ((v.y < -v.w)    ? CLIP_BOTTOM : 0) |
+	       ((v.z < 0)       ? CLIP_NEAR   : 0) |
+	       Clipper::CLIP_FINITE;   // FIXME: xyz finite
+}
+
+bool Clipper::Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw)
+{
+	if(clipFlagsOr & CLIP_FRUSTUM)
+	{
+		if(clipFlagsOr & CLIP_NEAR)   clipNear(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_FAR)    clipFar(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_LEFT)   clipLeft(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_RIGHT)  clipRight(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_TOP)    clipTop(polygon);
+		if(polygon.n >= 3) {
+		if(clipFlagsOr & CLIP_BOTTOM) clipBottom(polygon);
+		}}}}}
+	}
+
+	return polygon.n >= 3;
+}
+
+}  // namespace sw

diff --git a/src/Device/Clipper.hpp b/src/Device/Clipper.hpp
index 0d111fd..4992a57 100644
--- a/src/Device/Clipper.hpp
+++ b/src/Device/Clipper.hpp

@@ -15,32 +15,33 @@
 #ifndef sw_Clipper_hpp
 #define sw_Clipper_hpp
 
-namespace sw
+namespace sw {
+
+struct DrawCall;
+struct Polygon;
+struct float4;
+
+struct Clipper
 {
-	struct DrawCall;
-	struct Polygon;
-	struct float4;
-
-	struct Clipper
+	enum ClipFlags
 	{
-		enum ClipFlags
-		{
-			// Indicates the vertex is outside the respective frustum plane
-			CLIP_RIGHT  = 1 << 0,
-			CLIP_TOP    = 1 << 1,
-			CLIP_FAR    = 1 << 2,
-			CLIP_LEFT   = 1 << 3,
-			CLIP_BOTTOM = 1 << 4,
-			CLIP_NEAR   = 1 << 5,
+		// Indicates the vertex is outside the respective frustum plane
+		CLIP_RIGHT  = 1 << 0,
+		CLIP_TOP    = 1 << 1,
+		CLIP_FAR    = 1 << 2,
+		CLIP_LEFT   = 1 << 3,
+		CLIP_BOTTOM = 1 << 4,
+		CLIP_NEAR   = 1 << 5,
 
-			CLIP_FRUSTUM = 0x003F,
+		CLIP_FRUSTUM = 0x003F,
 
-			CLIP_FINITE = 1 << 7,   // All position coordinates are finite
-		};
-
-		static unsigned int ComputeClipFlags(const float4 &v);
-		static bool Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw);
+		CLIP_FINITE = 1 << 7,   // All position coordinates are finite
 	};
-}
+
+	static unsigned int ComputeClipFlags(const float4 &v);
+	static bool Clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw);
+};
+
+}  // namespace sw
 
 #endif   // sw_Clipper_hpp

diff --git a/src/Device/Color.cpp b/src/Device/Color.cpp
index 9ad6767..d028028 100644
--- a/src/Device/Color.cpp
+++ b/src/Device/Color.cpp

@@ -14,6 +14,5 @@
 
 #include "Color.hpp"
 
-namespace sw
-{
-}
+namespace sw {
+}  // namespace sw

diff --git a/src/Device/Color.hpp b/src/Device/Color.hpp
index 0e6fc27..2b27e86 100644
--- a/src/Device/Color.hpp
+++ b/src/Device/Color.hpp

@@ -18,455 +18,456 @@
 #include "System/Types.hpp"
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+template<class T>
+struct Color
 {
-	template<class T>
-	struct Color
-	{
-		Color();
+	Color();
+
+	Color(const Color<byte> &c);
+	Color(const Color<short> &c);
+	Color(const Color<float> &c);
 	
-		Color(const Color<byte> &c);
-		Color(const Color<short> &c);
-		Color(const Color<float> &c);
-		
-		Color(int c);
-		Color(unsigned short c);
-		Color(unsigned long c);
-		Color(unsigned int c);
-		
-		Color(T r, T g, T b, T a = 1);
+	Color(int c);
+	Color(unsigned short c);
+	Color(unsigned long c);
+	Color(unsigned int c);
+	
+	Color(T r, T g, T b, T a = 1);
 
-		operator unsigned int() const;
+	operator unsigned int() const;
 
-		T &operator[](int i);
-		const T &operator[](int i) const;
+	T &operator[](int i);
+	const T &operator[](int i) const;
 
-		Color<T> operator+() const;
-		Color<T> operator-() const;
+	Color<T> operator+() const;
+	Color<T> operator-() const;
 
-		Color<T>& operator=(const Color<T>& c);
+	Color<T>& operator=(const Color<T>& c);
 
-		Color<T> &operator+=(const Color<T> &c);
-		Color<T> &operator*=(float l);
+	Color<T> &operator+=(const Color<T> &c);
+	Color<T> &operator*=(float l);
 
-		static Color<T> gradient(const Color<T> &c1, const Color<T>  &c2, float d);
-		static Color<T> shade(const Color<T> &c1, const Color<T>  &c2, float d);
+	static Color<T> gradient(const Color<T> &c1, const Color<T>  &c2, float d);
+	static Color<T> shade(const Color<T> &c1, const Color<T>  &c2, float d);
 
-		template<class S>
-		friend Color<S> operator+(const Color<S> &c1, const Color<S> &c2);
-		template<class S>
-		friend Color<S> operator-(const Color<S> &c1, const Color<S> &c2);
+	template<class S>
+	friend Color<S> operator+(const Color<S> &c1, const Color<S> &c2);
+	template<class S>
+	friend Color<S> operator-(const Color<S> &c1, const Color<S> &c2);
 
-		template<class S>
-		friend Color<S> operator*(float l, const Color<S> &c);
-		template<class S>
-		friend Color<S> operator*(const Color<S> &c1, const Color<S> &c2);
-		template<class S>
-		friend Color<S> operator/(const Color<S> &c, float l);
+	template<class S>
+	friend Color<S> operator*(float l, const Color<S> &c);
+	template<class S>
+	friend Color<S> operator*(const Color<S> &c1, const Color<S> &c2);
+	template<class S>
+	friend Color<S> operator/(const Color<S> &c, float l);
 
-		T r;
-		T g;
-		T b;
-		T a;
-	};
+	T r;
+	T g;
+	T b;
+	T a;
+};
 }
 
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+template<class T>
+inline Color<T>::Color()
 {
-	template<class T>
-	inline Color<T>::Color()
-	{
-	}
-
-	template<>
-	inline Color<byte>::Color(const Color<byte> &c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-	}
-
-	template<>
-	inline Color<byte>::Color(const Color<short> &c)
-	{
-		r = static_cast<byte>(clamp(c.r >> 4, 0, 255));
-		g = static_cast<byte>(clamp(c.g >> 4, 0, 255));
-		b = static_cast<byte>(clamp(c.b >> 4, 0, 255));
-		a = static_cast<byte>(clamp(c.a >> 4, 0, 255));
-	}
-
-	template<>
-	inline Color<byte>::Color(const Color<float> &c)
-	{
-		r = static_cast<byte>(ifloor(clamp(c.r * 256.0f, 0.0f, 255.0f)));
-		g = static_cast<byte>(ifloor(clamp(c.g * 256.0f, 0.0f, 255.0f)));
-		b = static_cast<byte>(ifloor(clamp(c.b * 256.0f, 0.0f, 255.0f)));
-		a = static_cast<byte>(ifloor(clamp(c.a * 256.0f, 0.0f, 255.0f)));
-	}
-
-	template<>
-	inline Color<short>::Color(const Color<short> &c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-	}
-
-	template<>
-	inline Color<short>::Color(const Color<byte> &c)
-	{
-		r = c.r << 4;
-		g = c.g << 4;
-		b = c.b << 4;
-		a = c.a << 4;
-	}
-
-	template<>
-	inline Color<float>::Color(const Color<float> &c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-	}
-
-	template<>
-	inline Color<short>::Color(const Color<float> &c)
-	{
-		r = static_cast<short>(iround(clamp(c.r * 4095.0f, -4096.0f, 4095.0f)));
-		g = static_cast<short>(iround(clamp(c.g * 4095.0f, -4096.0f, 4095.0f)));
-		b = static_cast<short>(iround(clamp(c.b * 4095.0f, -4096.0f, 4095.0f)));
-		a = static_cast<short>(iround(clamp(c.a * 4095.0f, -4096.0f, 4095.0f)));
-	}
-
-	template<>
-	inline Color<float>::Color(const Color<byte> &c)
-	{
-		r = c.r / 255.0f;
-		g = c.g / 255.0f;
-		b = c.b / 255.0f;
-		a = c.a / 255.0f;
-	}
-
-	template<>
-	inline Color<float>::Color(const Color<short> &c)
-	{
-		r = c.r / 4095.0f;
-		g = c.g / 4095.0f;
-		b = c.b / 4095.0f;
-		a = c.a / 4095.0f;
-	}
-
-	template<>
-	inline Color<float>::Color(unsigned short c)
-	{
-		r = (float)(c & 0xF800) / (float)0xF800;
-		g = (float)(c & 0x07E0) / (float)0x07E0;
-		b = (float)(c & 0x001F) / (float)0x001F;
-		a = 1;
-	}
-
-	template<>
-	inline Color<short>::Color(unsigned short c)
-	{
-		// 4.12 fixed-point format
-		r = ((c & 0xF800) >> 4) + ((c & 0xF800) >> 9) + ((c & 0xF800) >> 14);
-		g = ((c & 0x07E0) << 1) + ((c & 0x07E0) >> 5);
-		b = ((c & 0x001F) << 7) + ((c & 0x001F) << 2) + ((c & 0x001F) >> 3);
-		a = 0x1000;
-	}
-
-	template<>
-	inline Color<byte>::Color(unsigned short c)
-	{
-		r = (byte)(((c & 0xF800) >> 8) + ((c & 0xE000) >> 13));
-		g = (byte)(((c & 0x07E0) >> 3) + ((c & 0x0600) >> 9));
-		b = (byte)(((c & 0x001F) << 3) + ((c & 0x001C) >> 2));
-		a = 0xFF;
-	}
-
-	template<>
-	inline Color<float>::Color(int c)
-	{
-		const float d = 1.0f / 255.0f;
-
-		r = (float)((c & 0x00FF0000) >> 16) * d;
-		g = (float)((c & 0x0000FF00) >> 8) * d;
-		b = (float)((c & 0x000000FF) >> 0) * d;
-		a = (float)((c & 0xFF000000) >> 24) * d;
-	}
-
-	template<>
-	inline Color<short>::Color(int c)
-	{
-		// 4.12 fixed-point format
-		r = (short)((c & 0x00FF0000) >> 12);
-		g = (short)((c & 0x0000FF00) >> 4);
-		b = (short)((c & 0x000000FF) << 4);
-		a = (short)((c & 0xFF000000) >> 20);
-	}
-
-	template<>
-	inline Color<byte>::Color(int c)
-	{
-		r = (byte)((c & 0x00FF0000) >> 16);
-		g = (byte)((c & 0x0000FF00) >> 8);
-		b = (byte)((c & 0x000000FF) >> 0);
-		a = (byte)((c & 0xFF000000) >> 24);
-	}
-
-	template<>
-	inline Color<float>::Color(unsigned int c)
-	{
-		const float d = 1.0f / 255.0f;
-
-		r = (float)((c & 0x00FF0000) >> 16) * d;
-		g = (float)((c & 0x0000FF00) >> 8) * d;
-		b = (float)((c & 0x000000FF) >> 0) * d;
-		a = (float)((c & 0xFF000000) >> 24) * d;
-	}
-
-	template<>
-	inline Color<short>::Color(unsigned int c)
-	{
-		// 4.12 fixed-point format
-		r = (short)((c & 0x00FF0000) >> 12);
-		g = (short)((c & 0x0000FF00) >> 4);
-		b = (short)((c & 0x000000FF) << 4);
-		a = (short)((c & 0xFF000000) >> 20);
-	}
-
-	template<>
-	inline Color<byte>::Color(unsigned int c)
-	{
-		r = (byte)((c & 0x00FF0000) >> 16);
-		g = (byte)((c & 0x0000FF00) >> 8);
-		b = (byte)((c & 0x000000FF) >> 0);
-		a = (byte)((c & 0xFF000000) >> 24);
-	}
-
-	template<>
-	inline Color<float>::Color(unsigned long c)
-	{
-		const float d = 1.0f / 255.0f;
-
-		r = (float)((c & 0x00FF0000) >> 16) * d;
-		g = (float)((c & 0x0000FF00) >> 8) * d;
-		b = (float)((c & 0x000000FF) >> 0) * d;
-		a = (float)((c & 0xFF000000) >> 24) * d;
-	}
-
-	template<>
-	inline Color<short>::Color(unsigned long c)
-	{
-		// 4.12 fixed-point format
-		r = (short)((c & 0x00FF0000) >> 12);
-		g = (short)((c & 0x0000FF00) >> 4);
-		b = (short)((c & 0x000000FF) << 4);
-		a = (short)((c & 0xFF000000) >> 20);
-	}
-
-	template<>
-	inline Color<byte>::Color(unsigned long c)
-	{
-		r = (byte)((c & 0x00FF0000) >> 16);
-		g = (byte)((c & 0x0000FF00) >> 8);
-		b = (byte)((c & 0x000000FF) >> 0);
-		a = (byte)((c & 0xFF000000) >> 24);
-	}
-
-	template<class T>
-	inline Color<T>::Color(T r_, T g_, T b_, T a_)
-	{
-		r = r_;
-		g = g_;
-		b = b_;
-		a = a_;
-	}
-
-	template<>
-	inline Color<float>::operator unsigned int() const
-	{
-		return ((unsigned int)min(b * 255.0f, 255.0f) << 0) |
-		       ((unsigned int)min(g * 255.0f, 255.0f) << 8) |
-		       ((unsigned int)min(r * 255.0f, 255.0f) << 16) |
-		       ((unsigned int)min(a * 255.0f, 255.0f) << 24);
-	}
-
-	template<>
-	inline Color<short>::operator unsigned int() const
-	{
-		return ((unsigned int)min(b >> 4, 255) << 0) |
-		       ((unsigned int)min(g >> 4, 255) << 8) |
-		       ((unsigned int)min(r >> 4, 255) << 16) |
-		       ((unsigned int)min(a >> 4, 255) << 24);
-	}
-
-	template<>
-	inline Color<byte>::operator unsigned int() const
-	{
-		return (b << 0) +
-		       (g << 8) +
-		       (r << 16) +
-			   (a << 24);
-	}
-
-	template<class T>
-	inline T &Color<T>::operator[](int i)
-	{
-		return (&r)[i];
-	}
-
-	template<class T>
-	inline const T &Color<T>::operator[](int i) const
-	{
-		return (&r)[i];
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::operator+() const
-	{
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::operator-() const
-	{
-		return Color(-r, -g, -b, -a);
-	}
-
-	template<class T>
-	inline Color<T> &Color<T>::operator=(const Color& c)
-	{
-		r = c.r;
-		g = c.g;
-		b = c.b;
-		a = c.a;
-
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> &Color<T>::operator+=(const Color &c)
-	{
-		r += c.r;
-		g += c.g;
-		b += c.b;
-		a += c.a;
-
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> &Color<T>::operator*=(float l)
-	{
-		*this = l * *this;
-
-		return *this;
-	}
-
-	template<class T>
-	inline Color<T> operator+(const Color<T> &c1, const Color<T> &c2)
-	{
-		return Color<T>(c1.r + c2.r,
-		                c1.g + c2.g,
-		                c1.b + c2.b,
-		                c1.a + c2.a);	
-	}
-
-	template<class T>
-	inline Color<T> operator-(const Color<T> &c1, const Color<T> &c2)
-	{
-		return Color<T>(c1.r - c2.r,
-		                c1.g - c2.g,
-		                c1.b - c2.b,
-		                c1.a - c2.a);	
-	}
-
-	template<class T>
-	inline Color<T> operator*(float l, const Color<T> &c)
-	{
-		T r = (T)(l * c.r);
-		T g = (T)(l * c.g);
-		T b = (T)(l * c.b);
-		T a = (T)(l * c.a);
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> operator*(const Color<T> &c1, const Color<T> &c2)
-	{
-		T r = c1.r * c2.r;
-		T g = c1.g * c2.g;
-		T b = c1.b * c2.b;
-		T a = c1.a * c2.a;
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<>
-	inline Color<short> operator*(const Color<short> &c1, const Color<short> &c2)
-	{
-		short r = c1.r * c2.r >> 12;
-		short g = c1.g * c2.g >> 12;
-		short b = c1.b * c2.b >> 12;
-		short a = c1.a * c2.a >> 12;
-
-		return Color<short>(r, g, b, a);
-	}
-
-	template<>
-	inline Color<byte> operator*(const Color<byte> &c1, const Color<byte> &c2)
-	{
-		byte r = c1.r * c2.r >> 8;
-		byte g = c1.g * c2.g >> 8;
-		byte b = c1.b * c2.b >> 8;
-		byte a = c1.a * c2.a >> 8;
-
-		return Color<byte>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> operator/(const Color<T> &c, float l)
-	{
-		l = 1.0f / l; 
-
-		T r = (T)(l * c.r);
-		T g = (T)(l * c.g);
-		T b = (T)(l * c.b);
-		T a = (T)(l * c.a);
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::gradient(const Color<T> &c1, const Color<T> &c2, float d)
-	{
-		d = 1.0f / d; 
-
-		T r = (c2.r - c1.r) * d;
-		T g = (c2.g - c1.g) * d;
-		T b = (c2.b - c1.b) * d;
-		T a = (c2.a - c1.a) * d;
-
-		return Color<T>(r, g, b, a);
-	}
-
-	template<class T>
-	inline Color<T> Color<T>::shade(const Color<T> &c1, const Color<T>  &c2, float d)
-	{
-		T r = c1.r + (T)(d * (c2.r - c1.r));
-		T g = c1.g + (T)(d * (c2.g - c1.g));
-		T b = c1.b + (T)(d * (c2.b - c1.b));
-		T a = c1.a + (T)(d * (c2.a - c1.a));
-
-		return Color<T>(r, g, b, a);
-	}
 }
 
+template<>
+inline Color<byte>::Color(const Color<byte> &c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+}
+
+template<>
+inline Color<byte>::Color(const Color<short> &c)
+{
+	r = static_cast<byte>(clamp(c.r >> 4, 0, 255));
+	g = static_cast<byte>(clamp(c.g >> 4, 0, 255));
+	b = static_cast<byte>(clamp(c.b >> 4, 0, 255));
+	a = static_cast<byte>(clamp(c.a >> 4, 0, 255));
+}
+
+template<>
+inline Color<byte>::Color(const Color<float> &c)
+{
+	r = static_cast<byte>(ifloor(clamp(c.r * 256.0f, 0.0f, 255.0f)));
+	g = static_cast<byte>(ifloor(clamp(c.g * 256.0f, 0.0f, 255.0f)));
+	b = static_cast<byte>(ifloor(clamp(c.b * 256.0f, 0.0f, 255.0f)));
+	a = static_cast<byte>(ifloor(clamp(c.a * 256.0f, 0.0f, 255.0f)));
+}
+
+template<>
+inline Color<short>::Color(const Color<short> &c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+}
+
+template<>
+inline Color<short>::Color(const Color<byte> &c)
+{
+	r = c.r << 4;
+	g = c.g << 4;
+	b = c.b << 4;
+	a = c.a << 4;
+}
+
+template<>
+inline Color<float>::Color(const Color<float> &c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+}
+
+template<>
+inline Color<short>::Color(const Color<float> &c)
+{
+	r = static_cast<short>(iround(clamp(c.r * 4095.0f, -4096.0f, 4095.0f)));
+	g = static_cast<short>(iround(clamp(c.g * 4095.0f, -4096.0f, 4095.0f)));
+	b = static_cast<short>(iround(clamp(c.b * 4095.0f, -4096.0f, 4095.0f)));
+	a = static_cast<short>(iround(clamp(c.a * 4095.0f, -4096.0f, 4095.0f)));
+}
+
+template<>
+inline Color<float>::Color(const Color<byte> &c)
+{
+	r = c.r / 255.0f;
+	g = c.g / 255.0f;
+	b = c.b / 255.0f;
+	a = c.a / 255.0f;
+}
+
+template<>
+inline Color<float>::Color(const Color<short> &c)
+{
+	r = c.r / 4095.0f;
+	g = c.g / 4095.0f;
+	b = c.b / 4095.0f;
+	a = c.a / 4095.0f;
+}
+
+template<>
+inline Color<float>::Color(unsigned short c)
+{
+	r = (float)(c & 0xF800) / (float)0xF800;
+	g = (float)(c & 0x07E0) / (float)0x07E0;
+	b = (float)(c & 0x001F) / (float)0x001F;
+	a = 1;
+}
+
+template<>
+inline Color<short>::Color(unsigned short c)
+{
+	// 4.12 fixed-point format
+	r = ((c & 0xF800) >> 4) + ((c & 0xF800) >> 9) + ((c & 0xF800) >> 14);
+	g = ((c & 0x07E0) << 1) + ((c & 0x07E0) >> 5);
+	b = ((c & 0x001F) << 7) + ((c & 0x001F) << 2) + ((c & 0x001F) >> 3);
+	a = 0x1000;
+}
+
+template<>
+inline Color<byte>::Color(unsigned short c)
+{
+	r = (byte)(((c & 0xF800) >> 8) + ((c & 0xE000) >> 13));
+	g = (byte)(((c & 0x07E0) >> 3) + ((c & 0x0600) >> 9));
+	b = (byte)(((c & 0x001F) << 3) + ((c & 0x001C) >> 2));
+	a = 0xFF;
+}
+
+template<>
+inline Color<float>::Color(int c)
+{
+	const float d = 1.0f / 255.0f;
+
+	r = (float)((c & 0x00FF0000) >> 16) * d;
+	g = (float)((c & 0x0000FF00) >> 8) * d;
+	b = (float)((c & 0x000000FF) >> 0) * d;
+	a = (float)((c & 0xFF000000) >> 24) * d;
+}
+
+template<>
+inline Color<short>::Color(int c)
+{
+	// 4.12 fixed-point format
+	r = (short)((c & 0x00FF0000) >> 12);
+	g = (short)((c & 0x0000FF00) >> 4);
+	b = (short)((c & 0x000000FF) << 4);
+	a = (short)((c & 0xFF000000) >> 20);
+}
+
+template<>
+inline Color<byte>::Color(int c)
+{
+	r = (byte)((c & 0x00FF0000) >> 16);
+	g = (byte)((c & 0x0000FF00) >> 8);
+	b = (byte)((c & 0x000000FF) >> 0);
+	a = (byte)((c & 0xFF000000) >> 24);
+}
+
+template<>
+inline Color<float>::Color(unsigned int c)
+{
+	const float d = 1.0f / 255.0f;
+
+	r = (float)((c & 0x00FF0000) >> 16) * d;
+	g = (float)((c & 0x0000FF00) >> 8) * d;
+	b = (float)((c & 0x000000FF) >> 0) * d;
+	a = (float)((c & 0xFF000000) >> 24) * d;
+}
+
+template<>
+inline Color<short>::Color(unsigned int c)
+{
+	// 4.12 fixed-point format
+	r = (short)((c & 0x00FF0000) >> 12);
+	g = (short)((c & 0x0000FF00) >> 4);
+	b = (short)((c & 0x000000FF) << 4);
+	a = (short)((c & 0xFF000000) >> 20);
+}
+
+template<>
+inline Color<byte>::Color(unsigned int c)
+{
+	r = (byte)((c & 0x00FF0000) >> 16);
+	g = (byte)((c & 0x0000FF00) >> 8);
+	b = (byte)((c & 0x000000FF) >> 0);
+	a = (byte)((c & 0xFF000000) >> 24);
+}
+
+template<>
+inline Color<float>::Color(unsigned long c)
+{
+	const float d = 1.0f / 255.0f;
+
+	r = (float)((c & 0x00FF0000) >> 16) * d;
+	g = (float)((c & 0x0000FF00) >> 8) * d;
+	b = (float)((c & 0x000000FF) >> 0) * d;
+	a = (float)((c & 0xFF000000) >> 24) * d;
+}
+
+template<>
+inline Color<short>::Color(unsigned long c)
+{
+	// 4.12 fixed-point format
+	r = (short)((c & 0x00FF0000) >> 12);
+	g = (short)((c & 0x0000FF00) >> 4);
+	b = (short)((c & 0x000000FF) << 4);
+	a = (short)((c & 0xFF000000) >> 20);
+}
+
+template<>
+inline Color<byte>::Color(unsigned long c)
+{
+	r = (byte)((c & 0x00FF0000) >> 16);
+	g = (byte)((c & 0x0000FF00) >> 8);
+	b = (byte)((c & 0x000000FF) >> 0);
+	a = (byte)((c & 0xFF000000) >> 24);
+}
+
+template<class T>
+inline Color<T>::Color(T r_, T g_, T b_, T a_)
+{
+	r = r_;
+	g = g_;
+	b = b_;
+	a = a_;
+}
+
+template<>
+inline Color<float>::operator unsigned int() const
+{
+	return ((unsigned int)min(b * 255.0f, 255.0f) << 0) |
+	       ((unsigned int)min(g * 255.0f, 255.0f) << 8) |
+	       ((unsigned int)min(r * 255.0f, 255.0f) << 16) |
+	       ((unsigned int)min(a * 255.0f, 255.0f) << 24);
+}
+
+template<>
+inline Color<short>::operator unsigned int() const
+{
+	return ((unsigned int)min(b >> 4, 255) << 0) |
+	       ((unsigned int)min(g >> 4, 255) << 8) |
+	       ((unsigned int)min(r >> 4, 255) << 16) |
+	       ((unsigned int)min(a >> 4, 255) << 24);
+}
+
+template<>
+inline Color<byte>::operator unsigned int() const
+{
+	return (b << 0) +
+	       (g << 8) +
+	       (r << 16) +
+		   (a << 24);
+}
+
+template<class T>
+inline T &Color<T>::operator[](int i)
+{
+	return (&r)[i];
+}
+
+template<class T>
+inline const T &Color<T>::operator[](int i) const
+{
+	return (&r)[i];
+}
+
+template<class T>
+inline Color<T> Color<T>::operator+() const
+{
+	return *this;
+}
+
+template<class T>
+inline Color<T> Color<T>::operator-() const
+{
+	return Color(-r, -g, -b, -a);
+}
+
+template<class T>
+inline Color<T> &Color<T>::operator=(const Color& c)
+{
+	r = c.r;
+	g = c.g;
+	b = c.b;
+	a = c.a;
+
+	return *this;
+}
+
+template<class T>
+inline Color<T> &Color<T>::operator+=(const Color &c)
+{
+	r += c.r;
+	g += c.g;
+	b += c.b;
+	a += c.a;
+
+	return *this;
+}
+
+template<class T>
+inline Color<T> &Color<T>::operator*=(float l)
+{
+	*this = l * *this;
+
+	return *this;
+}
+
+template<class T>
+inline Color<T> operator+(const Color<T> &c1, const Color<T> &c2)
+{
+	return Color<T>(c1.r + c2.r,
+	                c1.g + c2.g,
+	                c1.b + c2.b,
+	                c1.a + c2.a);	
+}
+
+template<class T>
+inline Color<T> operator-(const Color<T> &c1, const Color<T> &c2)
+{
+	return Color<T>(c1.r - c2.r,
+	                c1.g - c2.g,
+	                c1.b - c2.b,
+	                c1.a - c2.a);	
+}
+
+template<class T>
+inline Color<T> operator*(float l, const Color<T> &c)
+{
+	T r = (T)(l * c.r);
+	T g = (T)(l * c.g);
+	T b = (T)(l * c.b);
+	T a = (T)(l * c.a);
+
+	return Color<T>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> operator*(const Color<T> &c1, const Color<T> &c2)
+{
+	T r = c1.r * c2.r;
+	T g = c1.g * c2.g;
+	T b = c1.b * c2.b;
+	T a = c1.a * c2.a;
+
+	return Color<T>(r, g, b, a);
+}
+
+template<>
+inline Color<short> operator*(const Color<short> &c1, const Color<short> &c2)
+{
+	short r = c1.r * c2.r >> 12;
+	short g = c1.g * c2.g >> 12;
+	short b = c1.b * c2.b >> 12;
+	short a = c1.a * c2.a >> 12;
+
+	return Color<short>(r, g, b, a);
+}
+
+template<>
+inline Color<byte> operator*(const Color<byte> &c1, const Color<byte> &c2)
+{
+	byte r = c1.r * c2.r >> 8;
+	byte g = c1.g * c2.g >> 8;
+	byte b = c1.b * c2.b >> 8;
+	byte a = c1.a * c2.a >> 8;
+
+	return Color<byte>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> operator/(const Color<T> &c, float l)
+{
+	l = 1.0f / l; 
+
+	T r = (T)(l * c.r);
+	T g = (T)(l * c.g);
+	T b = (T)(l * c.b);
+	T a = (T)(l * c.a);
+
+	return Color<T>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> Color<T>::gradient(const Color<T> &c1, const Color<T> &c2, float d)
+{
+	d = 1.0f / d; 
+
+	T r = (c2.r - c1.r) * d;
+	T g = (c2.g - c1.g) * d;
+	T b = (c2.b - c1.b) * d;
+	T a = (c2.a - c1.a) * d;
+
+	return Color<T>(r, g, b, a);
+}
+
+template<class T>
+inline Color<T> Color<T>::shade(const Color<T> &c1, const Color<T>  &c2, float d)
+{
+	T r = c1.r + (T)(d * (c2.r - c1.r));
+	T g = c1.g + (T)(d * (c2.g - c1.g));
+	T b = c1.b + (T)(d * (c2.b - c1.b));
+	T a = c1.a + (T)(d * (c2.a - c1.a));
+
+	return Color<T>(r, g, b, a);
+}
+
+}  // namespace sw
+
 #endif   // sw_Color_hpp

diff --git a/src/Device/Config.cpp b/src/Device/Config.cpp
index 6eb61ab..5a2de75 100644
--- a/src/Device/Config.cpp
+++ b/src/Device/Config.cpp

@@ -16,37 +16,38 @@
 
 #include "System/Timer.hpp"
 
-namespace sw
+namespace sw {
+
+Profiler profiler;
+
+Profiler::Profiler()
 {
-	Profiler profiler;
+	reset();
+}
 
-	Profiler::Profiler()
-	{
-		reset();
-	}
+// Zeroes both frame counters and the FPS value.
+void Profiler::reset()
+{
+	framesSec = framesTotal = 0;
+	FPS = 0;
+}
 
-	void Profiler::reset()
+void Profiler::nextFrame()
+{
+	static double fpsTime = sw::Timer::seconds();
+
+	double time = sw::Timer::seconds();
+	double delta = time - fpsTime;
+	framesSec++;
+
+	if(delta > 1.0)
 	{
+		FPS = framesSec / delta;
+
+		fpsTime = time;
+		framesTotal += framesSec;
 		framesSec = 0;
-		framesTotal = 0;
-		FPS = 0;
 	}
+}
 
-	void Profiler::nextFrame()
-	{
-		static double fpsTime = sw::Timer::seconds();
-
-		double time = sw::Timer::seconds();
-		double delta = time - fpsTime;
-		framesSec++;
-
-		if(delta > 1.0)
-		{
-			FPS = framesSec / delta;
-
-			fpsTime = time;
-			framesTotal += framesSec;
-			framesSec = 0;
-		}
-	}
-}
\ No newline at end of file
+}  // namespace sw
\ No newline at end of file

diff --git a/src/Device/Config.hpp b/src/Device/Config.hpp
index e1e0235..7584f07 100644
--- a/src/Device/Config.hpp
+++ b/src/Device/Config.hpp

@@ -17,49 +17,50 @@
 
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+enum
 {
-	enum
-	{
-		PERF_PIXEL,
-		PERF_PIPE,
-		PERF_INTERP,
-		PERF_SHADER,
-		PERF_TEX,
-		PERF_ROP,
+	PERF_PIXEL,
+	PERF_PIPE,
+	PERF_INTERP,
+	PERF_SHADER,
+	PERF_TEX,
+	PERF_ROP,
 
-		PERF_TIMERS
-	};
+	PERF_TIMERS
+};
 
-	struct Profiler
-	{
-		Profiler();
+struct Profiler
+{
+	Profiler();
 
-		void reset();
-		void nextFrame();
+	void reset();
+	void nextFrame();
 
-		int framesSec;
-		int framesTotal;
-		double FPS;
-	};
+	int framesSec;
+	int framesTotal;
+	double FPS;
+};
 
-	extern Profiler profiler;
+extern Profiler profiler;
 
-	enum
-	{
-		OUTLINE_RESOLUTION = 8192,   // Maximum vertical resolution of the render target
-		MIPMAP_LEVELS = 14,
-		MAX_UNIFORM_BLOCK_SIZE = 16384,
-		MAX_CLIP_DISTANCES = 8,
-		MAX_CULL_DISTANCES = 8,
-		MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
-		MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 64,
-		MIN_TEXEL_OFFSET = -8,
-		MAX_TEXEL_OFFSET = 7,
-		MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
-		RENDERTARGETS = 8,
-		MAX_INTERFACE_COMPONENTS = 32 * 4,  // Must be multiple of 4 for 16-byte alignment.
-	};
-}
+enum
+{
+	OUTLINE_RESOLUTION = 8192,   // Maximum vertical resolution of the render target
+	MIPMAP_LEVELS = 14,   // 8192 (2^13) down to 1 is 14 levels - presumably tied to OUTLINE_RESOLUTION; confirm
+	MAX_UNIFORM_BLOCK_SIZE = 16384,   // presumably in bytes - TODO confirm
+	MAX_CLIP_DISTANCES = 8,
+	MAX_CULL_DISTANCES = 8,
+	MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
+	MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 64,
+	MIN_TEXEL_OFFSET = -8,
+	MAX_TEXEL_OFFSET = 7,
+	MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
+	RENDERTARGETS = 8,   // Bound for the per-render-target index asserts and loops in Context
+	MAX_INTERFACE_COMPONENTS = 32 * 4,  // Must be multiple of 4 for 16-byte alignment.
+};
+
+}  // namespace sw
 
 #endif   // sw_Config_hpp

diff --git a/src/Device/Context.cpp b/src/Device/Context.cpp
index 49505c9..e41ce74 100644
--- a/src/Device/Context.cpp
+++ b/src/Device/Context.cpp

@@ -22,552 +22,553 @@
 
 #include <string.h>
 
-namespace sw
+namespace sw {
+
+Context::Context()
 {
-	Context::Context()
+	init();
+}
+
+bool Context::isDrawPoint(bool polygonModeAware) const
+{
+	switch(topology)
 	{
-		init();
-	}
-
-	bool Context::isDrawPoint(bool polygonModeAware) const
-	{
-		switch(topology)
-		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-			return true;
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-			return false;
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-			return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_POINT) : false;
-		default:
-			UNIMPLEMENTED("topology %d", int(topology));
-		}
-		return false;
-	}
-
-	bool Context::isDrawLine(bool polygonModeAware) const
-	{
-		switch(topology)
-		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-			return false;
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-			return true;
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-			return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_LINE) : false;
-		default:
-			UNIMPLEMENTED("topology %d", int(topology));
-		}
-		return false;
-	}
-
-	bool Context::isDrawTriangle(bool polygonModeAware) const
-	{
-		switch(topology)
-		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-			return false;
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-			return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_FILL) : true;
-		default:
-			UNIMPLEMENTED("topology %d", int(topology));
-		}
-		return false;
-	}
-
-	void Context::init()
-	{
-		for(int i = 0; i < RENDERTARGETS; ++i)
-		{
-			renderTarget[i] = nullptr;
-		}
-
-		depthBuffer = nullptr;
-		stencilBuffer = nullptr;
-
-		stencilEnable = false;
-		frontStencil = {};
-		backStencil = {};
-
-		robustBufferAccess = false;
-
-		rasterizerDiscard = false;
-
-		depthCompareMode = VK_COMPARE_OP_LESS;
-		depthBoundsTestEnable = false;
-		depthBufferEnable = false;
-		depthWriteEnable = false;
-
-		cullMode = VK_CULL_MODE_FRONT_BIT;
-		frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE;
-		provokingVertexMode = VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
-		lineRasterizationMode = VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
-
-		depthBias = 0.0f;
-		slopeDepthBias = 0.0f;
-
-		for(int i = 0; i < RENDERTARGETS; i++)
-		{
-			colorWriteMask[i] = 0x0000000F;
-		}
-
-		pipelineLayout = nullptr;
-
-		pixelShader = nullptr;
-		vertexShader = nullptr;
-
-		occlusionEnabled = false;
-
-		lineWidth = 1.0f;
-
-		sampleMask = 0xFFFFFFFF;
-		alphaToCoverage = false;
-	}
-
-	bool Context::depthWriteActive() const
-	{
-		if(!depthBufferActive()) return false;
-
-		return depthWriteEnable;
-	}
-
-	bool Context::depthBufferActive() const
-	{
-		return depthBuffer && depthBufferEnable;
-	}
-
-	bool Context::stencilActive() const
-	{
-		return stencilBuffer && stencilEnable;
-	}
-
-	void Context::setBlendState(int index, BlendState state)
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		blendState[index] = state;
-	}
-
-	BlendState Context::getBlendState(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		BlendState activeBlendState;
-		activeBlendState.alphaBlendEnable = alphaBlendActive(index);
-		activeBlendState.sourceBlendFactor = sourceBlendFactor(index);
-		activeBlendState.destBlendFactor = destBlendFactor(index);
-		activeBlendState.blendOperation = blendOperation(index);
-		activeBlendState.sourceBlendFactorAlpha = sourceBlendFactorAlpha(index);
-		activeBlendState.destBlendFactorAlpha = destBlendFactorAlpha(index);
-		activeBlendState.blendOperationAlpha = blendOperationAlpha(index);
-		return activeBlendState;
-	}
-
-	bool Context::alphaBlendActive(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!blendState[index].alphaBlendEnable)
-		{
-			return false;
-		}
-
-		if(!colorUsed())
-		{
-			return false;
-		}
-
-		bool colorBlend = !(blendOperation(index) == VK_BLEND_OP_SRC_EXT && sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE);
-		bool alphaBlend = !(blendOperationAlpha(index) == VK_BLEND_OP_SRC_EXT && sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE);
-
-		return colorBlend || alphaBlend;
-	}
-
-	VkBlendFactor Context::sourceBlendFactor(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!blendState[index].alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
-
-		switch(blendState[index].blendOperation)
-		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].sourceBlendFactor;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].sourceBlendFactor;
-	}
-
-	VkBlendFactor Context::destBlendFactor(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!blendState[index].alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
-
-		switch(blendState[index].blendOperation)
-		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].destBlendFactor;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].destBlendFactor;
-	}
-
-	bool Context::allTargetsColorClamp() const
-	{
-		// TODO: remove all of this and support VkPhysicalDeviceFeatures::independentBlend instead
-		for (int i = 0; i < RENDERTARGETS; i++)
-		{
-			if (renderTarget[i] && renderTarget[i]->getFormat().isFloatFormat())
-			{
-				return false;
-			}
-		}
-
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
 		return true;
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+		return false;
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+		return polygonModeAware ? (polygonMode == VK_POLYGON_MODE_POINT) : false;
+	default:
+		UNIMPLEMENTED("topology %d", int(topology));
+	}
+	return false;
+}
+
+// Returns true for line topologies. Triangle topologies count as lines only when
+// polygonModeAware is set and polygonMode is VK_POLYGON_MODE_LINE.
+bool Context::isDrawLine(bool polygonModeAware) const
+{
+	switch(topology)
+	{
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+		return true;
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+		return polygonModeAware && (polygonMode == VK_POLYGON_MODE_LINE);
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: break;
+	default: UNIMPLEMENTED("topology %d", int(topology));
+	}
+	return false;
+}
+
+// Returns true for triangle topologies, unless polygonModeAware is set and
+// polygonMode is something other than VK_POLYGON_MODE_FILL.
+bool Context::isDrawTriangle(bool polygonModeAware) const
+{
+	switch(topology)
+	{
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+		return !polygonModeAware || (polygonMode == VK_POLYGON_MODE_FILL);
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: break;
+	default: UNIMPLEMENTED("topology %d", int(topology));
+	}
+	return false;
+}
+
+void Context::init()
+{
+	for(int i = 0; i < RENDERTARGETS; ++i)
+	{
+		renderTarget[i] = nullptr;
 	}
 
-	VkBlendOp Context::blendOperation(int index) const
+	depthBuffer = nullptr;
+	stencilBuffer = nullptr;
+
+	stencilEnable = false;
+	frontStencil = {};
+	backStencil = {};
+
+	robustBufferAccess = false;
+
+	rasterizerDiscard = false;
+
+	depthCompareMode = VK_COMPARE_OP_LESS;
+	depthBoundsTestEnable = false;
+	depthBufferEnable = false;
+	depthWriteEnable = false;
+
+	cullMode = VK_CULL_MODE_FRONT_BIT;
+	frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE;
+	provokingVertexMode = VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
+	lineRasterizationMode = VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
+
+	depthBias = 0.0f;
+	slopeDepthBias = 0.0f;
+
+	for(int i = 0; i < RENDERTARGETS; i++)
 	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
+		colorWriteMask[i] = 0x0000000F;
+	}
 
-		if(!blendState[index].alphaBlendEnable) return VK_BLEND_OP_SRC_EXT;
+	pipelineLayout = nullptr;
 
-		switch(blendState[index].blendOperation)
+	pixelShader = nullptr;
+	vertexShader = nullptr;
+
+	occlusionEnabled = false;
+
+	lineWidth = 1.0f;
+
+	sampleMask = 0xFFFFFFFF;
+	alphaToCoverage = false;
+}
+
+// Depth writes require an active depth buffer as well as the write-enable flag.
+bool Context::depthWriteActive() const
+{
+	// depthBufferActive() is tested first, matching the original early-out order.
+	return depthBufferActive() && depthWriteEnable;
+}
+
+bool Context::depthBufferActive() const
+{
+	return depthBufferEnable && (depthBuffer != nullptr);   // enabled and depthBuffer is non-null
+}
+
+bool Context::stencilActive() const
+{
+	return stencilEnable && (stencilBuffer != nullptr);   // enabled and stencilBuffer is non-null
+}
+
+// Stores `state` as the blend state of render target `index`; index must be in [0, RENDERTARGETS).
+void Context::setBlendState(int index, BlendState state)
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+	blendState[index] = state;
+}
+
+BlendState Context::getBlendState(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+	// Every field is taken from the matching accessor, not copied from blendState[index].
+	BlendState effective;
+	effective.alphaBlendEnable = alphaBlendActive(index);
+	effective.sourceBlendFactor = sourceBlendFactor(index);
+	effective.destBlendFactor = destBlendFactor(index);
+	effective.blendOperation = blendOperation(index);
+	effective.sourceBlendFactorAlpha = sourceBlendFactorAlpha(index);
+	effective.destBlendFactorAlpha = destBlendFactorAlpha(index);
+	effective.blendOperationAlpha = blendOperationAlpha(index);
+	return effective;
+}
+
+// Blending counts as active for `index` only if it is enabled, color output is used, and
+// the color or alpha equation is anything other than "SRC operation with a factor of ONE".
+bool Context::alphaBlendActive(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(!blendState[index].alphaBlendEnable || !colorUsed())
+	{
+		return false;
+	}
+
+	// Both halves are always evaluated, as in the original two-variable form.
+	const bool colorPassThrough = blendOperation(index) == VK_BLEND_OP_SRC_EXT &&
+	                              sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE;
+	const bool alphaPassThrough = blendOperationAlpha(index) == VK_BLEND_OP_SRC_EXT &&
+	                              sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE;
+
+	return !(colorPassThrough && alphaPassThrough);
+}
+
+// Source color factor for `index`: ONE when blending is disabled or the op is MIN/MAX, else the stored factor.
+VkBlendFactor Context::sourceBlendFactor(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	const auto &state = blendState[index];
+	if(!state.alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
+
+	switch(state.blendOperation)
+	{
+	case VK_BLEND_OP_MIN:
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		break;
+	default:
+		ASSERT(false);
+	}
+	return state.sourceBlendFactor;
+}
+
+// Destination color factor for `index`: ONE when blending is disabled or the op is MIN/MAX, else the stored factor.
+VkBlendFactor Context::destBlendFactor(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	const auto &state = blendState[index];
+	if(!state.alphaBlendEnable) return VK_BLEND_FACTOR_ONE;
+
+	switch(state.blendOperation)
+	{
+	case VK_BLEND_OP_MIN:
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		break;
+	default:
+		ASSERT(false);
+	}
+	return state.destBlendFactor;
+}
+
+bool Context::allTargetsColorClamp() const
+{
+	// TODO: remove all of this and support VkPhysicalDeviceFeatures::independentBlend instead
+	for (int i = 0; i < RENDERTARGETS; i++)
+	{
+		if (renderTarget[i] && renderTarget[i]->getFormat().isFloatFormat())
 		{
-		case VK_BLEND_OP_ADD:
-			if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
-			}
-			else
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
-			}
-		case VK_BLEND_OP_SUBTRACT:
-			if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-			{
-				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-			}
-			else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-			else
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-			else
-			{
-				if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_OP_MIN;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_OP_MAX;
-		default:
-			ASSERT(false);
+			return false;
 		}
-
-		return blendState[index].blendOperation;
 	}
 
-	VkBlendFactor Context::sourceBlendFactorAlpha(int index) const
+	return true;
+}
+
+VkBlendOp Context::blendOperation(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(!blendState[index].alphaBlendEnable) return VK_BLEND_OP_SRC_EXT;
+
+	switch(blendState[index].blendOperation)
 	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		switch (blendState[index].blendOperationAlpha)
+	case VK_BLEND_OP_ADD:
+		if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
 		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].sourceBlendFactorAlpha;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].sourceBlendFactorAlpha;
-	}
-
-	VkBlendFactor Context::destBlendFactorAlpha(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		switch (blendState[index].blendOperationAlpha)
-		{
-		case VK_BLEND_OP_ADD:
-		case VK_BLEND_OP_SUBTRACT:
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			return blendState[index].destBlendFactorAlpha;
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_FACTOR_ONE;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_FACTOR_ONE;
-		default:
-			ASSERT(false);
-		}
-
-		return blendState[index].destBlendFactorAlpha;
-	}
-
-	VkBlendOp Context::blendOperationAlpha(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		switch (blendState[index].blendOperationAlpha)
-		{
-		case VK_BLEND_OP_ADD:
-			if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
 			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
+				return VK_BLEND_OP_ZERO_EXT;
 			}
 			else
 			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_ADD;
-				}
+				return VK_BLEND_OP_DST_EXT;
 			}
-		case VK_BLEND_OP_SUBTRACT:
-			if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-			{
-				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-			}
-			else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-			else
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_SRC_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_REVERSE_SUBTRACT:
-			if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
-				{
-					return VK_BLEND_OP_ZERO_EXT;
-				}
-				else
-				{
-					return VK_BLEND_OP_DST_EXT;
-				}
-			}
-			else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-			else
-			{
-				if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
-				{
-					return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
-				}
-				else
-				{
-					return VK_BLEND_OP_REVERSE_SUBTRACT;
-				}
-			}
-		case VK_BLEND_OP_MIN:
-			return VK_BLEND_OP_MIN;
-		case VK_BLEND_OP_MAX:
-			return VK_BLEND_OP_MAX;
-		default:
-			ASSERT(false);
 		}
-
-		return blendState[index].blendOperationAlpha;
-	}
-
-	VkFormat Context::renderTargetInternalFormat(int index) const
-	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(renderTarget[index])
+		else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
 		{
-			return renderTarget[index]->getFormat();
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
+			}
 		}
 		else
 		{
-			return VK_FORMAT_UNDEFINED;
-		}
-	}
-
-	bool Context::colorWriteActive() const
-	{
-		for (int i = 0; i < RENDERTARGETS; i++)
-		{
-			if (colorWriteActive(i))
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
 			{
-				return true;
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
 			}
 		}
-
-		return false;
+	case VK_BLEND_OP_SUBTRACT:
+		if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+		{
+			return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+		}
+		else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+		else
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_ZERO_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_DST_EXT;
+			}
+		}
+		else if(sourceBlendFactor(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+		else
+		{
+			if(destBlendFactor(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_OP_MIN;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_OP_MAX;
+	default:
+		ASSERT(false);
 	}
 
-	int Context::colorWriteActive(int index) const
+	return blendState[index].blendOperation;
+}
+
+VkBlendFactor Context::sourceBlendFactorAlpha(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	switch (blendState[index].blendOperationAlpha)
 	{
-		ASSERT((index >= 0) && (index < RENDERTARGETS));
-
-		if(!renderTarget[index] || renderTarget[index]->getFormat() == VK_FORMAT_UNDEFINED)
-		{
-			return 0;
-		}
-
-		if(blendOperation(index) == VK_BLEND_OP_DST_EXT && destBlendFactor(index) == VK_BLEND_FACTOR_ONE &&
-		   (blendOperationAlpha(index) == VK_BLEND_OP_DST_EXT && destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE))
-		{
-			return 0;
-		}
-
-		return colorWriteMask[index];
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		return blendState[index].sourceBlendFactorAlpha;
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	default:
+		ASSERT(false);
 	}
 
-	bool Context::colorUsed() const
+	return blendState[index].sourceBlendFactorAlpha;
+}
+
+VkBlendFactor Context::destBlendFactorAlpha(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	switch (blendState[index].blendOperationAlpha)
 	{
-		return colorWriteActive() || (pixelShader && pixelShader->getModes().ContainsKill);
+	case VK_BLEND_OP_ADD:
+	case VK_BLEND_OP_SUBTRACT:
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		return blendState[index].destBlendFactorAlpha;
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_FACTOR_ONE;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_FACTOR_ONE;
+	default:
+		ASSERT(false);
+	}
+
+	return blendState[index].destBlendFactorAlpha;
+}
+
+VkBlendOp Context::blendOperationAlpha(int index) const   // Alpha blend op for target `index`, with ZERO/ONE factor cases reduced to simpler ops.
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	switch (blendState[index].blendOperationAlpha)   // Unlike blendOperation(), alphaBlendEnable is not checked here.
+	{
+	case VK_BLEND_OP_ADD:
+		if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Both factors are zero
+			}
+			else
+			{
+				return VK_BLEND_OP_DST_EXT;   // Source factor is zero
+			}
+		}
+		else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;   // Destination factor is zero
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
+			}
+		}
+		else
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;   // Destination factor is zero
+			}
+			else
+			{
+				return VK_BLEND_OP_ADD;
+			}
+		}
+	case VK_BLEND_OP_SUBTRACT:
+		if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+		{
+			return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+		}
+		else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+		else
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_SRC_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_REVERSE_SUBTRACT:
+		if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO)
+			{
+				return VK_BLEND_OP_ZERO_EXT;
+			}
+			else
+			{
+				return VK_BLEND_OP_DST_EXT;
+			}
+		}
+		else if (sourceBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE)
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+		else
+		{
+			if (destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ZERO && allTargetsColorClamp())
+			{
+				return VK_BLEND_OP_ZERO_EXT;   // Negative, clamped to zero
+			}
+			else
+			{
+				return VK_BLEND_OP_REVERSE_SUBTRACT;
+			}
+		}
+	case VK_BLEND_OP_MIN:
+		return VK_BLEND_OP_MIN;
+	case VK_BLEND_OP_MAX:
+		return VK_BLEND_OP_MAX;
+	default:
+		ASSERT(false);
+	}
+
+	return blendState[index].blendOperationAlpha;   // Reached only through the default case; every other case returns.
+}
+
+VkFormat Context::renderTargetInternalFormat(int index) const   // Format of render target `index`, or VK_FORMAT_UNDEFINED when none is set.
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	if(renderTarget[index])
+	{
+		return renderTarget[index]->getFormat();
+	}
+	else
+	{
+		return VK_FORMAT_UNDEFINED;
 	}
 }
+
+// True when at least one render target reports a non-zero color write mask.
+bool Context::colorWriteActive() const
+{
+	int target = 0;
+	while(target < RENDERTARGETS && colorWriteActive(target) == 0)
+	{
+		++target;
+	}
+
+	// Stopping before the end means some target returned a non-zero mask.
+	return target < RENDERTARGETS;
+}
+
+// Color write mask in effect for render target `index`; 0 when nothing would be written.
+// The mask is forced to 0 for a missing/undefined target or a destination-preserving blend.
+int Context::colorWriteActive(int index) const
+{
+	ASSERT((index >= 0) && (index < RENDERTARGETS));
+
+	// A null target, or one whose format is VK_FORMAT_UNDEFINED, has no writable color.
+	const bool hasTarget = renderTarget[index] && !(renderTarget[index]->getFormat() == VK_FORMAT_UNDEFINED);
+	if(!hasTarget) return 0;
+
+	// DST operation with a destination factor of ONE, for both color and alpha.
+	const bool keepsColor = blendOperation(index) == VK_BLEND_OP_DST_EXT && destBlendFactor(index) == VK_BLEND_FACTOR_ONE;
+	const bool keepsBoth = keepsColor && blendOperationAlpha(index) == VK_BLEND_OP_DST_EXT && destBlendFactorAlpha(index) == VK_BLEND_FACTOR_ONE;
+	if(keepsBoth) return 0;
+
+	return colorWriteMask[index];
+}
+
+bool Context::colorUsed() const   // True if any target has color writes active, or a pixel shader is set and its modes report ContainsKill.
+{
+	return colorWriteActive() || (pixelShader && pixelShader->getModes().ContainsKill);
+}
+
+}  // namespace sw

diff --git a/src/Device/Context.hpp b/src/Device/Context.hpp
index bb21eb7..20bc089 100644
--- a/src/Device/Context.hpp
+++ b/src/Device/Context.hpp

@@ -22,137 +22,139 @@
 #include "Stream.hpp"
 #include "System/Types.hpp"
 
-namespace vk
+namespace vk {
+
+class ImageView;
+class PipelineLayout;
+
+}  // namespace vk
+
+namespace sw {
+
+class SpirvShader;
+
+struct PushConstantStorage   // Plain byte buffer for push constant data.
 {
-	class ImageView;
-	class PipelineLayout;
-}
+	unsigned char data[vk::MAX_PUSH_CONSTANT_SIZE];   // Capacity fixed at vk::MAX_PUSH_CONSTANT_SIZE bytes
+};
 
-namespace sw
+struct BlendState : Memset<BlendState>   // Blend enable flag plus separate color and alpha factors/ops; Context stores one per render target.
 {
-	class SpirvShader;
+	BlendState() : Memset(this, 0) {}   // Hands (this, 0) to the Memset base - presumably zero-fills the object; confirm in Memset<>
 
-	struct PushConstantStorage
-	{
-		unsigned char data[vk::MAX_PUSH_CONSTANT_SIZE];
-	};
+	BlendState(bool alphaBlendEnable,
+	           VkBlendFactor sourceBlendFactor,
+	           VkBlendFactor destBlendFactor,
+	           VkBlendOp blendOperation,
+	           VkBlendFactor sourceBlendFactorAlpha,
+	           VkBlendFactor destBlendFactorAlpha,
+	           VkBlendOp blendOperationAlpha) :
+		Memset(this, 0),
+		alphaBlendEnable(alphaBlendEnable),
+		sourceBlendFactor(sourceBlendFactor),
+		destBlendFactor(destBlendFactor),
+		blendOperation(blendOperation),
+		sourceBlendFactorAlpha(sourceBlendFactorAlpha),
+		destBlendFactorAlpha(destBlendFactorAlpha),
+		blendOperationAlpha(blendOperationAlpha)
+	{}
 
-	struct BlendState : Memset<BlendState>
-	{
-		BlendState() : Memset(this, 0) {}
+	bool alphaBlendEnable;
+	VkBlendFactor sourceBlendFactor;   // Color channel factors and op
+	VkBlendFactor destBlendFactor;
+	VkBlendOp blendOperation;
+	VkBlendFactor sourceBlendFactorAlpha;   // Alpha channel factors and op
+	VkBlendFactor destBlendFactorAlpha;
+	VkBlendOp blendOperationAlpha;
+};
 
-		BlendState(bool alphaBlendEnable,
-		           VkBlendFactor sourceBlendFactor,
-		           VkBlendFactor destBlendFactor,
-		           VkBlendOp blendOperation,
-		           VkBlendFactor sourceBlendFactorAlpha,
-		           VkBlendFactor destBlendFactorAlpha,
-		           VkBlendOp blendOperationAlpha) :
-			Memset(this, 0),
-			alphaBlendEnable(alphaBlendEnable),
-			sourceBlendFactor(sourceBlendFactor),
-			destBlendFactor(destBlendFactor),
-			blendOperation(blendOperation),
-			sourceBlendFactorAlpha(sourceBlendFactorAlpha),
-			destBlendFactorAlpha(destBlendFactorAlpha),
-			blendOperationAlpha(blendOperationAlpha)
-		{}
+class Context   // Draw state bundle: topology, rasterizer/depth/stencil settings, bound attachments, shaders and per-target blend state.
+{
+public:
+	Context();
 
-		bool alphaBlendEnable;
-		VkBlendFactor sourceBlendFactor;
-		VkBlendFactor destBlendFactor;
-		VkBlendOp blendOperation;
-		VkBlendFactor sourceBlendFactorAlpha;
-		VkBlendFactor destBlendFactorAlpha;
-		VkBlendOp blendOperationAlpha;
-	};
+	void init();
 
-	class Context
-	{
-	public:
-		Context();
+	bool isDrawPoint(bool polygonModeAware) const;
+	bool isDrawLine(bool polygonModeAware) const;
+	bool isDrawTriangle(bool polygonModeAware) const;
 
-		void init();
+	bool depthWriteActive() const;
+	bool depthBufferActive() const;
+	bool stencilActive() const;
 
-		bool isDrawPoint(bool polygonModeAware) const;
-		bool isDrawLine(bool polygonModeAware) const;
-		bool isDrawTriangle(bool polygonModeAware) const;
+	bool allTargetsColorClamp() const;
 
-		bool depthWriteActive() const;
-		bool depthBufferActive() const;
-		bool stencilActive() const;
+	void setBlendState(int index, BlendState state);
+	BlendState getBlendState(int index) const;
 
-		bool allTargetsColorClamp() const;
+	VkPrimitiveTopology topology;
+	VkProvokingVertexModeEXT provokingVertexMode;
 
-		void setBlendState(int index, BlendState state);
-		BlendState getBlendState(int index) const;
+	bool stencilEnable;
+	VkStencilOpState frontStencil;
+	VkStencilOpState backStencil;
 
-		VkPrimitiveTopology topology;
-		VkProvokingVertexModeEXT provokingVertexMode;
+	// Pixel processor states
+	VkCullModeFlags cullMode;
+	VkFrontFace frontFace;
+	VkPolygonMode polygonMode;
+	VkLineRasterizationModeEXT lineRasterizationMode;
 
-		bool stencilEnable;
-		VkStencilOpState frontStencil;
-		VkStencilOpState backStencil;
+	float depthBias;
+	float slopeDepthBias;
 
-		// Pixel processor states
-		VkCullModeFlags cullMode;
-		VkFrontFace frontFace;
-		VkPolygonMode polygonMode;
-		VkLineRasterizationModeEXT lineRasterizationMode;
+	VkFormat renderTargetInternalFormat(int index) const;   // VK_FORMAT_UNDEFINED when renderTarget[index] is null
+	int colorWriteActive(int index) const;   // Effective color write mask for one target; 0 means nothing is written
 
-		float depthBias;
-		float slopeDepthBias;
+	vk::DescriptorSet::Bindings descriptorSets = {};
+	vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
+	Stream input[MAX_INTERFACE_COMPONENTS / 4];
+	bool robustBufferAccess;
 
-		VkFormat renderTargetInternalFormat(int index) const;
-		int colorWriteActive(int index) const;
+	vk::ImageView *renderTarget[RENDERTARGETS];   // Entries may be null; the accessors above check before dereferencing
+	vk::ImageView *depthBuffer;
+	vk::ImageView *stencilBuffer;
 
-		vk::DescriptorSet::Bindings descriptorSets = {};
-		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
-		Stream input[MAX_INTERFACE_COMPONENTS / 4];
-		bool robustBufferAccess;
+	vk::PipelineLayout const *pipelineLayout;
 
-		vk::ImageView *renderTarget[RENDERTARGETS];
-		vk::ImageView *depthBuffer;
-		vk::ImageView *stencilBuffer;
+	// Shaders
+	const SpirvShader *pixelShader;   // May be null; colorUsed() tests it before use
+	const SpirvShader *vertexShader;
 
-		vk::PipelineLayout const *pipelineLayout;
+	bool occlusionEnabled;
 
-		// Shaders
-		const SpirvShader *pixelShader;
-		const SpirvShader *vertexShader;
+	// Pixel processor states
+	bool rasterizerDiscard;
+	bool depthBoundsTestEnable;
+	bool depthBufferEnable;
+	VkCompareOp depthCompareMode;
+	bool depthWriteEnable;
 
-		bool occlusionEnabled;
+	float lineWidth;
 
-		// Pixel processor states
-		bool rasterizerDiscard;
-		bool depthBoundsTestEnable;
-		bool depthBufferEnable;
-		VkCompareOp depthCompareMode;
-		bool depthWriteEnable;
+	int colorWriteMask[RENDERTARGETS];   // RGBA
+	unsigned int sampleMask;
+	unsigned int multiSampleMask;
+	int sampleCount;
+	bool alphaToCoverage;
 
-		float lineWidth;
+private:
+	bool colorWriteActive() const;   // True if colorWriteActive(index) is non-zero for any target
+	bool colorUsed() const;   // colorWriteActive(), or the bound pixel shader reports ContainsKill
 
-		int colorWriteMask[RENDERTARGETS];   // RGBA
-		unsigned int sampleMask;
-		unsigned int multiSampleMask;
-		int sampleCount;
-		bool alphaToCoverage;
+	bool alphaBlendActive(int index) const;
+	VkBlendFactor sourceBlendFactor(int index) const;
+	VkBlendFactor destBlendFactor(int index) const;
+	VkBlendOp blendOperation(int index) const;
 
-	private:
-		bool colorWriteActive() const;
-		bool colorUsed() const;
+	VkBlendFactor sourceBlendFactorAlpha(int index) const;
+	VkBlendFactor destBlendFactorAlpha(int index) const;
+	VkBlendOp blendOperationAlpha(int index) const;
 
-		bool alphaBlendActive(int index) const;
-		VkBlendFactor sourceBlendFactor(int index) const;
-		VkBlendFactor destBlendFactor(int index) const;
-		VkBlendOp blendOperation(int index) const;
+	BlendState blendState[RENDERTARGETS];   // One BlendState per render target
+};
 
-		VkBlendFactor sourceBlendFactorAlpha(int index) const;
-		VkBlendFactor destBlendFactorAlpha(int index) const;
-		VkBlendOp blendOperationAlpha(int index) const;
-
-		BlendState blendState[RENDERTARGETS];
-	};
-}
+}  // namespace sw
 
 #endif   // sw_Context_hpp

diff --git a/src/Device/LRUCache.hpp b/src/Device/LRUCache.hpp
index a4478d1..f549769 100644
--- a/src/Device/LRUCache.hpp
+++ b/src/Device/LRUCache.hpp

@@ -20,180 +20,181 @@
 #include <type_traits>
 #include <unordered_map>
 
-namespace sw
+namespace sw {
+
+template<class Key, class Data>
+class LRUCache   // Fixed-capacity Key -> Data cache kept in parallel arrays that are indexed modulo 'size'.
 {
-	template<class Key, class Data>
-	class LRUCache
+public:
+	LRUCache(int n);   // Capacity is ceilPow2(n)
+
+	virtual ~LRUCache();   // NOTE(review): key/ref/data are owned new[] arrays and no copy/move operations are declared, so copying an instance would double-delete
+
+	Data query(const Key &key) const;   // Stored Data on a hit, value-initialized Data ({}) on a miss
+	virtual Data add(const Key &key, const Data &data);   // Writes into the next slot (wrapping) and returns data
+
+	int getSize() {return size;}
+	Key &getKey(int i) {return key[i];}   // No bounds check on i
+
+protected:
+	int size;   // Number of slots
+	int mask;   // size - 1, used to wrap slot indices
+	int top;    // Slot written by the most recent add()
+	int fill;   // Number of slots in use, capped at size
+
+	Key *key;
+	Key **ref;   // ref[i] points into key[]; query() swaps these pointers rather than the keys themselves
+	Data *data;
+};
+
+template<class Key, class Data, class Hasher = std::hash<Key>>
+class LRUConstCache : public LRUCache<Key, Data>   // LRUCache plus an unordered_map snapshot (constCache) rebuilt on demand by updateConstCache().
+{
+	using LRUBase = LRUCache<Key, Data>;
+public:
+	LRUConstCache(int n) : LRUBase(n) {}
+	~LRUConstCache() { clearConstCache(); }
+
+	Data add(const Key &key, const Data& data) override
 	{
-	public:
-		LRUCache(int n);
+		constCacheNeedsUpdate = true;   // Snapshot is stale until the next updateConstCache()
+		return LRUBase::add(key, data);
+	}
 
-		virtual ~LRUCache();
+	void updateConstCache();   // Rebuilds constCache from the base arrays if add() was called since the last rebuild
+	const Data& queryConstCache(const Key &key) const;   // Looks up the snapshot only; the base arrays are not touched
 
-		Data query(const Key &key) const;
-		virtual Data add(const Key &key, const Data &data);
+private:
+	void clearConstCache();
+	bool constCacheNeedsUpdate = false;
+	std::unordered_map<Key, Data, Hasher> constCache;
+};
 
-		int getSize() {return size;}
-		Key &getKey(int i) {return key[i];}
-
-	protected:
-		int size;
-		int mask;
-		int top;
-		int fill;
-
-		Key *key;
-		Key **ref;
-		Data *data;
-	};
-
-	template<class Key, class Data, class Hasher = std::hash<Key>>
-	class LRUConstCache : public LRUCache<Key, Data>
-	{
-		using LRUBase = LRUCache<Key, Data>;
-	public:
-		LRUConstCache(int n) : LRUBase(n) {}
-		~LRUConstCache() { clearConstCache(); }
-
-		Data add(const Key &key, const Data& data) override
-		{
-			constCacheNeedsUpdate = true;
-			return LRUBase::add(key, data);
-		}
-
-		void updateConstCache();
-		const Data& queryConstCache(const Key &key) const;
-
-	private:
-		void clearConstCache();
-		bool constCacheNeedsUpdate = false;
-		std::unordered_map<Key, Data, Hasher> constCache;
-	};
-
-	// Traits-like helper class for checking if objects can be compared using memcmp().
-	// Useful for statically asserting if a cache key can implement operator==() with memcmp().
-	template<typename T>
-	struct is_memcmparable
-	{
-		// std::is_trivially_copyable is not available in older GCC versions.
-		#if !defined(__GNUC__) || __GNUC__ > 5
-			static const bool value = std::is_trivially_copyable<T>::value;
-		#else
-			// At least check it doesn't have virtual methods.
-			static const bool value = !std::is_polymorphic<T>::value;
-		#endif
-	};
+// Traits-like helper class for checking if objects can be compared using memcmp().
+// Useful for statically asserting if a cache key can implement operator==() with memcmp().
+template<typename T>
+struct is_memcmparable   // Result is exposed as is_memcmparable<T>::value
+{
+	// std::is_trivially_copyable is not available in older GCC versions.
+	#if !defined(__GNUC__) || __GNUC__ > 5   // Any compiler that does not define __GNUC__, or __GNUC__ of 6 and up
+		static const bool value = std::is_trivially_copyable<T>::value;
+	#else
+		// At least check it doesn't have virtual methods.
+		static const bool value = !std::is_polymorphic<T>::value;
+	#endif
+};
-}
+}  // namespace sw
 
-namespace sw
+namespace sw {
+
+template<class Key, class Data>
+LRUCache<Key, Data>::LRUCache(int n)   // Allocates the three parallel arrays and points every ref[i] at key[i].
 {
-	template<class Key, class Data>
-	LRUCache<Key, Data>::LRUCache(int n)
+	size = ceilPow2(n);   // presumably rounds n up to a power of two so that 'mask' works as a wrap-around bit mask - confirm in ceilPow2()
+	mask = size - 1;
+	top = 0;
+	fill = 0;   // No entries stored yet
+
+	key = new Key[size];
+	ref = new Key*[size];
+	data = new Data[size];
+
+	for(int i = 0; i < size; i++)
 	{
-		size = ceilPow2(n);
-		mask = size - 1;
-		top = 0;
-		fill = 0;
+		ref[i] = &key[i];   // Start with the identity mapping from slot to key storage
+	}
+}
 
-		key = new Key[size];
-		ref = new Key*[size];
-		data = new Data[size];
+template<class Key, class Data>
+LRUCache<Key, Data>::~LRUCache()   // Releases the arrays allocated by the constructor and nulls the pointers.
+{
+	delete[] key;
+	key = nullptr;
 
-		for(int i = 0; i < size; i++)
+	delete[] ref;   // Frees only the pointer array; the keys it pointed at were freed with key[] above
+	ref = nullptr;
+
+	delete[] data;
+	data = nullptr;
+}
+
+template<class Key, class Data>
+Data LRUCache<Key, Data>::query(const Key &key) const   // Scans from slot 'top' backwards over 'fill' entries; a hit below top is swapped one slot toward top.
+{
+	for(int i = top; i > top - fill; i--)
+	{
+		int j = i & mask;   // Wrap i (which can go negative) into [0, size)
+
+		if(key == *ref[j])
 		{
-			ref[i] = &key[i];
+			Data hit = data[j];   // Copy out before the swap below moves the entry
+
+			if(i != top)
+			{
+				// Move one up
+				int k = (j + 1) & mask;
+
+				Data swapD = data[k];
+				data[k] = data[j];
+				data[j] = swapD;
+
+				Key *swapK = ref[k];   // Keys are exchanged by swapping pointers; key[] itself is not moved
+				ref[k] = ref[j];
+				ref[j] = swapK;
+			}
+
+			return hit;
 		}
 	}
 
-	template<class Key, class Data>
-	LRUCache<Key, Data>::~LRUCache()
+	return {};   // Not found
+}
+
+template<class Key, class Data>
+Data LRUCache<Key, Data>::add(const Key &key, const Data &data)   // Stores the pair in the slot after 'top' (wrapping), overwriting whatever was there, and returns data.
+{
+	top = (top + 1) & mask;
+	fill = fill + 1 < size ? fill + 1 : size;   // Count entries, saturating at size
+
+	*ref[top] = key;   // Written through ref[], so it lands in whichever key[] element this slot currently points at
+	this->data[top] = data;   // this-> needed because the parameter shadows the member
+
+	return data;
+}
+
+template<class Key, class Data, class Hasher>
+void LRUConstCache<Key, Data, Hasher>::clearConstCache()   // Empties the snapshot map; the base class arrays are left alone.
+{
+	constCache.clear();
+}
+
+template<class Key, class Data, class Hasher>
+void LRUConstCache<Key, Data, Hasher>::updateConstCache()   // Rebuilds constCache from the base arrays when add() has been called since the last rebuild.
+{
+	if(constCacheNeedsUpdate)
 	{
-		delete[] key;
-		key = nullptr;
+		clearConstCache();
 
-		delete[] ref;
-		ref = nullptr;
-
-		delete[] data;
-		data = nullptr;
-	}
-
-	template<class Key, class Data>
-	Data LRUCache<Key, Data>::query(const Key &key) const
-	{
-		for(int i = top; i > top - fill; i--)
+		for(int i = 0; i < LRUBase::size; i++)
 		{
-			int j = i & mask;
-
-			if(key == *ref[j])
+			if(LRUBase::data[i])   // Only slots whose Data tests true are copied, so Data must be usable in a boolean context
 			{
-				Data hit = data[j];
-
-				if(i != top)
-				{
-					// Move one up
-					int k = (j + 1) & mask;
-
-					Data swapD = data[k];
-					data[k] = data[j];
-					data[j] = swapD;
-
-					Key *swapK = ref[k];
-					ref[k] = ref[j];
-					ref[j] = swapK;
-				}
-
-				return hit;
+				constCache[*LRUBase::ref[i]] = LRUBase::data[i];
 			}
 		}
 
-		return {};   // Not found
-	}
-
-	template<class Key, class Data>
-	Data LRUCache<Key, Data>::add(const Key &key, const Data &data)
-	{
-		top = (top + 1) & mask;
-		fill = fill + 1 < size ? fill + 1 : size;
-
-		*ref[top] = key;
-		this->data[top] = data;
-
-		return data;
-	}
-
-	template<class Key, class Data, class Hasher>
-	void LRUConstCache<Key, Data, Hasher>::clearConstCache()
-	{
-		constCache.clear();
-	}
-
-	template<class Key, class Data, class Hasher>
-	void LRUConstCache<Key, Data, Hasher>::updateConstCache()
-	{
-		if(constCacheNeedsUpdate)
-		{
-			clearConstCache();
-
-			for(int i = 0; i < LRUBase::size; i++)
-			{
-				if(LRUBase::data[i])
-				{
-					constCache[*LRUBase::ref[i]] = LRUBase::data[i];
-				}
-			}
-
-			constCacheNeedsUpdate = false;
-		}
-	}
-
-	template<class Key, class Data, class Hasher>
-	const Data& LRUConstCache<Key, Data, Hasher>::queryConstCache(const Key &key) const
-	{
-		auto it = constCache.find(key);
-		static Data null = {};
-		return (it != constCache.end()) ? it->second : null;
+		constCacheNeedsUpdate = false;
 	}
 }
 
+template<class Key, class Data, class Hasher>
+const Data& LRUConstCache<Key, Data, Hasher>::queryConstCache(const Key &key) const   // Reference to the snapshot entry for key, or to a shared value-initialized Data on a miss.
+{
+	auto it = constCache.find(key);
+	static Data null = {};   // Function-local static so that a miss can still return a reference
+	return (it != constCache.end()) ? it->second : null;
+}
+
+}  // namespace sw
+
 #endif   // sw_LRUCache_hpp

diff --git a/src/Device/Matrix.cpp b/src/Device/Matrix.cpp
index f449841..006ca1b 100644
--- a/src/Device/Matrix.cpp
+++ b/src/Device/Matrix.cpp

@@ -17,386 +17,387 @@
 #include "Point.hpp"
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+Matrix Matrix::diag(float m11, float m22, float m33, float m44)
 {
-	Matrix Matrix::diag(float m11, float m22, float m33, float m44)
-	{
-		return Matrix(m11, 0,   0,   0,
-		              0,   m22, 0,   0,
-		              0,   0,   m33, 0,
-		              0,   0,   0,   m44);
-	}
-
-	Matrix::operator float*()
-	{
-		return &(*this)(1, 1);
-	}
-
-	Matrix Matrix::operator+() const
-	{
-		return *this;
-	}
-
-	Matrix Matrix::operator-() const
-	{
-		const Matrix &M = *this;
-
-		return Matrix(-M(1, 1), -M(1, 2), -M(1, 3), -M(1, 4), 
-		              -M(2, 1), -M(2, 2), -M(2, 3), -M(2, 4), 
-		              -M(3, 1), -M(3, 2), -M(3, 3), -M(3, 4), 
-		              -M(4, 1), -M(4, 2), -M(4, 3), -M(4, 4));
-	}
-
-	Matrix Matrix::operator!() const
-	{
-		const Matrix &M = *this;
-		Matrix I;
-
-		float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
-		float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
-		float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
-		float M3244 = M(3, 2) * M(4, 4) - M(4, 2) * M(3, 4);
-		float M2244 = M(2, 2) * M(4, 4) - M(4, 2) * M(2, 4);
-		float M2234 = M(2, 2) * M(3, 4) - M(3, 2) * M(2, 4);
-		float M3243 = M(3, 2) * M(4, 3) - M(4, 2) * M(3, 3);
-		float M2243 = M(2, 2) * M(4, 3) - M(4, 2) * M(2, 3);
-		float M2233 = M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3);
-		float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
-		float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
-		float M1244 = M(1, 2) * M(4, 4) - M(4, 2) * M(1, 4);
-		float M1234 = M(1, 2) * M(3, 4) - M(3, 2) * M(1, 4);
-		float M1243 = M(1, 2) * M(4, 3) - M(4, 2) * M(1, 3);
-		float M1233 = M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3);
-		float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
-		float M1224 = M(1, 2) * M(2, 4) - M(2, 2) * M(1, 4);
-		float M1223 = M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3);
-
-		// Adjoint Matrix
-		I(1, 1) =  M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334;
-		I(2, 1) = -M(2, 1) * M3344 + M(3, 1) * M2344 - M(4, 1) * M2334;
-		I(3, 1) =  M(2, 1) * M3244 - M(3, 1) * M2244 + M(4, 1) * M2234;
-		I(4, 1) = -M(2, 1) * M3243 + M(3, 1) * M2243 - M(4, 1) * M2233;
-
-		I(1, 2) = -M(1, 2) * M3344 + M(3, 2) * M1344 - M(4, 2) * M1334;
-		I(2, 2) =  M(1, 1) * M3344 - M(3, 1) * M1344 + M(4, 1) * M1334;
-		I(3, 2) = -M(1, 1) * M3244 + M(3, 1) * M1244 - M(4, 1) * M1234;
-		I(4, 2) =  M(1, 1) * M3243 - M(3, 1) * M1243 + M(4, 1) * M1233;
-
-		I(1, 3) =  M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324;
-		I(2, 3) = -M(1, 1) * M2344 + M(2, 1) * M1344 - M(4, 1) * M1324;
-		I(3, 3) =  M(1, 1) * M2244 - M(2, 1) * M1244 + M(4, 1) * M1224;
-		I(4, 3) = -M(1, 1) * M2243 + M(2, 1) * M1243 - M(4, 1) * M1223;
-
-		I(1, 4) = -M(1, 2) * M2334 + M(2, 2) * M1334 - M(3, 2) * M1324;
-		I(2, 4) =  M(1, 1) * M2334 - M(2, 1) * M1334 + M(3, 1) * M1324;
-		I(3, 4) = -M(1, 1) * M2234 + M(2, 1) * M1234 - M(3, 1) * M1224;
-		I(4, 4) =  M(1, 1) * M2233 - M(2, 1) * M1233 + M(3, 1) * M1223;
-
-		// Division by determinant
-		I /= M(1, 1) * I(1, 1) +
-		     M(2, 1) * I(1, 2) +
-		     M(3, 1) * I(1, 3) +
-		     M(4, 1) * I(1, 4);
-
-		return I;
-	}
-
-	Matrix Matrix::operator~() const
-	{
-		const Matrix &M = *this;
-
-		return Matrix(M(1, 1), M(2, 1), M(3, 1), M(4, 1), 
-		              M(1, 2), M(2, 2), M(3, 2), M(4, 2), 
-		              M(1, 3), M(2, 3), M(3, 3), M(4, 3), 
-		              M(1, 4), M(2, 4), M(3, 4), M(4, 4));
-	}
-
-	Matrix &Matrix::operator+=(const Matrix &N)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) += N(1, 1); M(1, 2) += N(1, 2); M(1, 3) += N(1, 3); M(1, 4) += N(1, 4);
-		M(2, 1) += N(2, 1); M(2, 2) += N(2, 2); M(2, 3) += N(2, 3); M(2, 4) += N(2, 4);
-		M(3, 1) += N(3, 1); M(3, 2) += N(3, 2); M(3, 3) += N(3, 3); M(3, 4) += N(3, 4);
-		M(4, 1) += N(4, 1); M(4, 2) += N(4, 2); M(4, 3) += N(4, 3); M(4, 4) += N(4, 4);
-
-		return M;
-	}
-
-	Matrix &Matrix::operator-=(const Matrix &N)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) -= N(1, 1); M(1, 2) -= N(1, 2); M(1, 3) -= N(1, 3); M(1, 4) -= N(1, 4);
-		M(2, 1) -= N(2, 1); M(2, 2) -= N(2, 2); M(2, 3) -= N(2, 3); M(2, 4) -= N(2, 4);
-		M(3, 1) -= N(3, 1); M(3, 2) -= N(3, 2); M(3, 3) -= N(3, 3); M(3, 4) -= N(3, 4);
-		M(4, 1) -= N(4, 1); M(4, 2) -= N(4, 2); M(4, 3) -= N(4, 3); M(4, 4) -= N(4, 4);
-
-		return M;
-	}
-
-	Matrix &Matrix::operator*=(float s)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) *= s; M(1, 2) *= s; M(1, 3) *= s; M(1, 4) *= s;
-		M(2, 1) *= s; M(2, 2) *= s; M(2, 3) *= s; M(2, 4) *= s;
-		M(3, 1) *= s; M(3, 2) *= s; M(3, 3) *= s; M(3, 4) *= s;
-		M(4, 1) *= s; M(4, 2) *= s; M(4, 3) *= s; M(4, 4) *= s;
-
-		return M;
-	}
-
-	Matrix &Matrix::operator*=(const Matrix &M)
-	{
-		return *this = *this * M;
-	}
-
-	Matrix &Matrix::operator/=(float s)
-	{
-		float r = 1.0f / s;
-
-		return *this *= r;
-	}
-
-	bool operator==(const Matrix &M, const Matrix &N)
-	{
-		if(M(1, 1) == N(1, 1) && M(1, 2) == N(1, 2) && M(1, 3) == N(1, 3) && M(1, 4) == N(1, 4) &&
-		   M(2, 1) == N(2, 1) && M(2, 2) == N(2, 2) && M(2, 3) == N(2, 3) && M(2, 4) == N(2, 4) &&
-		   M(3, 1) == N(3, 1) && M(3, 2) == N(3, 2) && M(3, 3) == N(3, 3) && M(3, 4) == N(3, 4) &&
-		   M(4, 1) == N(4, 1) && M(4, 2) == N(4, 2) && M(4, 3) == N(4, 3) && M(4, 4) == N(4, 4))
-			return true;
-		else
-			return false;
-	}
-
-	bool operator!=(const Matrix &M, const Matrix &N)
-	{
-		if(M(1, 1) != N(1, 1) || M(1, 2) != N(1, 2) || M(1, 3) != N(1, 3) || M(1, 4) != N(1, 4) ||
-		   M(2, 1) != N(2, 1) || M(2, 2) != N(2, 2) || M(2, 3) != N(2, 3) || M(2, 4) != N(2, 4) ||
-		   M(3, 1) != N(3, 1) || M(3, 2) != N(3, 2) || M(3, 3) != N(3, 3) || M(3, 4) != N(3, 4) ||
-		   M(4, 1) != N(4, 1) || M(4, 2) != N(4, 2) || M(4, 3) != N(4, 3) || M(4, 4) != N(4, 4))
-			return true;
-		else
-			return false;
-	}
-
-	Matrix operator+(const Matrix &M, const Matrix &N)
-	{
-		return Matrix(M(1, 1) + N(1, 1), M(1, 2) + N(1, 2), M(1, 3) + N(1, 3), M(1, 4) + N(1, 4), 
-		              M(2, 1) + N(2, 1), M(2, 2) + N(2, 2), M(2, 3) + N(2, 3), M(2, 4) + N(2, 4), 
-		              M(3, 1) + N(3, 1), M(3, 2) + N(3, 2), M(3, 3) + N(3, 3), M(3, 4) + N(3, 4), 
-		              M(4, 1) + N(4, 1), M(4, 2) + N(4, 2), M(4, 3) + N(4, 3), M(4, 4) + N(4, 4));
-	}
-
-	Matrix operator-(const Matrix &M, const Matrix &N)
-	{
-		return Matrix(M(1, 1) - N(1, 1), M(1, 2) - N(1, 2), M(1, 3) - N(1, 3), M(1, 4) - N(1, 4), 
-		              M(2, 1) - N(2, 1), M(2, 2) - N(2, 2), M(2, 3) - N(2, 3), M(2, 4) - N(2, 4), 
-		              M(3, 1) - N(3, 1), M(3, 2) - N(3, 2), M(3, 3) - N(3, 3), M(3, 4) - N(3, 4), 
-		              M(4, 1) - N(4, 1), M(4, 2) - N(4, 2), M(4, 3) - N(4, 3), M(4, 4) - N(4, 4));
-	}
-
-	Matrix operator*(float s, const Matrix &M)
-	{
-		return Matrix(s * M(1, 1), s * M(1, 2), s * M(1, 3), s * M(1, 4), 
-		              s * M(2, 1), s * M(2, 2), s * M(2, 3), s * M(2, 4), 
-		              s * M(3, 1), s * M(3, 2), s * M(3, 3), s * M(3, 4), 
-		              s * M(4, 1), s * M(4, 2), s * M(4, 3), s * M(4, 4));
-	}
-
-	Matrix operator*(const Matrix &M, float s)
-	{
-		return Matrix(M(1, 1) * s, M(1, 2) * s, M(1, 3) * s, M(1, 4) * s, 
-		              M(2, 1) * s, M(2, 2) * s, M(2, 3) * s, M(2, 4) * s, 
-		              M(3, 1) * s, M(3, 2) * s, M(3, 3) * s, M(3, 4) * s, 
-		              M(4, 1) * s, M(4, 2) * s, M(4, 3) * s, M(4, 4) * s);
-	}
-
-	Matrix operator*(const Matrix &M, const Matrix &N)
-	{
-		return Matrix(M(1, 1) * N(1, 1) + M(1, 2) * N(2, 1) + M(1, 3) * N(3, 1) + M(1, 4) * N(4, 1), M(1, 1) * N(1, 2) + M(1, 2) * N(2, 2) + M(1, 3) * N(3, 2) + M(1, 4) * N(4, 2), M(1, 1) * N(1, 3) + M(1, 2) * N(2, 3) + M(1, 3) * N(3, 3) + M(1, 4) * N(4, 3), M(1, 1) * N(1, 4) + M(1, 2) * N(2, 4) + M(1, 3) * N(3, 4) + M(1, 4) * N(4, 4), 
-		              M(2, 1) * N(1, 1) + M(2, 2) * N(2, 1) + M(2, 3) * N(3, 1) + M(2, 4) * N(4, 1), M(2, 1) * N(1, 2) + M(2, 2) * N(2, 2) + M(2, 3) * N(3, 2) + M(2, 4) * N(4, 2), M(2, 1) * N(1, 3) + M(2, 2) * N(2, 3) + M(2, 3) * N(3, 3) + M(2, 4) * N(4, 3), M(2, 1) * N(1, 4) + M(2, 2) * N(2, 4) + M(2, 3) * N(3, 4) + M(2, 4) * N(4, 4), 
-		              M(3, 1) * N(1, 1) + M(3, 2) * N(2, 1) + M(3, 3) * N(3, 1) + M(3, 4) * N(4, 1), M(3, 1) * N(1, 2) + M(3, 2) * N(2, 2) + M(3, 3) * N(3, 2) + M(3, 4) * N(4, 2), M(3, 1) * N(1, 3) + M(3, 2) * N(2, 3) + M(3, 3) * N(3, 3) + M(3, 4) * N(4, 3), M(3, 1) * N(1, 4) + M(3, 2) * N(2, 4) + M(3, 3) * N(3, 4) + M(3, 4) * N(4, 4), 
-		              M(4, 1) * N(1, 1) + M(4, 2) * N(2, 1) + M(4, 3) * N(3, 1) + M(4, 4) * N(4, 1), M(4, 1) * N(1, 2) + M(4, 2) * N(2, 2) + M(4, 3) * N(3, 2) + M(4, 4) * N(4, 2), M(4, 1) * N(1, 3) + M(4, 2) * N(2, 3) + M(4, 3) * N(3, 3) + M(4, 4) * N(4, 3), M(4, 1) * N(1, 4) + M(4, 2) * N(2, 4) + M(4, 3) * N(3, 4) + M(4, 4) * N(4, 4));
-	}
-
-	Matrix operator/(const Matrix &M, float s)
-	{
-		float r = 1.0f / s;
-
-		return M * r;
-	}
-
-	float4 Matrix::operator*(const float4 &v) const
-	{
-		const Matrix &M = *this;
-		float Mx = M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z + M(1, 4) * v.w;
-		float My = M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z + M(2, 4) * v.w;
-		float Mz = M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z + M(3, 4) * v.w;
-		float Mw = M(4, 1) * v.x + M(4, 2) * v.y + M(4, 3) * v.z + M(4, 4) * v.w;
-
-		return {Mx, My, Mz, Mw};
-	}
-
-	float Matrix::det(const Matrix &M)
-	{
-		float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
-		float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
-		float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
-		float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
-		float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
-		float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
-
-		return M(1, 1) * (M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334) -
-		       M(2, 1) * (M(1, 2) * M3344 - M(3, 2) * M1344 + M(4, 2) * M1334) +
-		       M(3, 1) * (M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324) -
-		       M(4, 1) * (M(1, 2) * M2334 - M(2, 2) * M1334 + M(3, 2) * M1324);
-	}
-
-	float Matrix::det(float m11)
-	{
-		return m11;
-	}
-
-	float Matrix::det(float m11, float m12, 
-	                  float m21, float m22)
-	{
-		return m11 * m22 - m12 * m21; 
-	}
-
-	float Matrix::det(float m11, float m12, float m13, 
-	                  float m21, float m22, float m23, 
-	                  float m31, float m32, float m33)
-	{
-		return m11 * (m22 * m33 - m32 * m23) -
-		       m21 * (m12 * m33 - m32 * m13) +
-		       m31 * (m12 * m23 - m22 * m13);
-	}
-
-	float Matrix::det(float m11, float m12, float m13, float m14, 
-	                  float m21, float m22, float m23, float m24, 
-	                  float m31, float m32, float m33, float m34, 
-	                  float m41, float m42, float m43, float m44)
-	{
-		float M3344 = m33 * m44 - m43 * m34;
-		float M2344 = m23 * m44 - m43 * m24;
-		float M2334 = m23 * m34 - m33 * m24;
-		float M1344 = m13 * m44 - m43 * m14;
-		float M1334 = m13 * m34 - m33 * m14;
-		float M1324 = m13 * m24 - m23 * m14;
-
-		return m11 * (m22 * M3344 - m32 * M2344 + m42 * M2334) -
-		       m21 * (m12 * M3344 - m32 * M1344 + m42 * M1334) +
-		       m31 * (m12 * M2344 - m22 * M1344 + m42 * M1324) -
-		       m41 * (m12 * M2334 - m22 * M1334 + m32 * M1324);
-	}
-
-	float Matrix::det(const Vector &v1, const Vector &v2, const Vector &v3)
-	{
-		return v1 * (v2 % v3);
-	}
-
-	float Matrix::det3(const Matrix &M)
-	{
-		return M(1, 1) * (M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3)) -
-		       M(2, 1) * (M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3)) +
-		       M(3, 1) * (M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3));
-	}
-
-	float Matrix::tr(const Matrix &M)
-	{
-		return M(1, 1) + M(2, 2) + M(3, 3) + M(4, 4);
-	}
-
-	Matrix &Matrix::orthogonalise()
-	{
-		// NOTE: Numnerically instable, won't return exact the same result when already orhtogonal
-
-		Matrix &M = *this;
-
-		Vector v1(M(1, 1), M(2, 1), M(3, 1));
-		Vector v2(M(1, 2), M(2, 2), M(3, 2));
-		Vector v3(M(1, 3), M(2, 3), M(3, 3));
-
-		v2 -= v1 * (v1 * v2) / (v1 * v1);
-		v3 -= v1 * (v1 * v3) / (v1 * v1);
-		v3 -= v2 * (v2 * v3) / (v2 * v2);
-
-		v1 /= Vector::N(v1);
-		v2 /= Vector::N(v2);
-		v3 /= Vector::N(v3);
-
-		M(1, 1) = v1.x;  M(1, 2) = v2.x;  M(1, 3) = v3.x;
-		M(2, 1) = v1.y;  M(2, 2) = v2.y;  M(2, 3) = v3.y;
-		M(3, 1) = v1.z;  M(3, 2) = v2.z;  M(3, 3) = v3.z;
-
-		return *this;
-	}
-
-	Matrix Matrix::eulerRotate(const Vector &v)
-	{
-		float cz = cos(v.z);
-		float sz = sin(v.z);
-		float cx = cos(v.x);
-		float sx = sin(v.x);
-		float cy = cos(v.y);
-		float sy = sin(v.y);
-
-		float sxsy = sx * sy;
-		float sxcy = sx * cy;
-
-		return Matrix(cy * cz - sxsy * sz, -cy * sz - sxsy * cz, -sy * cx,
-		              cx * sz,              cx * cz,             -sx,
-		              sy * cz + sxcy * sz, -sy * sz + sxcy * cz,  cy * cx);
-	}
-
-	Matrix Matrix::eulerRotate(float x, float y, float z)
-	{
-		return eulerRotate(Vector(x, y, z));
-	}
-
-	Matrix Matrix::translate(const Vector &v)
-	{
-		return Matrix(1, 0, 0, v.x,
-		              0, 1, 0, v.y,
-		              0, 0, 1, v.z,
-		              0, 0, 0, 1);
-	}
-
-	Matrix Matrix::translate(float x, float y, float z)
-	{
-		return translate(Vector(x, y, z));
-	}
-
-	Matrix Matrix::scale(const Vector &v)
-	{
-		return Matrix(v.x, 0,   0,
-		              0,   v.y, 0,
-		              0,   0,   v.z);
-	}
-
-	Matrix Matrix::scale(float x, float y, float z)
-	{
-		return scale(Vector(x, y, z));
-	}
-
-	Matrix Matrix::lookAt(const Vector &v)
-	{
-		Vector y = v;
-		y /= Vector::N(y);
-
-		Vector x = y % Vector(0, 0, 1);
-		x /= Vector::N(x);
-
-		Vector z = x % y;
-		z /= Vector::N(z);
-
-		return ~Matrix(x, y, z);
-	}
-
-	Matrix Matrix::lookAt(float x, float y, float z)
-	{
-		return translate(Vector(x, y, z));
-	}
+	return Matrix(m11, 0,   0,   0,
+	              0,   m22, 0,   0,
+	              0,   0,   m33, 0,
+	              0,   0,   0,   m44);
 }
+
+Matrix::operator float*()   // Converts to a pointer to element (1, 1).
+{
+	return &(*this)(1, 1);
+}
+
+Matrix Matrix::operator+() const   // Unary plus: returns a copy of this matrix.
+{
+	return *this;
+}
+
+Matrix Matrix::operator-() const   // Unary minus: returns a matrix with every element negated.
+{
+	const Matrix &M = *this;
+
+	return Matrix(-M(1, 1), -M(1, 2), -M(1, 3), -M(1, 4), 
+	              -M(2, 1), -M(2, 2), -M(2, 3), -M(2, 4), 
+	              -M(3, 1), -M(3, 2), -M(3, 3), -M(3, 4), 
+	              -M(4, 1), -M(4, 2), -M(4, 3), -M(4, 4));
+}
+
+Matrix Matrix::operator!() const   // Inverse: builds the adjoint from 2x2 sub-determinants, then divides by the determinant (no guard for a zero determinant).
+{
+	const Matrix &M = *this;
+	Matrix I;
+
+	float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);   // Mabcd = M(a, b) * M(c, d) - M(c, b) * M(a, d)
+	float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
+	float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
+	float M3244 = M(3, 2) * M(4, 4) - M(4, 2) * M(3, 4);
+	float M2244 = M(2, 2) * M(4, 4) - M(4, 2) * M(2, 4);
+	float M2234 = M(2, 2) * M(3, 4) - M(3, 2) * M(2, 4);
+	float M3243 = M(3, 2) * M(4, 3) - M(4, 2) * M(3, 3);
+	float M2243 = M(2, 2) * M(4, 3) - M(4, 2) * M(2, 3);
+	float M2233 = M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3);
+	float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
+	float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
+	float M1244 = M(1, 2) * M(4, 4) - M(4, 2) * M(1, 4);
+	float M1234 = M(1, 2) * M(3, 4) - M(3, 2) * M(1, 4);
+	float M1243 = M(1, 2) * M(4, 3) - M(4, 2) * M(1, 3);
+	float M1233 = M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3);
+	float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
+	float M1224 = M(1, 2) * M(2, 4) - M(2, 2) * M(1, 4);
+	float M1223 = M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3);
+
+	// Adjoint Matrix
+	I(1, 1) =  M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334;
+	I(2, 1) = -M(2, 1) * M3344 + M(3, 1) * M2344 - M(4, 1) * M2334;
+	I(3, 1) =  M(2, 1) * M3244 - M(3, 1) * M2244 + M(4, 1) * M2234;
+	I(4, 1) = -M(2, 1) * M3243 + M(3, 1) * M2243 - M(4, 1) * M2233;
+
+	I(1, 2) = -M(1, 2) * M3344 + M(3, 2) * M1344 - M(4, 2) * M1334;
+	I(2, 2) =  M(1, 1) * M3344 - M(3, 1) * M1344 + M(4, 1) * M1334;
+	I(3, 2) = -M(1, 1) * M3244 + M(3, 1) * M1244 - M(4, 1) * M1234;
+	I(4, 2) =  M(1, 1) * M3243 - M(3, 1) * M1243 + M(4, 1) * M1233;
+
+	I(1, 3) =  M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324;
+	I(2, 3) = -M(1, 1) * M2344 + M(2, 1) * M1344 - M(4, 1) * M1324;
+	I(3, 3) =  M(1, 1) * M2244 - M(2, 1) * M1244 + M(4, 1) * M1224;
+	I(4, 3) = -M(1, 1) * M2243 + M(2, 1) * M1243 - M(4, 1) * M1223;
+
+	I(1, 4) = -M(1, 2) * M2334 + M(2, 2) * M1334 - M(3, 2) * M1324;
+	I(2, 4) =  M(1, 1) * M2334 - M(2, 1) * M1334 + M(3, 1) * M1324;
+	I(3, 4) = -M(1, 1) * M2234 + M(2, 1) * M1234 - M(3, 1) * M1224;
+	I(4, 4) =  M(1, 1) * M2233 - M(2, 1) * M1233 + M(3, 1) * M1223;
+
+	// Division by determinant
+	I /= M(1, 1) * I(1, 1) +
+	     M(2, 1) * I(1, 2) +
+	     M(3, 1) * I(1, 3) +
+	     M(4, 1) * I(1, 4);
+
+	return I;
+}
+
+Matrix Matrix::operator~() const   // Transpose: element (i, j) of the result is M(j, i).
+{
+	const Matrix &M = *this;
+
+	return Matrix(M(1, 1), M(2, 1), M(3, 1), M(4, 1), 
+	              M(1, 2), M(2, 2), M(3, 2), M(4, 2), 
+	              M(1, 3), M(2, 3), M(3, 3), M(4, 3), 
+	              M(1, 4), M(2, 4), M(3, 4), M(4, 4));
+}
+
+Matrix &Matrix::operator+=(const Matrix &N)   // Adds N element by element in place and returns *this.
+{
+	Matrix &M = *this;
+
+	M(1, 1) += N(1, 1); M(1, 2) += N(1, 2); M(1, 3) += N(1, 3); M(1, 4) += N(1, 4);
+	M(2, 1) += N(2, 1); M(2, 2) += N(2, 2); M(2, 3) += N(2, 3); M(2, 4) += N(2, 4);
+	M(3, 1) += N(3, 1); M(3, 2) += N(3, 2); M(3, 3) += N(3, 3); M(3, 4) += N(3, 4);
+	M(4, 1) += N(4, 1); M(4, 2) += N(4, 2); M(4, 3) += N(4, 3); M(4, 4) += N(4, 4);
+
+	return M;
+}
+
+Matrix &Matrix::operator-=(const Matrix &N)   // Subtracts N element by element in place and returns *this.
+{
+	Matrix &M = *this;
+
+	M(1, 1) -= N(1, 1); M(1, 2) -= N(1, 2); M(1, 3) -= N(1, 3); M(1, 4) -= N(1, 4);
+	M(2, 1) -= N(2, 1); M(2, 2) -= N(2, 2); M(2, 3) -= N(2, 3); M(2, 4) -= N(2, 4);
+	M(3, 1) -= N(3, 1); M(3, 2) -= N(3, 2); M(3, 3) -= N(3, 3); M(3, 4) -= N(3, 4);
+	M(4, 1) -= N(4, 1); M(4, 2) -= N(4, 2); M(4, 3) -= N(4, 3); M(4, 4) -= N(4, 4);
+
+	return M;
+}
+
+Matrix &Matrix::operator*=(float s)   // Scales every element by s in place and returns *this.
+{
+	Matrix &M = *this;
+
+	M(1, 1) *= s; M(1, 2) *= s; M(1, 3) *= s; M(1, 4) *= s;
+	M(2, 1) *= s; M(2, 2) *= s; M(2, 3) *= s; M(2, 4) *= s;
+	M(3, 1) *= s; M(3, 2) *= s; M(3, 3) *= s; M(3, 4) *= s;
+	M(4, 1) *= s; M(4, 2) *= s; M(4, 3) *= s; M(4, 4) *= s;
+
+	return M;
+}
+
+// Replaces this matrix with the product (*this) * M and returns this matrix.
+Matrix &Matrix::operator*=(const Matrix &M)
+{
+	// The product is fully evaluated into a temporary before it is assigned back.
+	const Matrix product = *this * M;
+	*this = product;
+
+	return *this;
+}
+
+// Divides every element by s, implemented as a multiplication by the reciprocal of s.
+Matrix &Matrix::operator/=(float s)
+{
+	const float reciprocal = 1.0f / s;
+	*this *= reciprocal;
+
+	return *this;
+}
+
+// Exact element-wise equality of all 16 elements (plain floating-point ==, no tolerance).
+bool operator==(const Matrix &M, const Matrix &N)
+{
+	for(int row = 1; row <= 4; row++)
+	{
+		for(int col = 1; col <= 4; col++)
+		{
+			if(!(M(row, col) == N(row, col)))
+			{
+				return false;
+			}
+		}
+	}
+
+	return true;
+}
+
+// True as soon as any of the 16 element pairs differs (plain floating-point !=, no tolerance).
+bool operator!=(const Matrix &M, const Matrix &N)
+{
+	for(int row = 1; row <= 4; row++)
+	{
+		for(int col = 1; col <= 4; col++)
+		{
+			if(M(row, col) != N(row, col))
+			{
+				return true;
+			}
+		}
+	}
+
+	return false;
+}
+
+// Element-wise sum of two matrices.
+Matrix operator+(const Matrix &M, const Matrix &N)
+{
+	Matrix sum;
+
+	for(int row = 1; row <= 4; row++)
+	{
+		for(int col = 1; col <= 4; col++)
+		{
+			sum(row, col) = M(row, col) + N(row, col);
+		}
+	}
+
+	return sum;
+}
+
+// Element-wise difference of two matrices.
+Matrix operator-(const Matrix &M, const Matrix &N)
+{
+	Matrix difference;
+
+	for(int row = 1; row <= 4; row++)
+	{
+		for(int col = 1; col <= 4; col++)
+		{
+			difference(row, col) = M(row, col) - N(row, col);
+		}
+	}
+
+	return difference;
+}
+
+// Scalar times matrix: every element of M is multiplied by s (scalar on the left).
+Matrix operator*(float s, const Matrix &M)
+{
+	Matrix scaled;
+
+	for(int row = 1; row <= 4; row++)
+	{
+		for(int col = 1; col <= 4; col++)
+		{
+			scaled(row, col) = s * M(row, col);
+		}
+	}
+
+	return scaled;
+}
+
+// Matrix times scalar: every element of M is multiplied by s (scalar on the right).
+Matrix operator*(const Matrix &M, float s)
+{
+	Matrix scaled;
+
+	for(int row = 1; row <= 4; row++)
+	{
+		for(int col = 1; col <= 4; col++)
+		{
+			scaled(row, col) = M(row, col) * s;
+		}
+	}
+
+	return scaled;
+}
+
+// Matrix product M * N: element (row, col) is the dot product of row `row` of M
+// with column `col` of N.
+Matrix operator*(const Matrix &M, const Matrix &N)
+{
+	Matrix product;
+
+	for(int row = 1; row <= 4; row++)
+	{
+		for(int col = 1; col <= 4; col++)
+		{
+			// Terms are summed left to right in index order 1..4.
+			product(row, col) = M(row, 1) * N(1, col) + M(row, 2) * N(2, col) + M(row, 3) * N(3, col) + M(row, 4) * N(4, col);
+		}
+	}
+
+	return product;
+}
+
+// Matrix divided by scalar, implemented as a multiplication by the reciprocal of s.
+Matrix operator/(const Matrix &M, float s)
+{
+	const float reciprocal = 1.0f / s;
+
+	return M * reciprocal;
+}
+
+// Transforms the 4-component vector v by this matrix (matrix on the left, v as a column).
+float4 Matrix::operator*(const float4 &v) const
+{
+	const Matrix &A = *this;
+
+	// Dot product of one matrix row with v.
+	auto rowDot = [&](int row) -> float
+	{
+		return A(row, 1) * v.x + A(row, 2) * v.y + A(row, 3) * v.z + A(row, 4) * v.w;
+	};
+
+	const float x = rowDot(1);
+	const float y = rowDot(2);
+	const float z = rowDot(3);
+	const float w = rowDot(4);
+
+	return {x, y, z, w};
+}
+
+// Determinant of the full 4x4 matrix M, computed by cofactor expansion along the first column.
+float Matrix::det(const Matrix &M)
+{
+	// 2x2 sub-determinants taken from columns 3 and 4, one per pair of rows.
+	// Each is named after the two factors of its leading product,
+	// e.g. M2344 starts with M(2, 3) * M(4, 4).
+	float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
+	float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
+	float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
+	float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
+	float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
+	float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
+
+	// Each parenthesised term is a 3x3 minor assembled from column 2 and the 2x2 values above;
+	// the signs alternate +, -, +, - down the first column.
+	return M(1, 1) * (M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334) -
+	       M(2, 1) * (M(1, 2) * M3344 - M(3, 2) * M1344 + M(4, 2) * M1334) +
+	       M(3, 1) * (M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324) -
+	       M(4, 1) * (M(1, 2) * M2334 - M(2, 2) * M1334 + M(3, 2) * M1324);
+}
+
+// Determinant of a 1x1 matrix: the single element itself.
+float Matrix::det(float m11)
+{
+	return m11;
+}
+
+// Determinant of the 2x2 matrix [m11 m12; m21 m22]:
+// main-diagonal product minus anti-diagonal product.
+float Matrix::det(float m11, float m12,
+                  float m21, float m22)
+{
+	return (m11 * m22) - (m12 * m21);
+}
+
+// Determinant of the 3x3 matrix given element by element (mRC = row R, column C),
+// computed by cofactor expansion along the first column (m11, m21, m31).
+float Matrix::det(float m11, float m12, float m13, 
+                  float m21, float m22, float m23, 
+                  float m31, float m32, float m33)
+{
+	return m11 * (m22 * m33 - m32 * m23) -
+	       m21 * (m12 * m33 - m32 * m13) +
+	       m31 * (m12 * m23 - m22 * m13);
+}
+
+// Determinant of the 4x4 matrix given element by element (mRC = row R, column C).
+// Same cofactor expansion along the first column as det(const Matrix &).
+float Matrix::det(float m11, float m12, float m13, float m14, 
+                  float m21, float m22, float m23, float m24, 
+                  float m31, float m32, float m33, float m34, 
+                  float m41, float m42, float m43, float m44)
+{
+	// 2x2 sub-determinants taken from columns 3 and 4, one per pair of rows.
+	float M3344 = m33 * m44 - m43 * m34;
+	float M2344 = m23 * m44 - m43 * m24;
+	float M2334 = m23 * m34 - m33 * m24;
+	float M1344 = m13 * m44 - m43 * m14;
+	float M1334 = m13 * m34 - m33 * m14;
+	float M1324 = m13 * m24 - m23 * m14;
+
+	// Signs alternate +, -, +, - down the first column.
+	return m11 * (m22 * M3344 - m32 * M2344 + m42 * M2334) -
+	       m21 * (m12 * M3344 - m32 * M1344 + m42 * M1334) +
+	       m31 * (m12 * M2344 - m22 * M1344 + m42 * M1324) -
+	       m41 * (m12 * M2334 - m22 * M1334 + m32 * M1324);
+}
+
+// Determinant formed from three vectors.
+// NOTE(review): presumably Vector's operator* is the dot product and operator% the cross
+// product, which would make this the scalar triple product v1 . (v2 x v3) - confirm in Vector.hpp.
+float Matrix::det(const Vector &v1, const Vector &v2, const Vector &v3)
+{
+	return v1 * (v2 % v3);
+}
+
+// Determinant of the upper-left 3x3 submatrix of M (rows and columns 1..3),
+// computed by cofactor expansion along its first column. Row 4 and column 4 are ignored.
+float Matrix::det3(const Matrix &M)
+{
+	return M(1, 1) * (M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3)) -
+	       M(2, 1) * (M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3)) +
+	       M(3, 1) * (M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3));
+}
+
+// Trace of M: the sum of the four diagonal elements, accumulated from (1, 1) to (4, 4).
+float Matrix::tr(const Matrix &M)
+{
+	float trace = M(1, 1);
+
+	for(int i = 2; i <= 4; i++)
+	{
+		trace += M(i, i);
+	}
+
+	return trace;
+}
+
+// Orthonormalises the three column vectors of the upper-left 3x3 submatrix in place
+// and returns this matrix. Row 4 and column 4 are left untouched.
+Matrix &Matrix::orthogonalise()
+{
+	// NOTE: Numerically unstable, won't return exactly the same result when already orthogonal
+
+	Matrix &M = *this;
+
+	// Columns 1..3 of the 3x3 submatrix.
+	Vector v1(M(1, 1), M(2, 1), M(3, 1));
+	Vector v2(M(1, 2), M(2, 2), M(3, 2));
+	Vector v3(M(1, 3), M(2, 3), M(3, 3));
+
+	// Remove from each column its projection onto the previously processed columns.
+	// NOTE(review): presumably Vector * Vector is the dot product - confirm in Vector.hpp.
+	v2 -= v1 * (v1 * v2) / (v1 * v1);
+	v3 -= v1 * (v1 * v3) / (v1 * v1);
+	v3 -= v2 * (v2 * v3) / (v2 * v2);
+
+	// NOTE(review): presumably Vector::N returns the vector's length, making this a
+	// normalisation step - confirm in Vector.hpp.
+	v1 /= Vector::N(v1);
+	v2 /= Vector::N(v2);
+	v3 /= Vector::N(v3);
+
+	// Write the processed columns back.
+	M(1, 1) = v1.x;  M(1, 2) = v2.x;  M(1, 3) = v3.x;
+	M(2, 1) = v1.y;  M(2, 2) = v2.y;  M(2, 3) = v3.y;
+	M(3, 1) = v1.z;  M(3, 2) = v2.z;  M(3, 3) = v3.z;
+
+	return *this;
+}
+
+// Builds a rotation matrix from the three angles stored in v.x, v.y and v.z.
+// The angles are passed straight to cos()/sin().
+// NOTE(review): the composition order of the three axis rotations is not stated here -
+// derive it from the element formulas below before relying on a particular convention.
+Matrix Matrix::eulerRotate(const Vector &v)
+{
+	float cz = cos(v.z);
+	float sz = sin(v.z);
+	float cx = cos(v.x);
+	float sx = sin(v.x);
+	float cy = cos(v.y);
+	float sy = sin(v.y);
+
+	// Products shared by several elements.
+	float sxsy = sx * sy;
+	float sxcy = sx * cy;
+
+	// Nine-argument constructor: fills the 3x3 block and sets the fourth row/column to (0, 0, 0, 1).
+	return Matrix(cy * cz - sxsy * sz, -cy * sz - sxsy * cz, -sy * cx,
+	              cx * sz,              cx * cz,             -sx,
+	              sy * cz + sxcy * sz, -sy * sz + sxcy * cz,  cy * cx);
+}
+
+// Convenience overload: packs the three angles into a Vector and forwards to eulerRotate(const Vector &).
+Matrix Matrix::eulerRotate(float x, float y, float z)
+{
+	const Vector angles(x, y, z);
+
+	return eulerRotate(angles);
+}
+
+// Translation matrix: identity with v.x, v.y, v.z placed in the fourth column of rows 1..3.
+Matrix Matrix::translate(const Vector &v)
+{
+	Matrix translation(1);   // Identity
+
+	translation(1, 4) = v.x;
+	translation(2, 4) = v.y;
+	translation(3, 4) = v.z;
+
+	return translation;
+}
+
+// Convenience overload: packs the offsets into a Vector and forwards to translate(const Vector &).
+Matrix Matrix::translate(float x, float y, float z)
+{
+	const Vector offset(x, y, z);
+
+	return translate(offset);
+}
+
+// Scaling matrix: diagonal (v.x, v.y, v.z, 1), all other elements zero.
+Matrix Matrix::scale(const Vector &v)
+{
+	Matrix scaling(1);   // Identity
+
+	scaling(1, 1) = v.x;
+	scaling(2, 2) = v.y;
+	scaling(3, 3) = v.z;
+
+	return scaling;
+}
+
+// Convenience overload: packs the factors into a Vector and forwards to scale(const Vector &).
+Matrix Matrix::scale(float x, float y, float z)
+{
+	const Vector factors(x, y, z);
+
+	return scale(factors);
+}
+
+// Builds a matrix from three axes derived from the direction v: y along v, x from
+// y % (0, 0, 1), and z from x % y, each divided by its Vector::N value.
+// NOTE(review): presumably % is the cross product and Vector::N the length, so the axes
+// would be orthonormal - confirm in Vector.hpp. If so, a v parallel to (0, 0, 1) makes x
+// the zero vector and the division below a division by zero.
+Matrix Matrix::lookAt(const Vector &v)
+{
+	Vector y = v;
+	y /= Vector::N(y);
+
+	Vector x = y % Vector(0, 0, 1);
+	x /= Vector::N(x);
+
+	Vector z = x % y;
+	z /= Vector::N(z);
+
+	// Matrix(x, y, z) stores the axes as columns; the transpose turns them into rows.
+	return ~Matrix(x, y, z);
+}
+
+// Convenience overload: packs the direction into a Vector and forwards to lookAt(const Vector &).
+Matrix Matrix::lookAt(float x, float y, float z)
+{
+	// Previously forwarded to translate() (copy/paste slip), which returned a translation
+	// matrix instead of the look-at matrix.
+	return lookAt(Vector(x, y, z));
+}
+
+}  // namespace sw

diff --git a/src/Device/Matrix.hpp b/src/Device/Matrix.hpp
index 41281a6..e4f5ecc 100644
--- a/src/Device/Matrix.hpp
+++ b/src/Device/Matrix.hpp

@@ -15,203 +15,204 @@
 #ifndef Matrix_hpp
 #define Matrix_hpp
 
-namespace sw
+namespace sw {
+
+struct Vector;
+struct Point;
+struct float4;
+
+// 4x4 matrix of floats stored in row-major order. Elements are addressed either
+// zero-based through operator[] ([row][col]) or one-based through operator() ((row, col)).
+struct Matrix
 {
-	struct Vector;
-	struct Point;
-	struct float4;
+	Matrix();
+	Matrix(const int i);
+	Matrix(const float m[16]);
+	Matrix(const float m[4][4]);
+	Matrix(float m11, float m12, float m13,
+	       float m21, float m22, float m23,
+	       float m31, float m32, float m33);
+	Matrix(float m11, float m12, float m13, float m14,
+	       float m21, float m22, float m23, float m24,
+	       float m31, float m32, float m33, float m34,
+	       float m41, float m42, float m43, float m44);
+	Matrix(const Vector &v1, const Vector &v2, const Vector &v3);   // Column vectors
 
-	struct Matrix
-	{
-		Matrix();
-		Matrix(const int i);
-		Matrix(const float m[16]);
-		Matrix(const float m[4][4]);
-		Matrix(float m11, float m12, float m13,
-		       float m21, float m22, float m23,
-		       float m31, float m32, float m33);
-		Matrix(float m11, float m12, float m13, float m14,
-		       float m21, float m22, float m23, float m24,
-		       float m31, float m32, float m33, float m34,
-		       float m41, float m42, float m43, float m44);
-		Matrix(const Vector &v1, const Vector &v2, const Vector &v3);   // Column vectors
+	Matrix &operator=(const Matrix &N);
 
-		Matrix &operator=(const Matrix &N);
+	// Row major order
+	float m[4][4];
 
-		// Row major order
-		float m[4][4];
+	static Matrix diag(float m11, float m22, float m33, float m44);
 
-		static Matrix diag(float m11, float m22, float m33, float m44);
+	operator float*();
 
-		operator float*();
+	Matrix operator+() const;
+	Matrix operator-() const;
 
-		Matrix operator+() const;
-		Matrix operator-() const;
+	Matrix operator!() const;   // Inverse
+	Matrix operator~() const;   // Transpose
 
-		Matrix operator!() const;   // Inverse
-		Matrix operator~() const;   // Transpose
+	Matrix &operator+=(const Matrix &N);
+	Matrix &operator-=(const Matrix &N);
+	Matrix &operator*=(float s);
+	Matrix &operator*=(const Matrix &N);
+	Matrix &operator/=(float s);
 
-		Matrix &operator+=(const Matrix &N);
-		Matrix &operator-=(const Matrix &N);
-		Matrix &operator*=(float s);
-		Matrix &operator*=(const Matrix &N);
-		Matrix &operator/=(float s);
+	float *operator[](int i);   // Access element [row][col], starting with [0][0]
+	const float *operator[](int i) const;
 
-		float *operator[](int i);   // Access element [row][col], starting with [0][0]
-		const float *operator[](int i) const;
+	float &operator()(int i, int j);   // Access element (row, col), starting with (1, 1)
+	const float &operator()(int i, int j) const;
 
-		float &operator()(int i, int j);   // Access element (row, col), starting with (1, 1)
-		const float &operator()(int i, int j) const;
+	friend bool operator==(const Matrix &M, const Matrix &N);
+	friend bool operator!=(const Matrix &M, const Matrix &N);
 
-		friend bool operator==(const Matrix &M, const Matrix &N);
-		friend bool operator!=(const Matrix &M, const Matrix &N);
+	friend Matrix operator+(const Matrix &M, const Matrix &N);
+	friend Matrix operator-(const Matrix &M, const Matrix &N);
+	friend Matrix operator*(float s, const Matrix &M);
+	friend Matrix operator*(const Matrix &M, const Matrix &N);
+	friend Matrix operator/(const Matrix &M, float s);
 
-		friend Matrix operator+(const Matrix &M, const Matrix &N);
-		friend Matrix operator-(const Matrix &M, const Matrix &N);
-		friend Matrix operator*(float s, const Matrix &M);
-		friend Matrix operator*(const Matrix &M, const Matrix &N);
-		friend Matrix operator/(const Matrix &M, float s);
+	float4 operator*(const float4 &v) const;
 
-		float4 operator*(const float4 &v) const;
+	static float det(const Matrix &M);
+	static float det(float m11);
+	static float det(float m11, float m12,
+	                 float m21, float m22);
+	static float det(float m11, float m12, float m13,
+	                 float m21, float m22, float m23,
+	                 float m31, float m32, float m33);
+	static float det(float m11, float m12, float m13, float m14,
+	                 float m21, float m22, float m23, float m24,
+	                 float m31, float m32, float m33, float m34,
+	                 float m41, float m42, float m43, float m44);
+	static float det(const Vector &v1, const Vector &v2, const Vector &v3);
+	static float det3(const Matrix &M);
 
-		static float det(const Matrix &M);
-		static float det(float m11);
-		static float det(float m11, float m12,
-		                 float m21, float m22);
-		static float det(float m11, float m12, float m13,
-		                 float m21, float m22, float m23,
-		                 float m31, float m32, float m33);
-		static float det(float m11, float m12, float m13, float m14,
-		                 float m21, float m22, float m23, float m24,
-		                 float m31, float m32, float m33, float m34,
-		                 float m41, float m42, float m43, float m44);
-		static float det(const Vector &v1, const Vector &v2, const Vector &v3);
-		static float det3(const Matrix &M);
+	static float tr(const Matrix &M);
 
-		static float tr(const Matrix &M);
+	Matrix &orthogonalise();   // Gram-Schmidt orthogonalisation of 3x3 submatrix
 
-		Matrix &orthogonalise();   // Gram-Schmidt orthogonalisation of 3x3 submatrix
+	static Matrix eulerRotate(const Vector &v);
+	static Matrix eulerRotate(float x, float y, float z);
 
-		static Matrix eulerRotate(const Vector &v);
-		static Matrix eulerRotate(float x, float y, float z);
+	static Matrix translate(const Vector &v);
+	static Matrix translate(float x, float y, float z);
 	
-		static Matrix translate(const Vector &v);
-		static Matrix translate(float x, float y, float z);
-		
-		static Matrix scale(const Vector &v);
-		static Matrix scale(float x, float y, float z);
+	static Matrix scale(const Vector &v);
+	static Matrix scale(float x, float y, float z);
 
-		static Matrix lookAt(const Vector &v);
-		static Matrix lookAt(float x, float y, float z);
-	};
+	static Matrix lookAt(const Vector &v);
+	static Matrix lookAt(float x, float y, float z);
+};
-}
+}  // namespace sw
 
 #include "Vector.hpp"
 
-namespace sw
+namespace sw {
+
+// Default constructor: the body is empty, so the elements are left uninitialized.
+inline Matrix::Matrix()
 {
-	inline Matrix::Matrix()
-	{
-	}
-
-	inline Matrix::Matrix(const int i)
-	{
-		const float s = (float)i;
-
-		Matrix &M = *this;
-
-		M(1, 1) = s; M(1, 2) = 0; M(1, 3) = 0; M(1, 4) = 0;
-		M(2, 1) = 0; M(2, 2) = s; M(2, 3) = 0; M(2, 4) = 0;
-		M(3, 1) = 0; M(3, 2) = 0; M(3, 3) = s; M(3, 4) = 0;
-		M(4, 1) = 0; M(4, 2) = 0; M(4, 3) = 0; M(4, 4) = s;
-	}
-
-	inline Matrix::Matrix(const float m[16])
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = m[0];  M(1, 2) = m[1];  M(1, 3) = m[2];  M(1, 4) = m[3];
-		M(2, 1) = m[4];  M(2, 2) = m[5];  M(2, 3) = m[6];  M(2, 4) = m[7];
-		M(3, 1) = m[8];  M(3, 2) = m[8];  M(3, 3) = m[10]; M(3, 4) = m[11];
-		M(4, 1) = m[12]; M(4, 2) = m[13]; M(4, 3) = m[14]; M(4, 4) = m[15];
-	}
-
-	inline Matrix::Matrix(const float m[4][4])
-	{
-		Matrix &M = *this;
-
-		M[0][0] = m[0][0];  M[0][1] = m[0][1];  M[0][2] = m[0][2];  M[0][3] = m[0][3];
-		M[1][0] = m[1][0];  M[1][1] = m[1][1];  M[1][2] = m[1][2];  M[1][3] = m[1][3];
-		M[2][0] = m[2][0];  M[2][1] = m[2][1];  M[2][2] = m[2][2];  M[2][3] = m[2][3];
-		M[3][0] = m[3][0];  M[3][1] = m[3][1];  M[3][2] = m[3][2];  M[3][3] = m[3][3];
-	}
-
-	inline Matrix::Matrix(float m11, float m12, float m13, 
-	                      float m21, float m22, float m23, 
-	                      float m31, float m32, float m33)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = m11; M(1, 2) = m12; M(1, 3) = m13; M(1, 4) = 0;
-		M(2, 1) = m21; M(2, 2) = m22; M(2, 3) = m23; M(2, 4) = 0;
-		M(3, 1) = m31; M(3, 2) = m32; M(3, 3) = m33; M(3, 4) = 0;
-		M(4, 1) = 0;   M(4, 2) = 0;   M(4, 3) = 0;   M(4, 4) = 1;
-	}
-
-	inline Matrix::Matrix(float m11, float m12, float m13, float m14, 
-	                      float m21, float m22, float m23, float m24, 
-	                      float m31, float m32, float m33, float m34, 
-	                      float m41, float m42, float m43, float m44)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = m11; M(1, 2) = m12; M(1, 3) = m13; M(1, 4) = m14;
-		M(2, 1) = m21; M(2, 2) = m22; M(2, 3) = m23; M(2, 4) = m24;
-		M(3, 1) = m31; M(3, 2) = m32; M(3, 3) = m33; M(3, 4) = m34;
-		M(4, 1) = m41; M(4, 2) = m42; M(4, 3) = m43; M(4, 4) = m44;
-	}
-
-	inline Matrix::Matrix(const Vector &v1, const Vector &v2, const Vector &v3)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = v1.x; M(1, 2) = v2.x; M(1, 3) = v3.x; M(1, 4) = 0;
-		M(2, 1) = v1.y; M(2, 2) = v2.y; M(2, 3) = v3.y; M(2, 4) = 0;
-		M(3, 1) = v1.z; M(3, 2) = v2.z; M(3, 3) = v3.z; M(3, 4) = 0;
-		M(4, 1) = 0;    M(4, 2) = 0;    M(4, 3) = 0;    M(4, 4) = 1;
-	}
-
-	inline Matrix &Matrix::operator=(const Matrix &N)
-	{
-		Matrix &M = *this;
-
-		M(1, 1) = N(1, 1); M(1, 2) = N(1, 2); M(1, 3) = N(1, 3); M(1, 4) = N(1, 4);
-		M(2, 1) = N(2, 1); M(2, 2) = N(2, 2); M(2, 3) = N(2, 3); M(2, 4) = N(2, 4);
-		M(3, 1) = N(3, 1); M(3, 2) = N(3, 2); M(3, 3) = N(3, 3); M(3, 4) = N(3, 4);
-		M(4, 1) = N(4, 1); M(4, 2) = N(4, 2); M(4, 3) = N(4, 3); M(4, 4) = N(4, 4);
-
-		return M;
-	}
-
-	inline float *Matrix::operator[](int i)
-	{
-		return m[i];
-	}
-
-	inline const float *Matrix::operator[](int i) const
-	{
-		return m[i];
-	}
-
-	inline float &Matrix::operator()(int i, int j)
-	{
-		return m[i - 1][j - 1];
-	}
-
-	inline const float &Matrix::operator()(int i, int j) const
-	{
-		return m[i - 1][j - 1];
-	}
 }
 
+// Diagonal matrix with the value i (converted to float) on the main diagonal and
+// zero everywhere else; Matrix(1) is the identity.
+inline Matrix::Matrix(const int i)
+{
+	const float diagonal = (float)i;
+
+	for(int row = 0; row < 4; row++)
+	{
+		for(int col = 0; col < 4; col++)
+		{
+			m[row][col] = (row == col) ? diagonal : 0.0f;
+		}
+	}
+}
+
+// Constructs the matrix from 16 consecutive floats in row-major order:
+// element [row][col] (zero-based) is read from m[4 * row + col].
+inline Matrix::Matrix(const float m[16])
+{
+	// The parameter shadows the member array of the same name, so go through *this.
+	Matrix &M = *this;
+
+	// A loop replaces the 16 hand-written assignments; the old code loaded M(3, 2) from
+	// m[8] instead of m[9].
+	for(int row = 0; row < 4; row++)
+	{
+		for(int col = 0; col < 4; col++)
+		{
+			M[row][col] = m[4 * row + col];
+		}
+	}
+}
+
+// Constructs the matrix as an element-for-element copy of a [row][col] array.
+inline Matrix::Matrix(const float m[4][4])
+{
+	for(int row = 0; row < 4; row++)
+	{
+		for(int col = 0; col < 4; col++)
+		{
+			// The parameter shadows the member array, hence the explicit this->.
+			this->m[row][col] = m[row][col];
+		}
+	}
+}
+
+// Constructs the matrix from a 3x3 block (mRC = row R, column C); the fourth row and
+// column are filled with (0, 0, 0, 1).
+inline Matrix::Matrix(float m11, float m12, float m13,
+                      float m21, float m22, float m23,
+                      float m31, float m32, float m33)
+{
+	const float rows[4][4] =
+	{
+		{m11,  m12,  m13,  0.0f},
+		{m21,  m22,  m23,  0.0f},
+		{m31,  m32,  m33,  0.0f},
+		{0.0f, 0.0f, 0.0f, 1.0f},
+	};
+
+	for(int row = 0; row < 4; row++)
+	{
+		for(int col = 0; col < 4; col++)
+		{
+			m[row][col] = rows[row][col];
+		}
+	}
+}
+
+// Constructs the matrix from all 16 elements (mRC = row R, column C).
+inline Matrix::Matrix(float m11, float m12, float m13, float m14,
+                      float m21, float m22, float m23, float m24,
+                      float m31, float m32, float m33, float m34,
+                      float m41, float m42, float m43, float m44)
+{
+	const float rows[4][4] =
+	{
+		{m11, m12, m13, m14},
+		{m21, m22, m23, m24},
+		{m31, m32, m33, m34},
+		{m41, m42, m43, m44},
+	};
+
+	for(int row = 0; row < 4; row++)
+	{
+		for(int col = 0; col < 4; col++)
+		{
+			m[row][col] = rows[row][col];
+		}
+	}
+}
+
+// Constructs the matrix with v1, v2 and v3 as the first three columns; the fourth row
+// and column are filled with (0, 0, 0, 1).
+inline Matrix::Matrix(const Vector &v1, const Vector &v2, const Vector &v3)
+{
+	const Vector *columns[3] = {&v1, &v2, &v3};
+
+	for(int col = 0; col < 3; col++)
+	{
+		m[0][col] = columns[col]->x;
+		m[1][col] = columns[col]->y;
+		m[2][col] = columns[col]->z;
+		m[3][col] = 0;
+	}
+
+	m[0][3] = 0;
+	m[1][3] = 0;
+	m[2][3] = 0;
+	m[3][3] = 1;
+}
+
+// Copies all 16 elements of N into this matrix and returns this matrix.
+inline Matrix &Matrix::operator=(const Matrix &N)
+{
+	for(int row = 0; row < 4; row++)
+	{
+		for(int col = 0; col < 4; col++)
+		{
+			m[row][col] = N.m[row][col];
+		}
+	}
+
+	return *this;
+}
+
+// Returns a pointer to the first element of row i (zero-based), so M[i][j] addresses
+// element [row][col]. No bounds checking is performed.
+inline float *Matrix::operator[](int i)
+{
+	return &m[i][0];
+}
+
+// Read-only variant: returns a pointer to the first element of row i (zero-based).
+// No bounds checking is performed.
+inline const float *Matrix::operator[](int i) const
+{
+	return &m[i][0];
+}
+
+// Returns a reference to element (i, j) using one-based row/column indices.
+// No bounds checking is performed.
+inline float &Matrix::operator()(int i, int j)
+{
+	const int row = i - 1;
+	const int col = j - 1;
+
+	return m[row][col];
+}
+
+// Read-only variant: returns element (i, j) using one-based row/column indices.
+// No bounds checking is performed.
+inline const float &Matrix::operator()(int i, int j) const
+{
+	const int row = i - 1;
+	const int col = j - 1;
+
+	return m[row][col];
+}
+
+}  // namespace sw
+
 #endif   // Matrix_hpp

diff --git a/src/Device/Memset.hpp b/src/Device/Memset.hpp
index 8c015fa..9db5d47 100644
--- a/src/Device/Memset.hpp
+++ b/src/Device/Memset.hpp

@@ -18,35 +18,35 @@
 #include <cstring>
 #include <type_traits>
 
-namespace sw
+namespace sw {
+
+// Helper class for filling the memory of objects with a byte value at construction
+// (typically 0, to clear them).
+// Useful as the first base class of cache keys which may contain padding
+// bytes or bits otherwise left uninitialized.
+template<class T>
+struct Memset
 {
-	// Helper class for clearing the memory of objects at construction.
-	// Useful as the first base class of cache keys which may contain padding
-	// bytes or bits otherwise left uninitialized.
-	template<class T>
-	struct Memset
+	// Fills all sizeof(T) bytes of *object with val (converted to unsigned char by memset).
+	// object must point to the T of which this Memset<T> is a base class.
+	Memset(T *object, int val)
 	{
-		Memset(T *object, int val)
-		{
-			static_assert(std::is_base_of<Memset<T>, T>::value, "Memset<T> must only clear the memory of a type of which it is a base class");
+		static_assert(std::is_base_of<Memset<T>, T>::value, "Memset<T> must only clear the memory of a type of which it is a base class");
 
-			// GCC 8+ warns that
-			// "‘void* memset(void*, int, size_t)’ clearing an object of non-trivial type ‘T’;
-			//  use assignment or value-initialization instead [-Werror=class-memaccess]"
-			// This is benign iff it happens before any of the base or member constructrs are called.
-			#if defined(__GNUC__) && (__GNUC__ >= 8)
-			#pragma GCC diagnostic push
-			#pragma GCC diagnostic ignored "-Wclass-memaccess"
-			#endif
+		// GCC 8+ warns that
+		// "‘void* memset(void*, int, size_t)’ clearing an object of non-trivial type ‘T’;
+		//  use assignment or value-initialization instead [-Werror=class-memaccess]"
+		// This is benign iff it happens before any of the base or member constructors are called.
+		#if defined(__GNUC__) && (__GNUC__ >= 8)
+		#pragma GCC diagnostic push
+		#pragma GCC diagnostic ignored "-Wclass-memaccess"
+		#endif
 
-			memset(object, 0, sizeof(T));
+		// Honour the requested fill value; it used to be silently ignored in favour of 0.
+		memset(object, val, sizeof(T));
 
-			#if defined(__GNUC__) && (__GNUC__ >= 8)
-			#pragma GCC diagnostic pop
-			#endif
-		}
-	};
+		#if defined(__GNUC__) && (__GNUC__ >= 8)
+		#pragma GCC diagnostic pop
+		#endif
+	}
+};
 
-}
+}  // namespace sw
 
 #endif   // sw_Memset_hpp
\ No newline at end of file

diff --git a/src/Device/PixelProcessor.cpp b/src/Device/PixelProcessor.cpp
index 26731a2..a1e8df1 100644
--- a/src/Device/PixelProcessor.cpp
+++ b/src/Device/PixelProcessor.cpp

@@ -22,211 +22,212 @@
 
 #include <cstring>
 
-namespace sw
+namespace sw {
+
+// Computes a hash of this States object by XOR-ing its memory together one 32-bit word
+// at a time. Only whole words are covered: sizeof(States) / sizeof(uint32_t) truncates,
+// so any trailing bytes beyond a multiple of four do not contribute.
+uint32_t PixelProcessor::States::computeHash()
 {
-	uint32_t PixelProcessor::States::computeHash()
+	uint32_t *state = reinterpret_cast<uint32_t*>(this);
+	uint32_t hash = 0;
+
+	for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
 	{
-		uint32_t *state = reinterpret_cast<uint32_t*>(this);
-		uint32_t hash = 0;
-
-		for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
-		{
-			hash ^= state[i];
-		}
-
-		return hash;
+		hash ^= state[i];
 	}
 
-	bool PixelProcessor::State::operator==(const State &state) const
-	{
-		if(hash != state.hash)
-		{
-			return false;
-		}
-
-		static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
-		return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
-	}
-
-	PixelProcessor::PixelProcessor()
-	{
-		routineCache = nullptr;
-		setRoutineCacheSize(1024);
-	}
-
-	PixelProcessor::~PixelProcessor()
-	{
-		delete routineCache;
-		routineCache = nullptr;
-	}
-
-	void PixelProcessor::setBlendConstant(const Color<float> &blendConstant)
-	{
-		// TODO(b/140935644): Compact into generic function, cheack if clamp is required
-		factor.blendConstant4W[0][0] =
-		factor.blendConstant4W[0][1] =
-		factor.blendConstant4W[0][2] =
-		factor.blendConstant4W[0][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.r));
-
-		factor.blendConstant4W[1][0] =
-		factor.blendConstant4W[1][1] =
-		factor.blendConstant4W[1][2] =
-		factor.blendConstant4W[1][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.g));
-
-		factor.blendConstant4W[2][0] =
-		factor.blendConstant4W[2][1] =
-		factor.blendConstant4W[2][2] =
-		factor.blendConstant4W[2][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.b));
-
-		factor.blendConstant4W[3][0] =
-		factor.blendConstant4W[3][1] =
-		factor.blendConstant4W[3][2] =
-		factor.blendConstant4W[3][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.a));
-
-		factor.invBlendConstant4W[0][0] =
-		factor.invBlendConstant4W[0][1] =
-		factor.invBlendConstant4W[0][2] =
-		factor.invBlendConstant4W[0][3] = 0xFFFFu - factor.blendConstant4W[0][0];
-
-		factor.invBlendConstant4W[1][0] =
-		factor.invBlendConstant4W[1][1] =
-		factor.invBlendConstant4W[1][2] =
-		factor.invBlendConstant4W[1][3] = 0xFFFFu - factor.blendConstant4W[1][0];
-
-		factor.invBlendConstant4W[2][0] =
-		factor.invBlendConstant4W[2][1] =
-		factor.invBlendConstant4W[2][2] =
-		factor.invBlendConstant4W[2][3] = 0xFFFFu - factor.blendConstant4W[2][0];
-
-		factor.invBlendConstant4W[3][0] =
-		factor.invBlendConstant4W[3][1] =
-		factor.invBlendConstant4W[3][2] =
-		factor.invBlendConstant4W[3][3] = 0xFFFFu - factor.blendConstant4W[3][0];
-
-		factor.blendConstant4F[0][0] =
-		factor.blendConstant4F[0][1] =
-		factor.blendConstant4F[0][2] =
-		factor.blendConstant4F[0][3] = blendConstant.r;
-
-		factor.blendConstant4F[1][0] =
-		factor.blendConstant4F[1][1] =
-		factor.blendConstant4F[1][2] =
-		factor.blendConstant4F[1][3] = blendConstant.g;
-
-		factor.blendConstant4F[2][0] =
-		factor.blendConstant4F[2][1] =
-		factor.blendConstant4F[2][2] =
-		factor.blendConstant4F[2][3] = blendConstant.b;
-
-		factor.blendConstant4F[3][0] =
-		factor.blendConstant4F[3][1] =
-		factor.blendConstant4F[3][2] =
-		factor.blendConstant4F[3][3] = blendConstant.a;
-
-		factor.invBlendConstant4F[0][0] =
-		factor.invBlendConstant4F[0][1] =
-		factor.invBlendConstant4F[0][2] =
-		factor.invBlendConstant4F[0][3] = 1 - blendConstant.r;
-
-		factor.invBlendConstant4F[1][0] =
-		factor.invBlendConstant4F[1][1] =
-		factor.invBlendConstant4F[1][2] =
-		factor.invBlendConstant4F[1][3] = 1 - blendConstant.g;
-
-		factor.invBlendConstant4F[2][0] =
-		factor.invBlendConstant4F[2][1] =
-		factor.invBlendConstant4F[2][2] =
-		factor.invBlendConstant4F[2][3] = 1 - blendConstant.b;
-
-		factor.invBlendConstant4F[3][0] =
-		factor.invBlendConstant4F[3][1] =
-		factor.invBlendConstant4F[3][2] =
-		factor.invBlendConstant4F[3][3] = 1 - blendConstant.a;
-	}
-
-	void PixelProcessor::setRoutineCacheSize(int cacheSize)
-	{
-		delete routineCache;
-		routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
-	}
-
-	const PixelProcessor::State PixelProcessor::update(const Context* context) const
-	{
-		State state;
-
-		state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
-		state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
-
-		if(context->pixelShader)
-		{
-			state.shaderID = context->pixelShader->getSerialID();
-		}
-		else
-		{
-			state.shaderID = 0;
-		}
-
-		state.alphaToCoverage = context->alphaToCoverage;
-		state.depthWriteEnable = context->depthWriteActive();
-
-		if(context->stencilActive())
-		{
-			state.stencilActive = true;
-			state.frontStencil = context->frontStencil;
-			state.backStencil = context->backStencil;
-		}
-
-		if(context->depthBufferActive())
-		{
-			state.depthTestActive = true;
-			state.depthCompareMode = context->depthCompareMode;
-			state.depthFormat = context->depthBuffer->getFormat();
-		}
-
-		state.occlusionEnabled = context->occlusionEnabled;
-		state.depthClamp = (context->depthBias != 0.0f) || (context->slopeDepthBias != 0.0f);
-
-		for(int i = 0; i < RENDERTARGETS; i++)
-		{
-			state.colorWriteMask |= context->colorWriteActive(i) << (4 * i);
-			state.targetFormat[i] = context->renderTargetInternalFormat(i);
-			state.blendState[i] = context->getBlendState(i);
-		}
-
-		state.multiSample = static_cast<unsigned int>(context->sampleCount);
-		state.multiSampleMask = context->multiSampleMask;
-		state.multiSampledBresenham = (state.multiSample > 1) && context->isDrawLine(true) &&
-		                              (context->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT);
-
-		if(state.multiSample > 1 && context->pixelShader)
-		{
-			state.centroid = context->pixelShader->getModes().NeedsCentroid;
-		}
-
-		state.frontFace = context->frontFace;
-
-		state.hash = state.computeHash();
-
-		return state;
-	}
-
-	PixelProcessor::RoutineType PixelProcessor::routine(const State &state,
-		vk::PipelineLayout const *pipelineLayout,
-		SpirvShader const *pixelShader,
-		const vk::DescriptorSet::Bindings &descriptorSets)
-	{
-		auto routine = routineCache->query(state);
-
-		if(!routine)
-		{
-			QuadRasterizer *generator = new PixelProgram(state, pipelineLayout, pixelShader, descriptorSets);
-			generator->generate();
-			routine = (*generator)("PixelRoutine_%0.8X", state.shaderID);
-			delete generator;
-
-			routineCache->add(state, routine);
-		}
-
-		return routine;
-	}
+	return hash;
 }
+
+// Two states are equal when their hashes match and the States portions of both
+// objects are bytewise identical. The hash comparison is a cheap early-out.
+bool PixelProcessor::State::operator==(const State &state) const
+{
+	static_assert(is_memcmparable<State>::value, "Cannot memcmp State");
+
+	if(hash == state.hash)
+	{
+		const States *lhs = static_cast<const States*>(this);
+		const States *rhs = static_cast<const States*>(&state);
+
+		return memcmp(lhs, rhs, sizeof(States)) == 0;
+	}
+
+	return false;
+}
+
+// Creates the processor and allocates its routine cache with a requested size of 1024.
+PixelProcessor::PixelProcessor()
+{
+	// Null the pointer first: setRoutineCacheSize() deletes the current cache before
+	// allocating the new one.
+	routineCache = nullptr;
+	setRoutineCacheSize(1024);
+}
+
+// Releases the routine cache and clears the pointer.
+PixelProcessor::~PixelProcessor()
+{
+	delete routineCache;
+	routineCache = nullptr;
+}
+
+void PixelProcessor::setBlendConstant(const Color<float> &blendConstant)
+{
+	// TODO(b/140935644): Compact into generic function, check if clamp is required
+	factor.blendConstant4W[0][0] =
+	factor.blendConstant4W[0][1] =
+	factor.blendConstant4W[0][2] =
+	factor.blendConstant4W[0][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.r));
+
+	factor.blendConstant4W[1][0] =
+	factor.blendConstant4W[1][1] =
+	factor.blendConstant4W[1][2] =
+	factor.blendConstant4W[1][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.g));
+
+	factor.blendConstant4W[2][0] =
+	factor.blendConstant4W[2][1] =
+	factor.blendConstant4W[2][2] =
+	factor.blendConstant4W[2][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.b));
+
+	factor.blendConstant4W[3][0] =
+	factor.blendConstant4W[3][1] =
+	factor.blendConstant4W[3][2] =
+	factor.blendConstant4W[3][3] = static_cast<uint16_t>(iround(65535.0f * blendConstant.a));
+
+	factor.invBlendConstant4W[0][0] =
+	factor.invBlendConstant4W[0][1] =
+	factor.invBlendConstant4W[0][2] =
+	factor.invBlendConstant4W[0][3] = 0xFFFFu - factor.blendConstant4W[0][0];
+
+	factor.invBlendConstant4W[1][0] =
+	factor.invBlendConstant4W[1][1] =
+	factor.invBlendConstant4W[1][2] =
+	factor.invBlendConstant4W[1][3] = 0xFFFFu - factor.blendConstant4W[1][0];
+
+	factor.invBlendConstant4W[2][0] =
+	factor.invBlendConstant4W[2][1] =
+	factor.invBlendConstant4W[2][2] =
+	factor.invBlendConstant4W[2][3] = 0xFFFFu - factor.blendConstant4W[2][0];
+
+	factor.invBlendConstant4W[3][0] =
+	factor.invBlendConstant4W[3][1] =
+	factor.invBlendConstant4W[3][2] =
+	factor.invBlendConstant4W[3][3] = 0xFFFFu - factor.blendConstant4W[3][0];
+
+	factor.blendConstant4F[0][0] =
+	factor.blendConstant4F[0][1] =
+	factor.blendConstant4F[0][2] =
+	factor.blendConstant4F[0][3] = blendConstant.r;
+
+	factor.blendConstant4F[1][0] =
+	factor.blendConstant4F[1][1] =
+	factor.blendConstant4F[1][2] =
+	factor.blendConstant4F[1][3] = blendConstant.g;
+
+	factor.blendConstant4F[2][0] =
+	factor.blendConstant4F[2][1] =
+	factor.blendConstant4F[2][2] =
+	factor.blendConstant4F[2][3] = blendConstant.b;
+
+	factor.blendConstant4F[3][0] =
+	factor.blendConstant4F[3][1] =
+	factor.blendConstant4F[3][2] =
+	factor.blendConstant4F[3][3] = blendConstant.a;
+
+	factor.invBlendConstant4F[0][0] =
+	factor.invBlendConstant4F[0][1] =
+	factor.invBlendConstant4F[0][2] =
+	factor.invBlendConstant4F[0][3] = 1 - blendConstant.r;
+
+	factor.invBlendConstant4F[1][0] =
+	factor.invBlendConstant4F[1][1] =
+	factor.invBlendConstant4F[1][2] =
+	factor.invBlendConstant4F[1][3] = 1 - blendConstant.g;
+
+	factor.invBlendConstant4F[2][0] =
+	factor.invBlendConstant4F[2][1] =
+	factor.invBlendConstant4F[2][2] =
+	factor.invBlendConstant4F[2][3] = 1 - blendConstant.b;
+
+	factor.invBlendConstant4F[3][0] =
+	factor.invBlendConstant4F[3][1] =
+	factor.invBlendConstant4F[3][2] =
+	factor.invBlendConstant4F[3][3] = 1 - blendConstant.a;
+}
+
+void PixelProcessor::setRoutineCacheSize(int cacheSize)
+{
+	delete routineCache;
+	routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
+}
+
+const PixelProcessor::State PixelProcessor::update(const Context* context) const
+{
+	State state;
+
+	state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
+	state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
+
+	if(context->pixelShader)
+	{
+		state.shaderID = context->pixelShader->getSerialID();
+	}
+	else
+	{
+		state.shaderID = 0;
+	}
+
+	state.alphaToCoverage = context->alphaToCoverage;
+	state.depthWriteEnable = context->depthWriteActive();
+
+	if(context->stencilActive())
+	{
+		state.stencilActive = true;
+		state.frontStencil = context->frontStencil;
+		state.backStencil = context->backStencil;
+	}
+
+	if(context->depthBufferActive())
+	{
+		state.depthTestActive = true;
+		state.depthCompareMode = context->depthCompareMode;
+		state.depthFormat = context->depthBuffer->getFormat();
+	}
+
+	state.occlusionEnabled = context->occlusionEnabled;
+	state.depthClamp = (context->depthBias != 0.0f) || (context->slopeDepthBias != 0.0f);
+
+	for(int i = 0; i < RENDERTARGETS; i++)
+	{
+		state.colorWriteMask |= context->colorWriteActive(i) << (4 * i);
+		state.targetFormat[i] = context->renderTargetInternalFormat(i);
+		state.blendState[i] = context->getBlendState(i);
+	}
+
+	state.multiSample = static_cast<unsigned int>(context->sampleCount);
+	state.multiSampleMask = context->multiSampleMask;
+	state.multiSampledBresenham = (state.multiSample > 1) && context->isDrawLine(true) &&
+	                              (context->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT);
+
+	if(state.multiSample > 1 && context->pixelShader)
+	{
+		state.centroid = context->pixelShader->getModes().NeedsCentroid;
+	}
+
+	state.frontFace = context->frontFace;
+
+	state.hash = state.computeHash();
+
+	return state;
+}
+
+PixelProcessor::RoutineType PixelProcessor::routine(const State &state,
+	vk::PipelineLayout const *pipelineLayout,
+	SpirvShader const *pixelShader,
+	const vk::DescriptorSet::Bindings &descriptorSets)
+{
+	auto routine = routineCache->query(state);
+
+	if(!routine)
+	{
+		QuadRasterizer *generator = new PixelProgram(state, pipelineLayout, pixelShader, descriptorSets);
+		generator->generate();
+		routine = (*generator)("PixelRoutine_%0.8X", state.shaderID);
+		delete generator;
+
+		routineCache->add(state, routine);
+	}
+
+	return routine;
+}
+
+}  // namespace sw

diff --git a/src/Device/PixelProcessor.hpp b/src/Device/PixelProcessor.hpp
index f657a59..8bc19ab 100644
--- a/src/Device/PixelProcessor.hpp
+++ b/src/Device/PixelProcessor.hpp

@@ -20,148 +20,149 @@
 #include "Memset.hpp"
 #include "RoutineCache.hpp"
 
-namespace sw
+namespace sw {
+
+class PixelShader;
+class Rasterizer;
+struct Texture;
+struct DrawData;
+struct Primitive;
+
+using RasterizerFunction = FunctionT<void(const Primitive* primitive, int count, int cluster, int clusterCount, DrawData* draw)>;
+
+class PixelProcessor
 {
-	class PixelShader;
-	class Rasterizer;
-	struct Texture;
-	struct DrawData;
-	struct Primitive;
-
-	using RasterizerFunction = FunctionT<void(const Primitive* primitive, int count, int cluster, int clusterCount, DrawData* draw)>;
-
-	class PixelProcessor
+public:
+	struct States : Memset<States>
 	{
-	public:
-		struct States : Memset<States>
+		// Same as VkStencilOpState, but with no reference, as it's not part of the state
+		// (it doesn't require a different program to be generated)
+		struct StencilOpState
 		{
-			// Same as VkStencilOpState, but with no reference, as it's not part of the state
-			// (it doesn't require a different program to be generated)
-			struct StencilOpState
+			VkStencilOp    failOp;
+			VkStencilOp    passOp;
+			VkStencilOp    depthFailOp;
+			VkCompareOp    compareOp;
+			uint32_t       compareMask;
+			uint32_t       writeMask;
+
+			void operator=(const VkStencilOpState &rhs)
 			{
-				VkStencilOp    failOp;
-				VkStencilOp    passOp;
-				VkStencilOp    depthFailOp;
-				VkCompareOp    compareOp;
-				uint32_t       compareMask;
-				uint32_t       writeMask;
-
-				void operator=(const VkStencilOpState &rhs)
-				{
-					failOp = rhs.failOp;

-					passOp = rhs.passOp;

-					depthFailOp = rhs.depthFailOp;

-					compareOp = rhs.compareOp;

-					compareMask = rhs.compareMask;

-					writeMask = rhs.writeMask;
-				}
-			};
-
-			States() : Memset(this, 0) {}
-
-			uint32_t computeHash();
-
-			uint64_t shaderID;
-
-			unsigned int numClipDistances;
-			unsigned int numCullDistances;
-
-			VkCompareOp depthCompareMode;
-			bool depthWriteEnable;
-
-			bool stencilActive;
-			StencilOpState frontStencil;
-			StencilOpState backStencil;
-
-			bool depthTestActive;
-			bool occlusionEnabled;
-			bool perspective;
-			bool depthClamp;
-
-			BlendState blendState[RENDERTARGETS];
-
-			unsigned int colorWriteMask;
-			VkFormat targetFormat[RENDERTARGETS];
-			unsigned int multiSample;
-			unsigned int multiSampleMask;
-			bool multiSampledBresenham;
-			bool alphaToCoverage;
-			bool centroid;
-			VkFrontFace frontFace;
-			VkFormat depthFormat;
-		};
-
-		struct State : States
-		{
-			bool operator==(const State &state) const;
-
-			int colorWriteActive(int index) const
-			{
-				return (colorWriteMask >> (index * 4)) & 0xF;
-			}
-
-			uint32_t hash;
-		};
-
-		struct Stencil
-		{
-			int64_t testMaskQ;
-			int64_t referenceMaskedQ;
-			int64_t referenceMaskedSignedQ;
-			int64_t writeMaskQ;
-			int64_t invWriteMaskQ;
-			int64_t referenceQ;
-
-			void set(int reference, int testMask, int writeMask)
-			{
-				referenceQ = replicate(reference);
-				testMaskQ = replicate(testMask);
-				writeMaskQ = replicate(writeMask);
-				invWriteMaskQ = ~writeMaskQ;
-				referenceMaskedQ = referenceQ & testMaskQ;
-				referenceMaskedSignedQ = replicate(((reference & testMask) + 0x80) & 0xFF);
-			}
-
-			static int64_t replicate(int b)
-			{
-				int64_t w = b & 0xFF;
-
-				return (w << 0) | (w << 8) | (w << 16) | (w << 24) | (w << 32) | (w << 40) | (w << 48) | (w << 56);
+				failOp = rhs.failOp;

+				passOp = rhs.passOp;

+				depthFailOp = rhs.depthFailOp;

+				compareOp = rhs.compareOp;

+				compareMask = rhs.compareMask;

+				writeMask = rhs.writeMask;
 			}
 		};
 
-		struct Factor
-		{
-			word4 alphaReference4;
+		States() : Memset(this, 0) {}
 
-			word4 blendConstant4W[4];
-			float4 blendConstant4F[4];
-			word4 invBlendConstant4W[4];
-			float4 invBlendConstant4F[4];
-		};
+		uint32_t computeHash();
 
-	public:
-		using RoutineType = RasterizerFunction::RoutineType;
+		uint64_t shaderID;
 
-		PixelProcessor();
+		unsigned int numClipDistances;
+		unsigned int numCullDistances;
 
-		virtual ~PixelProcessor();
+		VkCompareOp depthCompareMode;
+		bool depthWriteEnable;
 
-		void setBlendConstant(const Color<float> &blendConstant);
+		bool stencilActive;
+		StencilOpState frontStencil;
+		StencilOpState backStencil;
 
-	protected:
-		const State update(const Context* context) const;
-		RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
-		                                 SpirvShader const *pixelShader, const vk::DescriptorSet::Bindings &descriptorSets);
-		void setRoutineCacheSize(int routineCacheSize);
+		bool depthTestActive;
+		bool occlusionEnabled;
+		bool perspective;
+		bool depthClamp;
 
-		// Other semi-constants
-		Factor factor;
+		BlendState blendState[RENDERTARGETS];
 
-	private:
-		using RoutineCacheType = RoutineCacheT<State, RasterizerFunction::CFunctionType>;
-		RoutineCacheType *routineCache;
+		unsigned int colorWriteMask;
+		VkFormat targetFormat[RENDERTARGETS];
+		unsigned int multiSample;
+		unsigned int multiSampleMask;
+		bool multiSampledBresenham;
+		bool alphaToCoverage;
+		bool centroid;
+		VkFrontFace frontFace;
+		VkFormat depthFormat;
 	};
-}
+
+	struct State : States
+	{
+		bool operator==(const State &state) const;
+
+		int colorWriteActive(int index) const
+		{
+			return (colorWriteMask >> (index * 4)) & 0xF;
+		}
+
+		uint32_t hash;
+	};
+
+	struct Stencil
+	{
+		int64_t testMaskQ;
+		int64_t referenceMaskedQ;
+		int64_t referenceMaskedSignedQ;
+		int64_t writeMaskQ;
+		int64_t invWriteMaskQ;
+		int64_t referenceQ;
+
+		void set(int reference, int testMask, int writeMask)
+		{
+			referenceQ = replicate(reference);
+			testMaskQ = replicate(testMask);
+			writeMaskQ = replicate(writeMask);
+			invWriteMaskQ = ~writeMaskQ;
+			referenceMaskedQ = referenceQ & testMaskQ;
+			referenceMaskedSignedQ = replicate(((reference & testMask) + 0x80) & 0xFF);
+		}
+
+		static int64_t replicate(int b)
+		{
+			int64_t w = b & 0xFF;
+
+			return (w << 0) | (w << 8) | (w << 16) | (w << 24) | (w << 32) | (w << 40) | (w << 48) | (w << 56);
+		}
+	};
+
+	struct Factor
+	{
+		word4 alphaReference4;
+
+		word4 blendConstant4W[4];
+		float4 blendConstant4F[4];
+		word4 invBlendConstant4W[4];
+		float4 invBlendConstant4F[4];
+	};
+
+public:
+	using RoutineType = RasterizerFunction::RoutineType;
+
+	PixelProcessor();
+
+	virtual ~PixelProcessor();
+
+	void setBlendConstant(const Color<float> &blendConstant);
+
+protected:
+	const State update(const Context* context) const;
+	RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
+	                    SpirvShader const *pixelShader, const vk::DescriptorSet::Bindings &descriptorSets);
+	void setRoutineCacheSize(int routineCacheSize);
+
+	// Other semi-constants
+	Factor factor;
+
+private:
+	using RoutineCacheType = RoutineCacheT<State, RasterizerFunction::CFunctionType>;
+	RoutineCacheType *routineCache;
+};
+
+}  // namespace sw
 
 #endif   // sw_PixelProcessor_hpp

diff --git a/src/Device/Plane.cpp b/src/Device/Plane.cpp
index 095b7f2..8a89546 100644
--- a/src/Device/Plane.cpp
+++ b/src/Device/Plane.cpp

@@ -16,45 +16,46 @@
 
 #include "Matrix.hpp"
 
-namespace sw
+namespace sw {
+
+Plane::Plane()
 {
-	Plane::Plane()
-	{
-	}
-
-	Plane::Plane(float p_A, float p_B, float p_C, float p_D)
-	{
-		A = p_A;
-		B = p_B;
-		C = p_C;
-		D = p_D;
-	}
-
-	Plane::Plane(const float ABCD[4])
-	{
-		A = ABCD[0];
-		B = ABCD[1];
-		C = ABCD[2];
-		D = ABCD[3];
-	}
-
-	Plane operator*(const Plane &p, const Matrix &T)
-	{
-		Matrix M = !T;
-
-		return Plane(p.A * M(1, 1) + p.B * M(1, 2) + p.C * M(1, 3) + p.D * M(1, 4),
-		             p.A * M(2, 1) + p.B * M(2, 2) + p.C * M(2, 3) + p.D * M(2, 4),
-		             p.A * M(3, 1) + p.B * M(3, 2) + p.C * M(3, 3) + p.D * M(3, 4),
-		             p.A * M(4, 1) + p.B * M(4, 2) + p.C * M(4, 3) + p.D * M(4, 4));
-	}
-
-	Plane operator*(const Matrix &T, const Plane &p)
-	{
-		Matrix M = !T;
-
-		return Plane(M(1, 1) * p.A + M(2, 1) * p.B + M(3, 1) * p.C + M(4, 1) * p.D,
-		             M(1, 2) * p.A + M(2, 2) * p.B + M(3, 2) * p.C + M(4, 2) * p.D,
-		             M(1, 3) * p.A + M(2, 3) * p.B + M(3, 3) * p.C + M(4, 3) * p.D,
-		             M(1, 4) * p.A + M(2, 4) * p.B + M(3, 4) * p.C + M(4, 4) * p.D);
-	}
 }
+
+Plane::Plane(float p_A, float p_B, float p_C, float p_D)
+{
+	A = p_A;
+	B = p_B;
+	C = p_C;
+	D = p_D;
+}
+
+Plane::Plane(const float ABCD[4])
+{
+	A = ABCD[0];
+	B = ABCD[1];
+	C = ABCD[2];
+	D = ABCD[3];
+}
+
+Plane operator*(const Plane &p, const Matrix &T)
+{
+	Matrix M = !T;
+
+	return Plane(p.A * M(1, 1) + p.B * M(1, 2) + p.C * M(1, 3) + p.D * M(1, 4),
+	             p.A * M(2, 1) + p.B * M(2, 2) + p.C * M(2, 3) + p.D * M(2, 4),
+	             p.A * M(3, 1) + p.B * M(3, 2) + p.C * M(3, 3) + p.D * M(3, 4),
+	             p.A * M(4, 1) + p.B * M(4, 2) + p.C * M(4, 3) + p.D * M(4, 4));
+}
+
+Plane operator*(const Matrix &T, const Plane &p)
+{
+	Matrix M = !T;
+
+	return Plane(M(1, 1) * p.A + M(2, 1) * p.B + M(3, 1) * p.C + M(4, 1) * p.D,
+	             M(1, 2) * p.A + M(2, 2) * p.B + M(3, 2) * p.C + M(4, 2) * p.D,
+	             M(1, 3) * p.A + M(2, 3) * p.B + M(3, 3) * p.C + M(4, 3) * p.D,
+	             M(1, 4) * p.A + M(2, 4) * p.B + M(3, 4) * p.C + M(4, 4) * p.D);
+}
+
+}  // namespace sw

diff --git a/src/Device/Plane.hpp b/src/Device/Plane.hpp
index 962b9ae..dcce294 100644
--- a/src/Device/Plane.hpp
+++ b/src/Device/Plane.hpp

@@ -17,24 +17,25 @@
 
 #include "Vector.hpp"
 
-namespace sw
+namespace sw {
+
+struct Matrix;
+
+struct Plane
 {
-	struct Matrix;
+	float A;
+	float B;
+	float C;
+	float D;
 
-	struct Plane
-	{
-		float A;
-		float B;
-		float C;
-		float D;
+	Plane();
+	Plane(float A, float B, float C, float D);   // Plane equation 
+	Plane(const float ABCD[4]);
 
-		Plane();
-		Plane(float A, float B, float C, float D);   // Plane equation 
-		Plane(const float ABCD[4]);
+	friend Plane operator*(const Plane &p, const Matrix &A);   // Transform plane by matrix (post-multiply)
+	friend Plane operator*(const Matrix &A, const Plane &p);   // Transform plane by matrix (pre-multiply)
+};
 
-		friend Plane operator*(const Plane &p, const Matrix &A);   // Transform plane by matrix (post-multiply)
-		friend Plane operator*(const Matrix &A, const Plane &p);   // Transform plane by matrix (pre-multiply)
-	};
-}
+}  // namespace sw
 
 #endif   // Plane_hpp

diff --git a/src/Device/Point.cpp b/src/Device/Point.cpp
index e7e33dd..a93616d 100644
--- a/src/Device/Point.cpp
+++ b/src/Device/Point.cpp

@@ -16,77 +16,78 @@
 
 #include "Matrix.hpp"
 
-namespace sw
+namespace sw {
+
+Point &Point::operator+=(const Vector &v)
 {
-	Point &Point::operator+=(const Vector &v)
-	{
-		x += v.x;
-		y += v.y;
-		z += v.z;
+	x += v.x;
+	y += v.y;
+	z += v.z;
 
-		return *this;
-	}
-
-	Point &Point::operator-=(const Vector &v)
-	{
-		x -= v.x;
-		y -= v.y;
-		z -= v.z;
-
-		return *this;
-	}
-
-	Point operator+(const Point &P, const Vector &v)
-	{
-		return Point(P.x + v.x, P.y + v.y, P.z + v.z);
-	}
-
-	Point operator-(const Point &P, const Vector &v)
-	{
-		return Point(P.x - v.x, P.y - v.y, P.z - v.z);
-	}
-
-	Vector operator-(const Point &P, const Point &Q)
-	{
-		return Vector(P.x - Q.x, P.y - Q.y, P.z - Q.z);
-	}
-
-	Point operator*(const Matrix &M, const Point &P)
-	{
-		return Point(M(1, 1) * P.x + M(1, 2) * P.y + M(1, 3) * P.z + M(1, 4),
-		             M(2, 1) * P.x + M(2, 2) * P.y + M(2, 3) * P.z + M(2, 4),
-		             M(3, 1) * P.x + M(3, 2) * P.y + M(3, 3) * P.z + M(3, 4));
-	}
-
-	Point operator*(const Point &P, const Matrix &M)
-	{
-		return Point(P.x * M(1, 1) + P.y * M(2, 1) + P.z * M(3, 1),
-		             P.x * M(1, 2) + P.y * M(2, 2) + P.z * M(3, 2),
-		             P.x * M(1, 3) + P.y * M(2, 3) + P.z * M(3, 3));
-	}
-
-	Point &operator*=(Point &P, const Matrix &M)
-	{
-		return P = P * M;
-	}
-
-	float Point::d(const Point &P) const
-	{
-		return Vector::N(*this - P);
-	}
-
-	float Point::d2(const Point &P) const
-	{
-		return Vector::N2(*this - P);
-	}
-
-	float Point::d(const Point &P, const Point &Q)
-	{
-		return Vector::N(P - Q);
-	}
-
-	float Point::d2(const Point &P, const Point &Q)
-	{
-		return Vector::N2(P - Q);
-	}
+	return *this;
 }
+
+Point &Point::operator-=(const Vector &v)
+{
+	x -= v.x;
+	y -= v.y;
+	z -= v.z;
+
+	return *this;
+}
+
+Point operator+(const Point &P, const Vector &v)
+{
+	return Point(P.x + v.x, P.y + v.y, P.z + v.z);
+}
+
+Point operator-(const Point &P, const Vector &v)
+{
+	return Point(P.x - v.x, P.y - v.y, P.z - v.z);
+}
+
+Vector operator-(const Point &P, const Point &Q)
+{
+	return Vector(P.x - Q.x, P.y - Q.y, P.z - Q.z);
+}
+
+Point operator*(const Matrix &M, const Point &P)
+{
+	return Point(M(1, 1) * P.x + M(1, 2) * P.y + M(1, 3) * P.z + M(1, 4),
+	             M(2, 1) * P.x + M(2, 2) * P.y + M(2, 3) * P.z + M(2, 4),
+	             M(3, 1) * P.x + M(3, 2) * P.y + M(3, 3) * P.z + M(3, 4));
+}
+
+Point operator*(const Point &P, const Matrix &M)
+{
+	return Point(P.x * M(1, 1) + P.y * M(2, 1) + P.z * M(3, 1),
+	             P.x * M(1, 2) + P.y * M(2, 2) + P.z * M(3, 2),
+	             P.x * M(1, 3) + P.y * M(2, 3) + P.z * M(3, 3));
+}
+
+Point &operator*=(Point &P, const Matrix &M)
+{
+	return P = P * M;
+}
+
+float Point::d(const Point &P) const
+{
+	return Vector::N(*this - P);
+}
+
+float Point::d2(const Point &P) const
+{
+	return Vector::N2(*this - P);
+}
+
+float Point::d(const Point &P, const Point &Q)
+{
+	return Vector::N(P - Q);
+}
+
+float Point::d2(const Point &P, const Point &Q)
+{
+	return Vector::N2(P - Q);
+}
+
+}  // namespace sw

diff --git a/src/Device/Point.hpp b/src/Device/Point.hpp
index 85198c5..5602209 100644
--- a/src/Device/Point.hpp
+++ b/src/Device/Point.hpp

@@ -15,125 +15,127 @@
 #ifndef Point_hpp
 #define Point_hpp
 
-namespace sw
+namespace sw {
+
+struct Vector;
+struct Matrix;
+
+struct Point
 {
-	struct Vector;
-	struct Matrix;
+	Point();
+	Point(const int i);
+	Point(const Point &P);
+	Point(const Vector &v);
+	Point(float Px, float Py, float Pz);
 
-	struct Point
+	Point &operator=(const Point &P);
+
+	union
 	{
-		Point();
-		Point(const int i);
-		Point(const Point &P);
-		Point(const Vector &v);
-		Point(float Px, float Py, float Pz);
+		float p[3];
 
-		Point &operator=(const Point &P);
-
-		union
-		{
-			float p[3];
-
-			struct
-			{	
-				float x;
-				float y;
-				float z;
-			};
+		struct
+		{	
+			float x;
+			float y;
+			float z;
 		};
-
-		float &operator[](int i);
-		float &operator()(int i);
-
-		const float &operator[](int i) const;
-		const float &operator()(int i) const;
-
-		Point &operator+=(const Vector &v);
-		Point &operator-=(const Vector &v);
-
-		friend Point operator+(const Point &P, const Vector &v);
-		friend Point operator-(const Point &P, const Vector &v);
-
-		friend Vector operator-(const Point &P, const Point &Q);
-
-		friend Point operator*(const Matrix &M, const Point& P);
-		friend Point operator*(const Point &P, const Matrix &M);
-		friend Point &operator*=(Point &P, const Matrix &M);
-
-		float d(const Point &P) const;   // Distance between two points
-		float d2(const Point &P) const;   // Squared distance between two points
-
-		static float d(const Point &P, const Point &Q);   // Distance between two points
-		static float d2(const Point &P, const Point &Q);   // Squared distance between two points
 	};
-}
+
+	float &operator[](int i);
+	float &operator()(int i);
+
+	const float &operator[](int i) const;
+	const float &operator()(int i) const;
+
+	Point &operator+=(const Vector &v);
+	Point &operator-=(const Vector &v);
+
+	friend Point operator+(const Point &P, const Vector &v);
+	friend Point operator-(const Point &P, const Vector &v);
+
+	friend Vector operator-(const Point &P, const Point &Q);
+
+	friend Point operator*(const Matrix &M, const Point& P);
+	friend Point operator*(const Point &P, const Matrix &M);
+	friend Point &operator*=(Point &P, const Matrix &M);
+
+	float d(const Point &P) const;   // Distance between two points
+	float d2(const Point &P) const;   // Squared distance between two points
+
+	static float d(const Point &P, const Point &Q);   // Distance between two points
+	static float d2(const Point &P, const Point &Q);   // Squared distance between two points
+};
+
+}  // namespace sw
 
 #include "Vector.hpp"
 
-namespace sw
+namespace sw {
+
+inline Point::Point()
 {
-	inline Point::Point()
-	{
-	}
-
-	inline Point::Point(const int i)
-	{
-		const float s = (float)i;
-
-		x = s;
-		y = s;
-		z = s;
-	}
-
-	inline Point::Point(const Point &P)
-	{
-		x = P.x;
-		y = P.y;
-		z = P.z;
-	}
-
-	inline Point::Point(const Vector &v)
-	{
-		x = v.x;
-		y = v.y;
-		z = v.z;
-	}
-
-	inline Point::Point(float P_x, float P_y, float P_z)
-	{
-		x = P_x;
-		y = P_y;
-		z = P_z;
-	}
-
-	inline Point &Point::operator=(const Point &P)
-	{
-		x = P.x;
-		y = P.y;
-		z = P.z;
-
-		return *this;
-	}
-
-	inline float &Point::operator()(int i)
-	{
-		return p[i];
-	}
-
-	inline float &Point::operator[](int i)
-	{
-		return p[i];
-	}
-
-	inline const float &Point::operator()(int i) const
-	{
-		return p[i];
-	}
-
-	inline const float &Point::operator[](int i) const
-	{
-		return p[i];
-	}
 }
 
+inline Point::Point(const int i)
+{
+	const float s = (float)i;
+
+	x = s;
+	y = s;
+	z = s;
+}
+
+inline Point::Point(const Point &P)
+{
+	x = P.x;
+	y = P.y;
+	z = P.z;
+}
+
+inline Point::Point(const Vector &v)
+{
+	x = v.x;
+	y = v.y;
+	z = v.z;
+}
+
+inline Point::Point(float P_x, float P_y, float P_z)
+{
+	x = P_x;
+	y = P_y;
+	z = P_z;
+}
+
+inline Point &Point::operator=(const Point &P)
+{
+	x = P.x;
+	y = P.y;
+	z = P.z;
+
+	return *this;
+}
+
+inline float &Point::operator()(int i)
+{
+	return p[i];
+}
+
+inline float &Point::operator[](int i)
+{
+	return p[i];
+}
+
+inline const float &Point::operator()(int i) const
+{
+	return p[i];
+}
+
+inline const float &Point::operator[](int i) const
+{
+	return p[i];
+}
+
+}  // namespace sw
+
 #endif   // Point_hpp

diff --git a/src/Device/Polygon.hpp b/src/Device/Polygon.hpp
index 8ee8562..5412128 100644
--- a/src/Device/Polygon.hpp
+++ b/src/Device/Polygon.hpp

@@ -17,40 +17,41 @@
 
 #include "Vertex.hpp"
 
-namespace sw
+namespace sw {
+
+struct Polygon
 {
-	struct Polygon
+	Polygon(const float4 *P0, const float4 *P1, const float4 *P2)
 	{
-		Polygon(const float4 *P0, const float4 *P1, const float4 *P2)
-		{
-			P[0][0] = P0;
-			P[0][1] = P1;
-			P[0][2] = P2;
+		P[0][0] = P0;
+		P[0][1] = P1;
+		P[0][2] = P2;
 
-			n = 3;
-			i = 0;
-			b = 0;
+		n = 3;
+		i = 0;
+		b = 0;
+	}
+
+	Polygon(const float4 *P, int n)
+	{
+		for(int i = 0; i < n; i++)
+		{
+			this->P[0][i] = &P[i];
 		}
 
-		Polygon(const float4 *P, int n)
-		{
-			for(int i = 0; i < n; i++)
-			{
-				this->P[0][i] = &P[i];
-			}
+		this->n = n;
+		this->i = 0;
+		this->b = 0;
+	}
 
-			this->n = n;
-			this->i = 0;
-			this->b = 0;
-		}
+	float4 B[16];              // Buffer for clipped vertices
+	const float4 *P[16][16];   // Pointers to clipped polygon's vertices
 
-		float4 B[16];              // Buffer for clipped vertices
-		const float4 *P[16][16];   // Pointers to clipped polygon's vertices
+	int n;   // Number of vertices
+	int i;   // Level of P to use
+	int b;   // Next available new vertex
+};
 
-		int n;   // Number of vertices
-		int i;   // Level of P to use
-		int b;   // Next available new vertex
-	};
-}
+}  // namespace sw
 
 #endif   // sw_Polygon_hpp

diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
index 45b2e42..85a9db4 100644
--- a/src/Device/Primitive.hpp
+++ b/src/Device/Primitive.hpp

@@ -20,70 +20,71 @@
 #include "Device/Config.hpp"
 #include "System/Build.hpp"
 
-namespace sw
+namespace sw {
+
+struct Triangle MEMORY_SANITIZER_ONLY(: Memset<Triangle>)
 {
-	struct Triangle MEMORY_SANITIZER_ONLY(: Memset<Triangle>)
-	{
 #if MEMORY_SANITIZER_ENABLED
-		// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
-		// false-positives when read. By clearing the struct in the constructor,
-		// we can avoid triggering these false-positives.
-		inline Triangle() : Memset<Triangle>(this, 0) {}
+	// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
+	// false-positives when read. By clearing the struct in the constructor,
+	// we can avoid triggering these false-positives.
+	inline Triangle() : Memset<Triangle>(this, 0) {}
 #endif // MEMORY_SANITIZER_ENABLED
 
-		Vertex v0;
-		Vertex v1;
-		Vertex v2;
-	};
+	Vertex v0;
+	Vertex v1;
+	Vertex v2;
+};
 
-	struct PlaneEquation   // z = A * x + B * y + C
-	{
-		float4 A;
-		float4 B;
-		float4 C;
-	};
+struct PlaneEquation   // z = A * x + B * y + C
+{
+	float4 A;
+	float4 B;
+	float4 C;
+};
 
-	struct Primitive MEMORY_SANITIZER_ONLY(: Memset<Primitive>)
-	{
+struct Primitive MEMORY_SANITIZER_ONLY(: Memset<Primitive>)
+{
 #if MEMORY_SANITIZER_ENABLED
-		// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
-		// false-positives when read. By clearing the struct in the constructor,
-		// we can avoid triggering these false-positives.
-		inline Primitive() : Memset<Primitive>(this, 0) {}
+	// Memory sanitizer cannot 'see' writes from JIT'd code, and can raise
+	// false-positives when read. By clearing the struct in the constructor,
+	// we can avoid triggering these false-positives.
+	inline Primitive() : Memset<Primitive>(this, 0) {}
 #endif // MEMORY_SANITIZER_ENABLED
 
-		int yMin;
-		int yMax;
+	int yMin;
+	int yMax;
 
-		float4 xQuad;
-		float4 yQuad;
+	float4 xQuad;
+	float4 yQuad;
 
-		float pointCoordX;
-		float pointCoordY;
+	float pointCoordX;
+	float pointCoordY;
 
-		PlaneEquation z;
-		PlaneEquation w;
-		PlaneEquation V[MAX_INTERFACE_COMPONENTS];
+	PlaneEquation z;
+	PlaneEquation w;
+	PlaneEquation V[MAX_INTERFACE_COMPONENTS];
 
-		PlaneEquation clipDistance[MAX_CLIP_DISTANCES];
-		PlaneEquation cullDistance[MAX_CULL_DISTANCES];
+	PlaneEquation clipDistance[MAX_CLIP_DISTANCES];
+	PlaneEquation cullDistance[MAX_CULL_DISTANCES];
 
-		// Masks for two-sided stencil
-		int64_t clockwiseMask;
-		int64_t invClockwiseMask;
+	// Masks for two-sided stencil
+	int64_t clockwiseMask;
+	int64_t invClockwiseMask;
 
-		struct Span
-		{
-			unsigned short left;
-			unsigned short right;
-		};
-
-		// The rasterizer adds a zero length span to the top and bottom of the polygon to allow
-		// for 2x2 pixel processing. We need an even number of spans to keep accesses aligned.
-		Span outlineUnderflow[2];
-		Span outline[OUTLINE_RESOLUTION];
-		Span outlineOverflow[2];
+	struct Span
+	{
+		unsigned short left;
+		unsigned short right;
 	};
-}
+
+	// The rasterizer adds a zero length span to the top and bottom of the polygon to allow
+	// for 2x2 pixel processing. We need an even number of spans to keep accesses aligned.
+	Span outlineUnderflow[2];
+	Span outline[OUTLINE_RESOLUTION];
+	Span outlineOverflow[2];
+};
+
+}  // namespace sw
 
 #endif   // sw_Primitive_hpp

diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index 0f23599..10b9e73 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp

@@ -20,246 +20,247 @@
 #include "System/Math.hpp"
 #include "Vulkan/VkDebug.hpp"
 
-namespace sw
+namespace sw {
+
+QuadRasterizer::QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader) : state(state), spirvShader{spirvShader}
 {
-	QuadRasterizer::QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader) : state(state), spirvShader{spirvShader}
-	{
-	}
+}
 
-	QuadRasterizer::~QuadRasterizer()
-	{
-	}
+QuadRasterizer::~QuadRasterizer()
+{
+}
 
-	void QuadRasterizer::generate()
-	{
-		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
-		occlusion = 0;
+void QuadRasterizer::generate()
+{
+	constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+	occlusion = 0;
 
-		Do
+	Do
+	{
+		Int yMin = *Pointer<Int>(primitive + OFFSET(Primitive,yMin));
+		Int yMax = *Pointer<Int>(primitive + OFFSET(Primitive,yMax));
+
+		Int cluster2 = cluster + cluster;
+		yMin += clusterCount * 2 - 2 - cluster2;
+		yMin &= -clusterCount * 2;
+		yMin += cluster2;
+
+		If(yMin < yMax)
 		{
-			Int yMin = *Pointer<Int>(primitive + OFFSET(Primitive,yMin));
-			Int yMax = *Pointer<Int>(primitive + OFFSET(Primitive,yMax));
+			rasterize(yMin, yMax);
+		}
 
-			Int cluster2 = cluster + cluster;
-			yMin += clusterCount * 2 - 2 - cluster2;
-			yMin &= -clusterCount * 2;
-			yMin += cluster2;
+		primitive += sizeof(Primitive) * state.multiSample;
+		count--;
+	}
+	Until(count == 0)
 
-			If(yMin < yMax)
+	if(state.occlusionEnabled)
+	{
+		UInt clusterOcclusion = *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster);
+		clusterOcclusion += occlusion;
+		*Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster) = clusterOcclusion;
+	}
+
+	Return();
+}
+
+void QuadRasterizer::rasterize(Int &yMin, Int &yMax)
+{
+	Pointer<Byte> cBuffer[RENDERTARGETS];
+	Pointer<Byte> zBuffer;
+	Pointer<Byte> sBuffer;
+
+	Int clusterCountLog2 = 31 - Ctlz(UInt(clusterCount), false);
+
+	for(int index = 0; index < RENDERTARGETS; index++)
+	{
+		if(state.colorWriteActive(index))
+		{
+			cBuffer[index] = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,colorBuffer[index])) + yMin * *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+		}
+	}
+
+	if(state.depthTestActive)
+	{
+		zBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,depthBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+	}
+
+	if(state.stencilActive)
+	{
+		sBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,stencilBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB));
+	}
+
+	Int y = yMin;
+
+	Do
+	{
+		Int x0a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
+		Int x0b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
+		Int x0 = Min(x0a, x0b);
+
+		for(unsigned int q = 1; q < state.multiSample; q++)
+		{
+			x0a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
+			x0b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
+			x0 = Min(x0, Min(x0a, x0b));
+		}
+
+		x0 &= 0xFFFFFFFE;
+
+		Int x1a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+		Int x1b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+		Int x1 = Max(x1a, x1b);
+
+		for(unsigned int q = 1; q < state.multiSample; q++)
+		{
+			x1a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+			x1b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+			x1 = Max(x1, Max(x1a, x1b));
+		}
+
+		Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
+
+		if(interpolateZ())
+		{
+			for(unsigned int q = 0; q < state.multiSample; q++)
 			{
-				rasterize(yMin, yMax);
+				Float4 y = yyyy;
+
+				if(state.multiSample > 1)
+				{
+					y -= *Pointer<Float4>(constants + OFFSET(Constants,Y) + q * sizeof(float4));
+				}
+
+				Dz[q] = *Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) + y * *Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16);
+			}
+		}
+
+		If(x0 < x1)
+		{
+			if(interpolateW())
+			{
+				Dw = *Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) + yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16);
 			}
 
-			primitive += sizeof(Primitive) * state.multiSample;
-			count--;
+			if (spirvShader)
+			{
+				for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
+				{
+					if (spirvShader->inputs[interpolant].Type == SpirvShader::ATTRIBTYPE_UNUSED)
+						continue;
+
+					Dv[interpolant] = *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].C), 16);
+					if (!spirvShader->inputs[interpolant].Flat)
+					{
+						Dv[interpolant] +=
+								yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].B), 16);
+					}
+				}
+
+				for (unsigned int i = 0; i < state.numClipDistances; i++)
+				{
+					DclipDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].C), 16) +
+								yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].B), 16);
+				}
+
+				for (unsigned int i = 0; i < state.numCullDistances; i++)
+				{
+					DcullDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].C), 16) +
+								yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].B), 16);
+				}
+			}
+
+			Short4 xLeft[4];
+			Short4 xRight[4];
+
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				xLeft[q] = *Pointer<Short4>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline) + y * sizeof(Primitive::Span));
+				xRight[q] = xLeft[q];
+
+				xLeft[q] = Swizzle(xLeft[q], 0x0022) - Short4(1, 2, 1, 2);
+				xRight[q] = Swizzle(xRight[q], 0x1133) - Short4(0, 1, 0, 1);
+			}
+
+			For(Int x = x0, x < x1, x += 2)
+			{
+				Short4 xxxx = Short4(x);
+				Int cMask[4];
+
+				for(unsigned int q = 0; q < state.multiSample; q++)
+				{
+					if (state.multiSampleMask & (1<<q))
+					{
+						unsigned int i = state.multiSampledBresenham ? 0 : q;
+						Short4 mask = CmpGT(xxxx, xLeft[i]) & CmpGT(xRight[i], xxxx);
+						cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;
+					}
+					else
+					{
+						cMask[q] = 0;
+					}
+				}
+
+				quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
+			}
 		}
-		Until(count == 0)
-
-		if(state.occlusionEnabled)
-		{
-			UInt clusterOcclusion = *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster);
-			clusterOcclusion += occlusion;
-			*Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster) = clusterOcclusion;
-		}
-
-		Return();
-	}
-
-	void QuadRasterizer::rasterize(Int &yMin, Int &yMax)
-	{
-		Pointer<Byte> cBuffer[RENDERTARGETS];
-		Pointer<Byte> zBuffer;
-		Pointer<Byte> sBuffer;
-
-		Int clusterCountLog2 = 31 - Ctlz(UInt(clusterCount), false);
 
 		for(int index = 0; index < RENDERTARGETS; index++)
 		{
 			if(state.colorWriteActive(index))
 			{
-				cBuffer[index] = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,colorBuffer[index])) + yMin * *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + clusterCountLog2);   // FIXME: Precompute
 			}
 		}
 
 		if(state.depthTestActive)
 		{
-			zBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,depthBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+			zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
 		}
 
 		if(state.stencilActive)
 		{
-			sBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,stencilBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB));
+			sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
 		}
 
-		Int y = yMin;
-
-		Do
-		{
-			Int x0a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
-			Int x0b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
-			Int x0 = Min(x0a, x0b);
-
-			for(unsigned int q = 1; q < state.multiSample; q++)
-			{
-				x0a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
-				x0b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
-				x0 = Min(x0, Min(x0a, x0b));
-			}
-
-			x0 &= 0xFFFFFFFE;
-
-			Int x1a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
-			Int x1b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
-			Int x1 = Max(x1a, x1b);
-
-			for(unsigned int q = 1; q < state.multiSample; q++)
-			{
-				x1a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
-				x1b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
-				x1 = Max(x1, Max(x1a, x1b));
-			}
-
-			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
-
-			if(interpolateZ())
-			{
-				for(unsigned int q = 0; q < state.multiSample; q++)
-				{
-					Float4 y = yyyy;
-
-					if(state.multiSample > 1)
-					{
-						y -= *Pointer<Float4>(constants + OFFSET(Constants,Y) + q * sizeof(float4));
-					}
-
-					Dz[q] = *Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) + y * *Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16);
-				}
-			}
-
-			If(x0 < x1)
-			{
-				if(interpolateW())
-				{
-					Dw = *Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) + yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16);
-				}
-
-				if (spirvShader)
-				{
-					for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
-					{
-						if (spirvShader->inputs[interpolant].Type == SpirvShader::ATTRIBTYPE_UNUSED)
-							continue;
-
-						Dv[interpolant] = *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].C), 16);
-						if (!spirvShader->inputs[interpolant].Flat)
-						{
-							Dv[interpolant] +=
-									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].B), 16);
-						}
-					}
-
-					for (unsigned int i = 0; i < state.numClipDistances; i++)
-					{
-						DclipDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].C), 16) +
-									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].B), 16);
-					}
-
-					for (unsigned int i = 0; i < state.numCullDistances; i++)
-					{
-						DcullDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].C), 16) +
-									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].B), 16);
-					}
-				}
-
-				Short4 xLeft[4];
-				Short4 xRight[4];
-
-				for(unsigned int q = 0; q < state.multiSample; q++)
-				{
-					xLeft[q] = *Pointer<Short4>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline) + y * sizeof(Primitive::Span));
-					xRight[q] = xLeft[q];
-
-					xLeft[q] = Swizzle(xLeft[q], 0x0022) - Short4(1, 2, 1, 2);
-					xRight[q] = Swizzle(xRight[q], 0x1133) - Short4(0, 1, 0, 1);
-				}
-
-				For(Int x = x0, x < x1, x += 2)
-				{
-					Short4 xxxx = Short4(x);
-					Int cMask[4];
-
-					for(unsigned int q = 0; q < state.multiSample; q++)
-					{
-						if (state.multiSampleMask & (1<<q))
-						{
-							unsigned int i = state.multiSampledBresenham ? 0 : q;
-							Short4 mask = CmpGT(xxxx, xLeft[i]) & CmpGT(xRight[i], xxxx);
-							cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;
-						}
-						else
-						{
-							cMask[q] = 0;
-						}
-					}
-
-					quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
-				}
-			}
-
-			for(int index = 0; index < RENDERTARGETS; index++)
-			{
-				if(state.colorWriteActive(index))
-				{
-					cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + clusterCountLog2);   // FIXME: Precompute
-				}
-			}
-
-			if(state.depthTestActive)
-			{
-				zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
-			}
-
-			if(state.stencilActive)
-			{
-				sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + clusterCountLog2);   // FIXME: Precompute
-			}
-
-			y += 2 * clusterCount;
-		}
-		Until(y >= yMax)
+		y += 2 * clusterCount;
 	}
-
-	Float4 QuadRasterizer::interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp)
-	{
-		Float4 interpolant = D;
-
-		if(!flat)
-		{
-			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16);
-
-			if(perspective)
-			{
-				interpolant *= rhw;
-			}
-		}
-
-		if(clamp)
-		{
-			interpolant = Min(Max(interpolant, Float4(0.0f)), Float4(1.0f));
-		}
-
-		return interpolant;
-	}
-
-	bool QuadRasterizer::interpolateZ() const
-	{
-		return state.depthTestActive || (spirvShader && spirvShader->hasBuiltinInput(spv::BuiltInFragCoord));
-	}
-
-	bool QuadRasterizer::interpolateW() const
-	{
-		// Note: could optimize cases where there is a fragment shader but it has no
-		// perspective-correct inputs, but that's vanishingly rare.
-		return spirvShader != nullptr;
-	}
+	Until(y >= yMax)
 }
+
+Float4 QuadRasterizer::interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp)
+{
+	Float4 interpolant = D;
+
+	if(!flat)
+	{
+		interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16);
+
+		if(perspective)
+		{
+			interpolant *= rhw;
+		}
+	}
+
+	if(clamp)
+	{
+		interpolant = Min(Max(interpolant, Float4(0.0f)), Float4(1.0f));
+	}
+
+	return interpolant;
+}
+
+bool QuadRasterizer::interpolateZ() const
+{
+	return state.depthTestActive || (spirvShader && spirvShader->hasBuiltinInput(spv::BuiltInFragCoord));
+}
+
+bool QuadRasterizer::interpolateW() const
+{
+	// Note: could optimize cases where there is a fragment shader but it has no
+	// perspective-correct inputs, but that's vanishingly rare.
+	return spirvShader != nullptr;
+}
+
+}  // namespace sw

diff --git a/src/Device/QuadRasterizer.hpp b/src/Device/QuadRasterizer.hpp
index 6d349e7..0311d8a 100644
--- a/src/Device/QuadRasterizer.hpp
+++ b/src/Device/QuadRasterizer.hpp

@@ -20,40 +20,41 @@
 #include "Pipeline/SpirvShader.hpp"
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+class QuadRasterizer : public Rasterizer
 {
-	class QuadRasterizer : public Rasterizer
-	{
-	public:
-		QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader);
-		virtual ~QuadRasterizer();
+public:
+	QuadRasterizer(const PixelProcessor::State &state, SpirvShader const *spirvShader);
+	virtual ~QuadRasterizer();
 
-		void generate();
+	void generate();
 
-	protected:
-		Pointer<Byte> constants;
+protected:
+	Pointer<Byte> constants;
 
-		Float4 Dz[4];
-		Float4 Dw;
-		Float4 Dv[MAX_INTERFACE_COMPONENTS];
-		Float4 Df;
-		Float4 DclipDistance[MAX_CLIP_DISTANCES];
-		Float4 DcullDistance[MAX_CULL_DISTANCES];
+	Float4 Dz[4];
+	Float4 Dw;
+	Float4 Dv[MAX_INTERFACE_COMPONENTS];
+	Float4 Df;
+	Float4 DclipDistance[MAX_CLIP_DISTANCES];
+	Float4 DcullDistance[MAX_CULL_DISTANCES];
 
-		UInt occlusion;
+	UInt occlusion;
 
-		virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) = 0;
+	virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) = 0;
 
-		bool interpolateZ() const;
-		bool interpolateW() const;
-		Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp);
+	bool interpolateZ() const;
+	bool interpolateW() const;
+	Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp);
 
-		const PixelProcessor::State &state;
-		const SpirvShader *const spirvShader;
+	const PixelProcessor::State &state;
+	const SpirvShader *const spirvShader;
 
-	private:
-		void rasterize(Int &yMin, Int &yMax);
-	};
-}
+private:
+	void rasterize(Int &yMin, Int &yMax);
+};
+
+}  // namespace sw
 
 #endif   // sw_QuadRasterizer_hpp

diff --git a/src/Device/Rasterizer.hpp b/src/Device/Rasterizer.hpp
index 4e64e0e..cf229e6 100644
--- a/src/Device/Rasterizer.hpp
+++ b/src/Device/Rasterizer.hpp

@@ -19,21 +19,22 @@
 #include "PixelProcessor.hpp"
 #include "Device/Config.hpp"
 
-namespace sw
-{
-	class Rasterizer : public RasterizerFunction
-	{
-	public:
-		Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), clusterCount(Arg<3>()), data(Arg<4>()) {}
-		virtual ~Rasterizer() {}
+namespace sw {
 
-	protected:
-		Pointer<Byte> primitive;
-		Int count;
-		Int cluster;
-		Int clusterCount;
-		Pointer<Byte> data;
-	};
-}
+class Rasterizer : public RasterizerFunction
+{
+public:
+	Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), clusterCount(Arg<3>()), data(Arg<4>()) {}
+	virtual ~Rasterizer() {}
+
+protected:
+	Pointer<Byte> primitive;
+	Int count;
+	Int cluster;
+	Int clusterCount;
+	Pointer<Byte> data;
+};
+
+}  // namespace sw
 
 #endif   // sw_Rasterizer_hpp

diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 46bed2a..776eb13 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp

@@ -43,1097 +43,832 @@
 unsigned int maxPrimitives = 1 << 21;
 #endif
 
-namespace sw
+namespace sw {
+
+template<typename T>
+inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, VkProvokingVertexModeEXT provokingVertexMode, T indices, unsigned int start, unsigned int triangleCount)
 {
-	template<typename T>
-	inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, VkProvokingVertexModeEXT provokingVertexMode, T indices, unsigned int start, unsigned int triangleCount)
+	bool provokeFirst = (provokingVertexMode == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT);
+
+	switch(topology)
 	{
-		bool provokeFirst = (provokingVertexMode == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT);
-
-		switch(topology)
+	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+	{
+		auto index = start;
+		auto pointBatch = &(batch[0][0]);
+		for(unsigned int i = 0; i < triangleCount; i++)
 		{
-		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-		{
-			auto index = start;
-			auto pointBatch = &(batch[0][0]);
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				*pointBatch++ = indices[index++];
-			}
-
-			// Repeat the last index to allow for SIMD width overrun.
-			index--;
-			for(unsigned int i = 0; i < 3; i++)
-			{
-				*pointBatch++ = indices[index];
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-		{
-			auto index = 2 * start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
-				batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + 1];
-
-				index += 2;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-		{
-			auto index = start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
-				batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + 1];
-
-				index += 1;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-		{
-			auto index = 3 * start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
-				batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + (provokeFirst ? 2 : 1)];
-
-				index += 3;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-		{
-			auto index = start;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
-				batch[i][1] = indices[index + ((start + i) & 1) + (provokeFirst ? 1 : 0)];
-				batch[i][2] = indices[index + (~(start + i) & 1) + (provokeFirst ? 1 : 0)];
-
-				index += 1;
-			}
-			break;
-		}
-		case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-		{
-			auto index = start + 1;
-			for(unsigned int i = 0; i < triangleCount; i++)
-			{
-				batch[i][provokeFirst ? 0 : 2] = indices[index + 0];
-				batch[i][provokeFirst ? 1 : 0] = indices[index + 1];
-				batch[i][provokeFirst ? 2 : 1] = indices[0];
-
-				index += 1;
-			}
-			break;
-		}
-		default:
-			ASSERT(false);
-			return false;
+			*pointBatch++ = indices[index++];
 		}
 
-		return true;
+		// Repeat the last index to allow for SIMD width overrun.
+		index--;
+		for(unsigned int i = 0; i < 3; i++)
+		{
+			*pointBatch++ = indices[index];
+		}
+		break;
 	}
-
-	DrawCall::DrawCall()
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
 	{
-		data = (DrawData*)allocate(sizeof(DrawData));
-		data->constants = &constants;
+		auto index = 2 * start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
+			batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + 1];
+
+			index += 2;
+		}
+		break;
 	}
-
-	DrawCall::~DrawCall()
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
 	{
-		deallocate(data);
+		auto index = start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 1)];
+			batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + 1];
+
+			index += 1;
+		}
+		break;
 	}
-
-	Renderer::Renderer(vk::Device* device) : device(device)
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
 	{
-		VertexProcessor::setRoutineCacheSize(1024);
-		PixelProcessor::setRoutineCacheSize(1024);
-		SetupProcessor::setRoutineCacheSize(1024);
+		auto index = 3 * start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
+			batch[i][1] = indices[index + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + (provokeFirst ? 2 : 1)];
+
+			index += 3;
+		}
+		break;
 	}
-
-	Renderer::~Renderer()
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
 	{
-		drawTickets.take().wait();
+		auto index = start;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][0] = indices[index + (provokeFirst ? 0 : 2)];
+			batch[i][1] = indices[index + ((start + i) & 1) + (provokeFirst ? 1 : 0)];
+			batch[i][2] = indices[index + (~(start + i) & 1) + (provokeFirst ? 1 : 0)];
+
+			index += 1;
+		}
+		break;
 	}
-
-	// Renderer objects have to be mem aligned to the alignment provided in the class declaration
-	void* Renderer::operator new(size_t size)
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
 	{
-		ASSERT(size == sizeof(Renderer));  // This operator can't be called from a derived class
-		return vk::allocate(sizeof(Renderer), alignof(Renderer), vk::DEVICE_MEMORY, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+		auto index = start + 1;
+		for(unsigned int i = 0; i < triangleCount; i++)
+		{
+			batch[i][provokeFirst ? 0 : 2] = indices[index + 0];
+			batch[i][provokeFirst ? 1 : 0] = indices[index + 1];
+			batch[i][provokeFirst ? 2 : 1] = indices[0];
+
+			index += 1;
+		}
+		break;
 	}
-
-	void Renderer::operator delete(void* mem)
-	{
-		vk::deallocate(mem, vk::DEVICE_MEMORY);
-	}
-
-	void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
-			TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
-			PushConstantStorage const & pushConstants, bool update)
-	{
-		if(count == 0) { return; }
-
-		auto id = nextDrawID++;
-		MARL_SCOPED_EVENT("draw %d", id);
-
-		#ifndef NDEBUG
-		{
-			unsigned int minPrimitives = 1;
-			unsigned int maxPrimitives = 1 << 21;
-			if(count < minPrimitives || count > maxPrimitives)
-			{
-				return;
-			}
-		}
-		#endif
-
-		int ms = context->sampleCount;
-
-		if(!context->multiSampleMask)
-		{
-			return;
-		}
-
-		marl::Pool<sw::DrawCall>::Loan draw;
-		{
-			MARL_SCOPED_EVENT("drawCallPool.borrow()");
-			draw = drawCallPool.borrow();
-		}
-		draw->id = id;
-
-		if(update)
-		{
-			MARL_SCOPED_EVENT("update");
-			vertexState = VertexProcessor::update(context);
-			setupState = SetupProcessor::update(context);
-			pixelState = PixelProcessor::update(context);
-
-			vertexRoutine = VertexProcessor::routine(vertexState, context->pipelineLayout, context->vertexShader, context->descriptorSets);
-			setupRoutine = SetupProcessor::routine(setupState);
-			pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
-		}
-
-		DrawCall::SetupFunction setupPrimitives = nullptr;
-		unsigned int numPrimitivesPerBatch = MaxBatchSize / ms;
-
-		if(context->isDrawTriangle(false))
-		{
-			switch(context->polygonMode)
-			{
-			case VK_POLYGON_MODE_FILL:
-				setupPrimitives = &DrawCall::setupSolidTriangles;
-				break;
-			case VK_POLYGON_MODE_LINE:
-				setupPrimitives = &DrawCall::setupWireframeTriangles;
-				numPrimitivesPerBatch /= 3;
-				break;
-			case VK_POLYGON_MODE_POINT:
-				setupPrimitives = &DrawCall::setupPointTriangles;
-				numPrimitivesPerBatch /= 3;
-				break;
-			default:
-				UNSUPPORTED("polygon mode: %d", int(context->polygonMode));
-				return;
-			}
-		}
-		else if(context->isDrawLine(false))
-		{
-			setupPrimitives = &DrawCall::setupLines;
-		}
-		else  // Point primitive topology
-		{
-			setupPrimitives = &DrawCall::setupPoints;
-		}
-
-		DrawData *data = draw->data;
-		draw->occlusionQuery = occlusionQuery;
-		draw->batchDataPool = &batchDataPool;
-		draw->numPrimitives = count;
-		draw->numPrimitivesPerBatch = numPrimitivesPerBatch;
-		draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
-		draw->topology = context->topology;
-		draw->provokingVertexMode = context->provokingVertexMode;
-		draw->indexType = indexType;
-		draw->lineRasterizationMode = context->lineRasterizationMode;
-
-		draw->vertexRoutine = vertexRoutine;
-		draw->setupRoutine = setupRoutine;
-		draw->pixelRoutine = pixelRoutine;
-		draw->setupPrimitives = setupPrimitives;
-		draw->setupState = setupState;
-
-		data->descriptorSets = context->descriptorSets;
-		data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
-
-		for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
-		{
-			data->input[i] = context->input[i].buffer;
-			data->robustnessSize[i] = context->input[i].robustnessSize;
-			data->stride[i] = context->input[i].vertexStride;
-		}
-
-		data->indices = indexBuffer;
-		data->viewID = viewID;
-		data->instanceID = instanceID;
-		data->baseVertex = baseVertex;
-
-		if(pixelState.stencilActive)
-		{
-			data->stencil[0].set(context->frontStencil.reference, context->frontStencil.compareMask, context->frontStencil.writeMask);
-			data->stencil[1].set(context->backStencil.reference, context->backStencil.compareMask, context->backStencil.writeMask);
-		}
-
-		data->lineWidth = context->lineWidth;
-
-		data->factor = factor;
-
-		if(pixelState.alphaToCoverage)
-		{
-			if(ms == 4)
-			{
-				data->a2c0 = replicate(0.2f);
-				data->a2c1 = replicate(0.4f);
-				data->a2c2 = replicate(0.6f);
-				data->a2c3 = replicate(0.8f);
-			}
-			else if(ms == 2)
-			{
-				data->a2c0 = replicate(0.25f);
-				data->a2c1 = replicate(0.75f);
-			}
-			else ASSERT(false);
-		}
-
-		if(pixelState.occlusionEnabled)
-		{
-			for(int cluster = 0; cluster < MaxClusterCount; cluster++)
-			{
-				data->occlusion[cluster] = 0;
-			}
-		}
-
-		// Viewport
-		{
-			float W = 0.5f * viewport.width;
-			float H = 0.5f * viewport.height;
-			float X0 = viewport.x + W;
-			float Y0 = viewport.y + H;
-			float N = viewport.minDepth;
-			float F = viewport.maxDepth;
-			float Z = F - N;
-			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
-
-			if(context->isDrawTriangle(false))
-			{
-				N += context->depthBias;
-			}
-
-			data->WxF = replicate(W * subPixF);
-			data->HxF = replicate(H * subPixF);
-			data->X0xF = replicate(X0 * subPixF - subPixF / 2);
-			data->Y0xF = replicate(Y0 * subPixF - subPixF / 2);
-			data->halfPixelX = replicate(0.5f / W);
-			data->halfPixelY = replicate(0.5f / H);
-			data->viewportHeight = abs(viewport.height);
-			data->slopeDepthBias = context->slopeDepthBias;
-			data->depthRange = Z;
-			data->depthNear = N;
-		}
-
-		// Target
-		{
-			for(int index = 0; index < RENDERTARGETS; index++)
-			{
-				draw->renderTarget[index] = context->renderTarget[index];
-
-				if(draw->renderTarget[index])
-				{
-					data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_COLOR_BIT, 0, data->viewID);
-					data->colorPitchB[index] = context->renderTarget[index]->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
-					data->colorSliceB[index] = context->renderTarget[index]->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
-				}
-			}
-
-			draw->depthBuffer = context->depthBuffer;
-			draw->stencilBuffer = context->stencilBuffer;
-
-			if(draw->depthBuffer)
-			{
-				data->depthBuffer = (float*)context->depthBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_DEPTH_BIT, 0, data->viewID);
-				data->depthPitchB = context->depthBuffer->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
-				data->depthSliceB = context->depthBuffer->slicePitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
-			}
-
-			if(draw->stencilBuffer)
-			{
-				data->stencilBuffer = (unsigned char*)context->stencilBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_STENCIL_BIT, 0, data->viewID);
-				data->stencilPitchB = context->stencilBuffer->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
-				data->stencilSliceB = context->stencilBuffer->slicePitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
-			}
-		}
-
-		// Scissor
-		{
-			data->scissorX0 = clamp<int>(scissor.offset.x, 0, framebufferExtent.width);
-			data->scissorX1 = clamp<int>(scissor.offset.x + scissor.extent.width, 0, framebufferExtent.width);
-			data->scissorY0 = clamp<int>(scissor.offset.y, 0, framebufferExtent.height);
-			data->scissorY1 = clamp<int>(scissor.offset.y + scissor.extent.height, 0, framebufferExtent.height);
-		}
-
-		// Push constants
-		{
-			data->pushConstants = pushConstants;
-		}
-
-		draw->events = events;
-
-		DrawCall::run(draw, &drawTickets, clusterQueues);
-	}
-
-	void DrawCall::setup()
-	{
-		if(occlusionQuery != nullptr)
-		{
-			occlusionQuery->start();
-		}
-
-		if(events)
-		{
-			events->start();
-		}
-	}
-
-	void DrawCall::teardown()
-	{
-		if(events)
-		{
-			events->finish();
-			events = nullptr;
-		}
-
-		if (occlusionQuery != nullptr)
-		{
-			for(int cluster = 0; cluster < MaxClusterCount; cluster++)
-			{
-				occlusionQuery->add(data->occlusion[cluster]);
-			}
-			occlusionQuery->finish();
-		}
-
-		vertexRoutine = {};
-		setupRoutine = {};
-		pixelRoutine = {};
-	}
-
-	void DrawCall::run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount])
-	{
-		draw->setup();
-
-		auto const numPrimitives = draw->numPrimitives;
-		auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
-		auto const numBatches = draw->numBatches;
-
-		auto ticket = tickets->take();
-		auto finally = marl::make_shared_finally([draw, ticket] {
-			MARL_SCOPED_EVENT("FINISH draw %d", draw->id);
-			draw->teardown();
-			ticket.done();
-		});
-
-		for (unsigned int batchId = 0; batchId < numBatches; batchId++)
-		{
-			auto batch = draw->batchDataPool->borrow();
-			batch->id = batchId;
-			batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
-			batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
-
-			for (int cluster = 0; cluster < MaxClusterCount; cluster++)
-			{
-				batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
-			}
-
-			marl::schedule([draw, batch, finally] {
-
-				processVertices(draw.get(), batch.get());
-
-				if (!draw->setupState.rasterizerDiscard)
-				{
-					processPrimitives(draw.get(), batch.get());
-
-					if (batch->numVisible > 0)
-					{
-						processPixels(draw, batch, finally);
-						return;
-					}
-				}
-
-				for (int cluster = 0; cluster < MaxClusterCount; cluster++)
-				{
-					batch->clusterTickets[cluster].done();
-				}
-			});
-		}
-	}
-
-	void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
-	{
-		MARL_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
-
-		unsigned int triangleIndices[MaxBatchSize + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
-		{
-			MARL_SCOPED_EVENT("processPrimitiveVertices");
-			processPrimitiveVertices(
-				triangleIndices,
-				draw->data->indices,
-				draw->indexType,
-				batch->firstPrimitive,
-				batch->numPrimitives,
-				draw->topology,
-				draw->provokingVertexMode);
-		}
-
-		auto& vertexTask = batch->vertexTask;
-		vertexTask.primitiveStart = batch->firstPrimitive;
-		// We're only using batch compaction for points, not lines
-		vertexTask.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);
-		if (vertexTask.vertexCache.drawCall != draw->id)
-		{
-			vertexTask.vertexCache.clear();
-			vertexTask.vertexCache.drawCall = draw->id;
-		}
-
-		draw->vertexRoutine(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
-	}
-
-	void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
-	{
-		MARL_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
-		auto triangles = &batch->triangles[0];
-		auto primitives = &batch->primitives[0];
-		batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
-	}
-
-	void DrawCall::processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
-	{
-		struct Data
-		{
-			Data(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
-				: draw(draw), batch(batch), finally(finally) {}
-			marl::Loan<DrawCall> draw;
-			marl::Loan<BatchData> batch;
-			std::shared_ptr<marl::Finally> finally;
-		};
-		auto data = std::make_shared<Data>(draw, batch, finally);
-		for (int cluster = 0; cluster < MaxClusterCount; cluster++)
-		{
-			batch->clusterTickets[cluster].onCall([data, cluster]
-			{
-				auto& draw = data->draw;
-				auto& batch = data->batch;
-				MARL_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
-				draw->pixelRoutine(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
-				batch->clusterTickets[cluster].done();
-			});
-		}
-	}
-
-	void Renderer::synchronize()
-	{
-		MARL_SCOPED_EVENT("synchronize");
-		auto ticket = drawTickets.take();
-		ticket.wait();
-		device->updateSamplingRoutineConstCache();
-		ticket.done();
-	}
-
-	void DrawCall::processPrimitiveVertices(
-		unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
-		const void *primitiveIndices,
-		VkIndexType indexType,
-		unsigned int start,
-		unsigned int triangleCount,
-		VkPrimitiveTopology topology,
-		VkProvokingVertexModeEXT provokingVertexMode)
-	{
-		if(!primitiveIndices)
-		{
-			struct LinearIndex
-			{
-				unsigned int operator[](unsigned int i) { return i; }
-			};
-
-			if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, LinearIndex(), start, triangleCount))
-			{
-				return;
-			}
-		}
-		else
-		{
-			switch(indexType)
-			{
-			case VK_INDEX_TYPE_UINT16:
-				if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
-				{
-					return;
-				}
-				break;
-			case VK_INDEX_TYPE_UINT32:
-				if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
-				{
-					return;
-				}
-				break;
-			break;
-			default:
-				ASSERT(false);
-				return;
-			}
-		}
-
-		// setBatchIndices() takes care of the point case, since it's different due to the compaction
-		if (topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
-		{
-			// Repeat the last index to allow for SIMD width overrun.
-			triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
-			triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
-			triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
-		}
-	}
-
-	int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto &state = drawCall->setupState;
-
-		int ms = state.multiSample;
-		const DrawData *data = drawCall->data;
-		int visible = 0;
-
-		for(int i = 0; i < count; i++, triangles++)
-		{
-			Vertex &v0 = triangles->v0;
-			Vertex &v1 = triangles->v1;
-			Vertex &v2 = triangles->v2;
-
-			Polygon polygon(&v0.position, &v1.position, &v2.position);
-
-
-			if((v0.cullMask | v1.cullMask | v2.cullMask) == 0)
-			{
-				continue;
-			}
-
-			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) != Clipper::CLIP_FINITE)
-			{
-				continue;
-			}
-
-			int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags;
-			if(clipFlagsOr != Clipper::CLIP_FINITE)
-			{
-				if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
-				{
-					continue;
-				}
-			}
-
-			if(drawCall->setupRoutine(primitives, triangles, &polygon, data))
-			{
-				primitives += ms;
-				visible++;
-			}
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupWireframeTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto& state = drawCall->setupState;
-
-		int ms = state.multiSample;
-		int visible = 0;
-
-		for(int i = 0; i < count; i++)
-		{
-			const Vertex &v0 = triangles[i].v0;
-			const Vertex &v1 = triangles[i].v1;
-			const Vertex &v2 = triangles[i].v2;
-
-			float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
-			          (v0.x * v2.y - v0.y * v2.x) * v1.w +
-			          (v2.x * v1.y - v1.x * v2.y) * v0.w;
-
-			bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
-			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
-			{
-				if(frontFacing) continue;
-			}
-			if(state.cullMode & VK_CULL_MODE_BACK_BIT)
-			{
-				if(!frontFacing) continue;
-			}
-
-			Triangle lines[3];
-			lines[0].v0 = v0;
-			lines[0].v1 = v1;
-			lines[1].v0 = v1;
-			lines[1].v1 = v2;
-			lines[2].v0 = v2;
-			lines[2].v1 = v0;
-
-			for(int i = 0; i < 3; i++)
-			{
-				if(setupLine(*primitives, lines[i], *drawCall))
-				{
-					primitives += ms;
-					visible++;
-				}
-			}
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupPointTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto& state = drawCall->setupState;
-
-		int ms = state.multiSample;
-		int visible = 0;
-
-		for(int i = 0; i < count; i++)
-		{
-			const Vertex &v0 = triangles[i].v0;
-			const Vertex &v1 = triangles[i].v1;
-			const Vertex &v2 = triangles[i].v2;
-
-			float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
-			          (v0.x * v2.y - v0.y * v2.x) * v1.w +
-			          (v2.x * v1.y - v1.x * v2.y) * v0.w;
-
-			bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
-			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
-			{
-				if(frontFacing) continue;
-			}
-			if(state.cullMode & VK_CULL_MODE_BACK_BIT)
-			{
-				if(!frontFacing) continue;
-			}
-
-			Triangle points[3];
-			points[0].v0 = v0;
-			points[1].v0 = v1;
-			points[2].v0 = v2;
-
-			for(int i = 0; i < 3; i++)
-			{
-				if(setupPoint(*primitives, points[i], *drawCall))
-				{
-					primitives += ms;
-					visible++;
-				}
-			}
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto &state = drawCall->setupState;
-
-		int visible = 0;
-		int ms = state.multiSample;
-
-		for(int i = 0; i < count; i++)
-		{
-			if(setupLine(*primitives, *triangles, *drawCall))
-			{
-				primitives += ms;
-				visible++;
-			}
-
-			triangles++;
-		}
-
-		return visible;
-	}
-
-	int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
-	{
-		auto &state = drawCall->setupState;
-
-		int visible = 0;
-		int ms = state.multiSample;
-
-		for(int i = 0; i < count; i++)
-		{
-			if(setupPoint(*primitives, *triangles, *drawCall))
-			{
-				primitives += ms;
-				visible++;
-			}
-
-			triangles++;
-		}
-
-		return visible;
-	}
-
-	bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
-	{
-		const DrawData &data = *draw.data;
-
-		float lineWidth = data.lineWidth;
-
-		Vertex &v0 = triangle.v0;
-		Vertex &v1 = triangle.v1;
-
-		if((v0.cullMask | v1.cullMask) == 0)
-		{
-			return false;
-		}
-
-		const float4 &P0 = v0.position;
-		const float4 &P1 = v1.position;
-
-		if(P0.w <= 0 && P1.w <= 0)
-		{
-			return false;
-		}
-
-		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
-
-		const float W = data.WxF[0] * (1.0f / subPixF);
-		const float H = data.HxF[0] * (1.0f / subPixF);
-
-		float dx = W * (P1.x / P1.w - P0.x / P0.w);
-		float dy = H * (P1.y / P1.w - P0.y / P0.w);
-
-		if(dx == 0 && dy == 0)
-		{
-			return false;
-		}
-
-		if(draw.lineRasterizationMode != VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT)
-		{
-			// Rectangle centered on the line segment
-
-			float4 P[4];
-			int C[4];
-
-			P[0] = P0;
-			P[1] = P1;
-			P[2] = P1;
-			P[3] = P0;
-
-			float scale = lineWidth * 0.5f / sqrt(dx*dx + dy*dy);
-
-			dx *= scale;
-			dy *= scale;
-
-			float dx0h = dx * P0.w / H;
-			float dy0w = dy * P0.w / W;
-
-			float dx1h = dx * P1.w / H;
-			float dy1w = dy * P1.w / W;
-
-			P[0].x += -dy0w;
-			P[0].y += +dx0h;
-			C[0] = Clipper::ComputeClipFlags(P[0]);
-
-			P[1].x += -dy1w;
-			P[1].y += +dx1h;
-			C[1] = Clipper::ComputeClipFlags(P[1]);
-
-			P[2].x += +dy1w;
-			P[2].y += -dx1h;
-			C[2] = Clipper::ComputeClipFlags(P[2]);
-
-			P[3].x += +dy0w;
-			P[3].y += -dx0h;
-			C[3] = Clipper::ComputeClipFlags(P[3]);
-
-			if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
-			{
-				Polygon polygon(P, 4);
-
-				int clipFlagsOr = C[0] | C[1] | C[2] | C[3];
-
-				if(clipFlagsOr != Clipper::CLIP_FINITE)
-				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, draw))
-					{
-						return false;
-					}
-				}
-
-				return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
-			}
-		}
-		else if(false)  // TODO(b/80135519): Deprecate
-		{
-			// Connecting diamonds polygon
-			// This shape satisfies the diamond test convention, except for the exit rule part.
-			// Line segments with overlapping endpoints have duplicate fragments.
-			// The ideal algorithm requires half-open line rasterization (b/80135519).
-
-			float4 P[8];
-			int C[8];
-
-			P[0] = P0;
-			P[1] = P0;
-			P[2] = P0;
-			P[3] = P0;
-			P[4] = P1;
-			P[5] = P1;
-			P[6] = P1;
-			P[7] = P1;
-
-			float dx0 = lineWidth * 0.5f * P0.w / W;
-			float dy0 = lineWidth * 0.5f * P0.w / H;
-
-			float dx1 = lineWidth * 0.5f * P1.w / W;
-			float dy1 = lineWidth * 0.5f * P1.w / H;
-
-			P[0].x += -dx0;
-			C[0] = Clipper::ComputeClipFlags(P[0]);
-
-			P[1].y += +dy0;
-			C[1] = Clipper::ComputeClipFlags(P[1]);
-
-			P[2].x += +dx0;
-			C[2] = Clipper::ComputeClipFlags(P[2]);
-
-			P[3].y += -dy0;
-			C[3] = Clipper::ComputeClipFlags(P[3]);
-
-			P[4].x += -dx1;
-			C[4] = Clipper::ComputeClipFlags(P[4]);
-
-			P[5].y += +dy1;
-			C[5] = Clipper::ComputeClipFlags(P[5]);
-
-			P[6].x += +dx1;
-			C[6] = Clipper::ComputeClipFlags(P[6]);
-
-			P[7].y += -dy1;
-			C[7] = Clipper::ComputeClipFlags(P[7]);
-
-			if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE)
-			{
-				float4 L[6];
-
-				if(dx > -dy)
-				{
-					if(dx > dy)   // Right
-					{
-						L[0] = P[0];
-						L[1] = P[1];
-						L[2] = P[5];
-						L[3] = P[6];
-						L[4] = P[7];
-						L[5] = P[3];
-					}
-					else   // Down
-					{
-						L[0] = P[0];
-						L[1] = P[4];
-						L[2] = P[5];
-						L[3] = P[6];
-						L[4] = P[2];
-						L[5] = P[3];
-					}
-				}
-				else
-				{
-					if(dx > dy)   // Up
-					{
-						L[0] = P[0];
-						L[1] = P[1];
-						L[2] = P[2];
-						L[3] = P[6];
-						L[4] = P[7];
-						L[5] = P[4];
-					}
-					else   // Left
-					{
-						L[0] = P[1];
-						L[1] = P[2];
-						L[2] = P[3];
-						L[3] = P[7];
-						L[4] = P[4];
-						L[5] = P[5];
-					}
-				}
-
-				Polygon polygon(L, 6);
-
-				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7];
-
-				if(clipFlagsOr != Clipper::CLIP_FINITE)
-				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, draw))
-					{
-						return false;
-					}
-				}
-
-				return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
-			}
-		}
-		else
-		{
-			// Parallelogram approximating Bresenham line
-			// This algorithm does not satisfy the ideal diamond-exit rule, but does avoid the
-			// duplicate fragment rasterization problem and satisfies all of Vulkan's minimum
-			// requirements for Bresenham line segment rasterization.
-
-			float4 P[8];
-			P[0] = P0;
-			P[1] = P0;
-			P[2] = P0;
-			P[3] = P0;
-			P[4] = P1;
-			P[5] = P1;
-			P[6] = P1;
-			P[7] = P1;
-
-			float dx0 = lineWidth * 0.5f * P0.w / W;
-			float dy0 = lineWidth * 0.5f * P0.w / H;
-
-			float dx1 = lineWidth * 0.5f * P1.w / W;
-			float dy1 = lineWidth * 0.5f * P1.w / H;
-
-			P[0].x += -dx0;
-			P[1].y += +dy0;
-			P[2].x += +dx0;
-			P[3].y += -dy0;
-			P[4].x += -dx1;
-			P[5].y += +dy1;
-			P[6].x += +dx1;
-			P[7].y += -dy1;
-
-			float4 L[4];
-
-			if(dx > -dy)
-			{
-				if(dx > dy)   // Right
-				{
-					L[0] = P[1];
-					L[1] = P[5];
-					L[2] = P[7];
-					L[3] = P[3];
-				}
-				else   // Down
-				{
-					L[0] = P[0];
-					L[1] = P[4];
-					L[2] = P[6];
-					L[3] = P[2];
-				}
-			}
-			else
-			{
-				if(dx > dy)   // Up
-				{
-					L[0] = P[0];
-					L[1] = P[2];
-					L[2] = P[6];
-					L[3] = P[4];
-				}
-				else   // Left
-				{
-					L[0] = P[1];
-					L[1] = P[3];
-					L[2] = P[7];
-					L[3] = P[5];
-				}
-			}
-
-			int C0 = Clipper::ComputeClipFlags(L[0]);
-			int C1 = Clipper::ComputeClipFlags(L[1]);
-			int C2 = Clipper::ComputeClipFlags(L[2]);
-			int C3 = Clipper::ComputeClipFlags(L[3]);
-
-			if((C0 & C1 & C2 & C3) == Clipper::CLIP_FINITE)
-			{
-				Polygon polygon(L, 4);
-
-				int clipFlagsOr = C0 | C1 | C2 | C3;
-
-				if(clipFlagsOr != Clipper::CLIP_FINITE)
-				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, draw))
-					{
-						return false;
-					}
-				}
-
-				return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
-			}
-		}
-
+	default:
+		ASSERT(false);
 		return false;
 	}
 
-	bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+	return true;
+}
+
+DrawCall::DrawCall()
+{
+	data = (DrawData*)allocate(sizeof(DrawData));
+	data->constants = &constants;
+}
+
+DrawCall::~DrawCall()
+{
+	deallocate(data);
+}
+
+Renderer::Renderer(vk::Device* device) : device(device)
+{
+	VertexProcessor::setRoutineCacheSize(1024);
+	PixelProcessor::setRoutineCacheSize(1024);
+	SetupProcessor::setRoutineCacheSize(1024);
+}
+
+Renderer::~Renderer()
+{
+	drawTickets.take().wait();
+}
+
+// Renderer objects have to be mem aligned to the alignment provided in the class declaration
+void* Renderer::operator new(size_t size)
+{
+	ASSERT(size == sizeof(Renderer));  // This operator can't be called from a derived class
+	return vk::allocate(sizeof(Renderer), alignof(Renderer), vk::DEVICE_MEMORY, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+}
+
+void Renderer::operator delete(void* mem)
+{
+	vk::deallocate(mem, vk::DEVICE_MEMORY);
+}
+
+void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
+		TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
+		PushConstantStorage const & pushConstants, bool update)
+{
+	if(count == 0) { return; }
+
+	auto id = nextDrawID++;
+	MARL_SCOPED_EVENT("draw %d", id);
+
+	#ifndef NDEBUG
 	{
-		const DrawData &data = *draw.data;
-
-		Vertex &v = triangle.v0;
-
-		if(v.cullMask == 0)
+		unsigned int minPrimitives = 1;
+		unsigned int maxPrimitives = 1 << 21;
+		if(count < minPrimitives || count > maxPrimitives)
 		{
-			return false;
+			return;
+		}
+	}
+	#endif
+
+	int ms = context->sampleCount;
+
+	if(!context->multiSampleMask)
+	{
+		return;
+	}
+
+	marl::Pool<sw::DrawCall>::Loan draw;
+	{
+		MARL_SCOPED_EVENT("drawCallPool.borrow()");
+		draw = drawCallPool.borrow();
+	}
+	draw->id = id;
+
+	if(update)
+	{
+		MARL_SCOPED_EVENT("update");
+		vertexState = VertexProcessor::update(context);
+		setupState = SetupProcessor::update(context);
+		pixelState = PixelProcessor::update(context);
+
+		vertexRoutine = VertexProcessor::routine(vertexState, context->pipelineLayout, context->vertexShader, context->descriptorSets);
+		setupRoutine = SetupProcessor::routine(setupState);
+		pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
+	}
+
+	DrawCall::SetupFunction setupPrimitives = nullptr;
+	unsigned int numPrimitivesPerBatch = MaxBatchSize / ms;
+
+	if(context->isDrawTriangle(false))
+	{
+		switch(context->polygonMode)
+		{
+		case VK_POLYGON_MODE_FILL:
+			setupPrimitives = &DrawCall::setupSolidTriangles;
+			break;
+		case VK_POLYGON_MODE_LINE:
+			setupPrimitives = &DrawCall::setupWireframeTriangles;
+			numPrimitivesPerBatch /= 3;
+			break;
+		case VK_POLYGON_MODE_POINT:
+			setupPrimitives = &DrawCall::setupPointTriangles;
+			numPrimitivesPerBatch /= 3;
+			break;
+		default:
+			UNSUPPORTED("polygon mode: %d", int(context->polygonMode));
+			return;
+		}
+	}
+	else if(context->isDrawLine(false))
+	{
+		setupPrimitives = &DrawCall::setupLines;
+	}
+	else  // Point primitive topology
+	{
+		setupPrimitives = &DrawCall::setupPoints;
+	}
+
+	DrawData *data = draw->data;
+	draw->occlusionQuery = occlusionQuery;
+	draw->batchDataPool = &batchDataPool;
+	draw->numPrimitives = count;
+	draw->numPrimitivesPerBatch = numPrimitivesPerBatch;
+	draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
+	draw->topology = context->topology;
+	draw->provokingVertexMode = context->provokingVertexMode;
+	draw->indexType = indexType;
+	draw->lineRasterizationMode = context->lineRasterizationMode;
+
+	draw->vertexRoutine = vertexRoutine;
+	draw->setupRoutine = setupRoutine;
+	draw->pixelRoutine = pixelRoutine;
+	draw->setupPrimitives = setupPrimitives;
+	draw->setupState = setupState;
+
+	data->descriptorSets = context->descriptorSets;
+	data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
+
+	for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
+	{
+		data->input[i] = context->input[i].buffer;
+		data->robustnessSize[i] = context->input[i].robustnessSize;
+		data->stride[i] = context->input[i].vertexStride;
+	}
+
+	data->indices = indexBuffer;
+	data->viewID = viewID;
+	data->instanceID = instanceID;
+	data->baseVertex = baseVertex;
+
+	if(pixelState.stencilActive)
+	{
+		data->stencil[0].set(context->frontStencil.reference, context->frontStencil.compareMask, context->frontStencil.writeMask);
+		data->stencil[1].set(context->backStencil.reference, context->backStencil.compareMask, context->backStencil.writeMask);
+	}
+
+	data->lineWidth = context->lineWidth;
+
+	data->factor = factor;
+
+	if(pixelState.alphaToCoverage)
+	{
+		if(ms == 4)
+		{
+			data->a2c0 = replicate(0.2f);
+			data->a2c1 = replicate(0.4f);
+			data->a2c2 = replicate(0.6f);
+			data->a2c3 = replicate(0.8f);
+		}
+		else if(ms == 2)
+		{
+			data->a2c0 = replicate(0.25f);
+			data->a2c1 = replicate(0.75f);
+		}
+		else ASSERT(false);
+	}
+
+	if(pixelState.occlusionEnabled)
+	{
+		for(int cluster = 0; cluster < MaxClusterCount; cluster++)
+		{
+			data->occlusion[cluster] = 0;
+		}
+	}
+
+	// Viewport
+	{
+		float W = 0.5f * viewport.width;
+		float H = 0.5f * viewport.height;
+		float X0 = viewport.x + W;
+		float Y0 = viewport.y + H;
+		float N = viewport.minDepth;
+		float F = viewport.maxDepth;
+		float Z = F - N;
+		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+		if(context->isDrawTriangle(false))
+		{
+			N += context->depthBias;
 		}
 
-		float pSize = v.pointSize;
+		data->WxF = replicate(W * subPixF);
+		data->HxF = replicate(H * subPixF);
+		data->X0xF = replicate(X0 * subPixF - subPixF / 2);
+		data->Y0xF = replicate(Y0 * subPixF - subPixF / 2);
+		data->halfPixelX = replicate(0.5f / W);
+		data->halfPixelY = replicate(0.5f / H);
+		data->viewportHeight = abs(viewport.height);
+		data->slopeDepthBias = context->slopeDepthBias;
+		data->depthRange = Z;
+		data->depthNear = N;
+	}
 
-		pSize = clamp(pSize, 1.0f, static_cast<float>(vk::MAX_POINT_SIZE));
+	// Target
+	{
+		for(int index = 0; index < RENDERTARGETS; index++)
+		{
+			draw->renderTarget[index] = context->renderTarget[index];
+
+			if(draw->renderTarget[index])
+			{
+				data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_COLOR_BIT, 0, data->viewID);
+				data->colorPitchB[index] = context->renderTarget[index]->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
+				data->colorSliceB[index] = context->renderTarget[index]->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
+			}
+		}
+
+		draw->depthBuffer = context->depthBuffer;
+		draw->stencilBuffer = context->stencilBuffer;
+
+		if(draw->depthBuffer)
+		{
+			data->depthBuffer = (float*)context->depthBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_DEPTH_BIT, 0, data->viewID);
+			data->depthPitchB = context->depthBuffer->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
+			data->depthSliceB = context->depthBuffer->slicePitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
+		}
+
+		if(draw->stencilBuffer)
+		{
+			data->stencilBuffer = (unsigned char*)context->stencilBuffer->getOffsetPointer({0, 0, 0}, VK_IMAGE_ASPECT_STENCIL_BIT, 0, data->viewID);
+			data->stencilPitchB = context->stencilBuffer->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
+			data->stencilSliceB = context->stencilBuffer->slicePitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
+		}
+	}
+
+	// Scissor
+	{
+		data->scissorX0 = clamp<int>(scissor.offset.x, 0, framebufferExtent.width);
+		data->scissorX1 = clamp<int>(scissor.offset.x + scissor.extent.width, 0, framebufferExtent.width);
+		data->scissorY0 = clamp<int>(scissor.offset.y, 0, framebufferExtent.height);
+		data->scissorY1 = clamp<int>(scissor.offset.y + scissor.extent.height, 0, framebufferExtent.height);
+	}
+
+	// Push constants
+	{
+		data->pushConstants = pushConstants;
+	}
+
+	draw->events = events;
+
+	DrawCall::run(draw, &drawTickets, clusterQueues);
+}
+
+void DrawCall::setup()
+{
+	if(occlusionQuery != nullptr)
+	{
+		occlusionQuery->start();
+	}
+
+	if(events)
+	{
+		events->start();
+	}
+}
+
+void DrawCall::teardown()
+{
+	if(events)
+	{
+		events->finish();
+		events = nullptr;
+	}
+
+	if (occlusionQuery != nullptr)
+	{
+		for(int cluster = 0; cluster < MaxClusterCount; cluster++)
+		{
+			occlusionQuery->add(data->occlusion[cluster]);
+		}
+		occlusionQuery->finish();
+	}
+
+	vertexRoutine = {};
+	setupRoutine = {};
+	pixelRoutine = {};
+}
+
+void DrawCall::run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount])
+{
+	draw->setup();
+
+	auto const numPrimitives = draw->numPrimitives;
+	auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
+	auto const numBatches = draw->numBatches;
+
+	auto ticket = tickets->take();
+	auto finally = marl::make_shared_finally([draw, ticket] {
+		MARL_SCOPED_EVENT("FINISH draw %d", draw->id);
+		draw->teardown();
+		ticket.done();
+	});
+
+	for (unsigned int batchId = 0; batchId < numBatches; batchId++)
+	{
+		auto batch = draw->batchDataPool->borrow();
+		batch->id = batchId;
+		batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
+		batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
+
+		for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+		{
+			batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
+		}
+
+		marl::schedule([draw, batch, finally] {
+
+			processVertices(draw.get(), batch.get());
+
+			if (!draw->setupState.rasterizerDiscard)
+			{
+				processPrimitives(draw.get(), batch.get());
+
+				if (batch->numVisible > 0)
+				{
+					processPixels(draw, batch, finally);
+					return;
+				}
+			}
+
+			for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+			{
+				batch->clusterTickets[cluster].done();
+			}
+		});
+	}
+}
+
+void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
+{
+	MARL_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
+
+	unsigned int triangleIndices[MaxBatchSize + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
+	{
+		MARL_SCOPED_EVENT("processPrimitiveVertices");
+		processPrimitiveVertices(
+			triangleIndices,
+			draw->data->indices,
+			draw->indexType,
+			batch->firstPrimitive,
+			batch->numPrimitives,
+			draw->topology,
+			draw->provokingVertexMode);
+	}
+
+	auto& vertexTask = batch->vertexTask;
+	vertexTask.primitiveStart = batch->firstPrimitive;
+	// We're only using batch compaction for points, not lines
+	vertexTask.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);
+	if (vertexTask.vertexCache.drawCall != draw->id)
+	{
+		vertexTask.vertexCache.clear();
+		vertexTask.vertexCache.drawCall = draw->id;
+	}
+
+	draw->vertexRoutine(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
+}
+
+void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
+{
+	MARL_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
+	auto triangles = &batch->triangles[0];
+	auto primitives = &batch->primitives[0];
+	batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
+}
+
+void DrawCall::processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
+{
+	struct Data
+	{
+		Data(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally)
+			: draw(draw), batch(batch), finally(finally) {}
+		marl::Loan<DrawCall> draw;
+		marl::Loan<BatchData> batch;
+		std::shared_ptr<marl::Finally> finally;
+	};
+	auto data = std::make_shared<Data>(draw, batch, finally);
+	for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+	{
+		batch->clusterTickets[cluster].onCall([data, cluster]
+		{
+			auto& draw = data->draw;
+			auto& batch = data->batch;
+			MARL_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
+			draw->pixelRoutine(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
+			batch->clusterTickets[cluster].done();
+		});
+	}
+}
+
+void Renderer::synchronize()
+{
+	MARL_SCOPED_EVENT("synchronize");
+	auto ticket = drawTickets.take();
+	ticket.wait();
+	device->updateSamplingRoutineConstCache();
+	ticket.done();
+}
+
+void DrawCall::processPrimitiveVertices(
+	unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+	const void *primitiveIndices,
+	VkIndexType indexType,
+	unsigned int start,
+	unsigned int triangleCount,
+	VkPrimitiveTopology topology,
+	VkProvokingVertexModeEXT provokingVertexMode)
+{
+	if(!primitiveIndices)
+	{
+		struct LinearIndex
+		{
+			unsigned int operator[](unsigned int i) { return i; }
+		};
+
+		if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, LinearIndex(), start, triangleCount))
+		{
+			return;
+		}
+	}
+	else
+	{
+		switch(indexType)
+		{
+		case VK_INDEX_TYPE_UINT16:
+			if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
+			{
+				return;
+			}
+			break;
+		case VK_INDEX_TYPE_UINT32:
+			if(!setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
+			{
+				return;
+			}
+			break;
+		break;
+		default:
+			ASSERT(false);
+			return;
+		}
+	}
+
+	// setBatchIndices() takes care of the point case, since it's different due to the compaction
+	if (topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
+	{
+		// Repeat the last index to allow for SIMD width overrun.
+		triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
+		triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
+		triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
+	}
+}
+
+int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto &state = drawCall->setupState;
+
+	int ms = state.multiSample;
+	const DrawData *data = drawCall->data;
+	int visible = 0;
+
+	for(int i = 0; i < count; i++, triangles++)
+	{
+		Vertex &v0 = triangles->v0;
+		Vertex &v1 = triangles->v1;
+		Vertex &v2 = triangles->v2;
+
+		Polygon polygon(&v0.position, &v1.position, &v2.position);
+
+
+		if((v0.cullMask | v1.cullMask | v2.cullMask) == 0)
+		{
+			continue;
+		}
+
+		if((v0.clipFlags & v1.clipFlags & v2.clipFlags) != Clipper::CLIP_FINITE)
+		{
+			continue;
+		}
+
+		int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags;
+		if(clipFlagsOr != Clipper::CLIP_FINITE)
+		{
+			if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
+			{
+				continue;
+			}
+		}
+
+		if(drawCall->setupRoutine(primitives, triangles, &polygon, data))
+		{
+			primitives += ms;
+			visible++;
+		}
+	}
+
+	return visible;
+}
+
+int DrawCall::setupWireframeTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto& state = drawCall->setupState;
+
+	int ms = state.multiSample;
+	int visible = 0;
+
+	for(int i = 0; i < count; i++)
+	{
+		const Vertex &v0 = triangles[i].v0;
+		const Vertex &v1 = triangles[i].v1;
+		const Vertex &v2 = triangles[i].v2;
+
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
+		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
+		          (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
+		{
+			if(frontFacing) continue;
+		}
+		if(state.cullMode & VK_CULL_MODE_BACK_BIT)
+		{
+			if(!frontFacing) continue;
+		}
+
+		Triangle lines[3];
+		lines[0].v0 = v0;
+		lines[0].v1 = v1;
+		lines[1].v0 = v1;
+		lines[1].v1 = v2;
+		lines[2].v0 = v2;
+		lines[2].v1 = v0;
+
+		for(int i = 0; i < 3; i++)
+		{
+			if(setupLine(*primitives, lines[i], *drawCall))
+			{
+				primitives += ms;
+				visible++;
+			}
+		}
+	}
+
+	return visible;
+}
+
+int DrawCall::setupPointTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto& state = drawCall->setupState;
+
+	int ms = state.multiSample;
+	int visible = 0;
+
+	for(int i = 0; i < count; i++)
+	{
+		const Vertex &v0 = triangles[i].v0;
+		const Vertex &v1 = triangles[i].v1;
+		const Vertex &v2 = triangles[i].v2;
+
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
+		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
+		          (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
+		{
+			if(frontFacing) continue;
+		}
+		if(state.cullMode & VK_CULL_MODE_BACK_BIT)
+		{
+			if(!frontFacing) continue;
+		}
+
+		Triangle points[3];
+		points[0].v0 = v0;
+		points[1].v0 = v1;
+		points[2].v0 = v2;
+
+		for(int i = 0; i < 3; i++)
+		{
+			if(setupPoint(*primitives, points[i], *drawCall))
+			{
+				primitives += ms;
+				visible++;
+			}
+		}
+	}
+
+	return visible;
+}
+
+int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto &state = drawCall->setupState;
+
+	int visible = 0;
+	int ms = state.multiSample;
+
+	for(int i = 0; i < count; i++)
+	{
+		if(setupLine(*primitives, *triangles, *drawCall))
+		{
+			primitives += ms;
+			visible++;
+		}
+
+		triangles++;
+	}
+
+	return visible;
+}
+
+int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
+{
+	auto &state = drawCall->setupState;
+
+	int visible = 0;
+	int ms = state.multiSample;
+
+	for(int i = 0; i < count; i++)
+	{
+		if(setupPoint(*primitives, *triangles, *drawCall))
+		{
+			primitives += ms;
+			visible++;
+		}
+
+		triangles++;
+	}
+
+	return visible;
+}
+
+bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+{
+	const DrawData &data = *draw.data;
+
+	float lineWidth = data.lineWidth;
+
+	Vertex &v0 = triangle.v0;
+	Vertex &v1 = triangle.v1;
+
+	if((v0.cullMask | v1.cullMask) == 0)
+	{
+		return false;
+	}
+
+	const float4 &P0 = v0.position;
+	const float4 &P1 = v1.position;
+
+	if(P0.w <= 0 && P1.w <= 0)
+	{
+		return false;
+	}
+
+	constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+	const float W = data.WxF[0] * (1.0f / subPixF);
+	const float H = data.HxF[0] * (1.0f / subPixF);
+
+	float dx = W * (P1.x / P1.w - P0.x / P0.w);
+	float dy = H * (P1.y / P1.w - P0.y / P0.w);
+
+	if(dx == 0 && dy == 0)
+	{
+		return false;
+	}
+
+	if(draw.lineRasterizationMode != VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT)
+	{
+		// Rectangle centered on the line segment
 
 		float4 P[4];
 		int C[4];
 
-		P[0] = v.position;
-		P[1] = v.position;
-		P[2] = v.position;
-		P[3] = v.position;
+		P[0] = P0;
+		P[1] = P1;
+		P[2] = P1;
+		P[3] = P0;
 
-		const float X = pSize * P[0].w * data.halfPixelX[0];
-		const float Y = pSize * P[0].w * data.halfPixelY[0];
+		float scale = lineWidth * 0.5f / sqrt(dx*dx + dy*dy);
 
-		P[0].x -= X;
-		P[0].y += Y;
+		dx *= scale;
+		dy *= scale;
+
+		float dx0h = dx * P0.w / H;
+		float dy0w = dy * P0.w / W;
+
+		float dx1h = dx * P1.w / H;
+		float dy1w = dy * P1.w / W;
+
+		P[0].x += -dy0w;
+		P[0].y += +dx0h;
 		C[0] = Clipper::ComputeClipFlags(P[0]);
 
-		P[1].x += X;
-		P[1].y += Y;
+		P[1].x += -dy1w;
+		P[1].y += +dx1h;
 		C[1] = Clipper::ComputeClipFlags(P[1]);
 
-		P[2].x += X;
-		P[2].y -= Y;
+		P[2].x += +dy1w;
+		P[2].y += -dx1h;
 		C[2] = Clipper::ComputeClipFlags(P[2]);
 
-		P[3].x -= X;
-		P[3].y -= Y;
+		P[3].x += +dy0w;
+		P[3].y += -dx0h;
 		C[3] = Clipper::ComputeClipFlags(P[3]);
 
-		Polygon polygon(P, 4);
-
 		if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
 		{
+			Polygon polygon(P, 4);
+
 			int clipFlagsOr = C[0] | C[1] | C[2] | C[3];
 
 			if(clipFlagsOr != Clipper::CLIP_FINITE)
@@ -1144,57 +879,322 @@
 				}
 			}
 
-			triangle.v1 = triangle.v0;
-			triangle.v2 = triangle.v0;
-
-			constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
-
-			triangle.v1.projected.x += iround(subPixF * 0.5f * pSize);
-			triangle.v2.projected.y -= iround(subPixF * 0.5f * pSize) * (data.HxF[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
 			return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
 		}
+	}
+	else if(false)  // TODO(b/80135519): Deprecate
+	{
+		// Connecting diamonds polygon
+		// This shape satisfies the diamond test convention, except for the exit rule part.
+		// Line segments with overlapping endpoints have duplicate fragments.
+		// The ideal algorithm requires half-open line rasterization (b/80135519).
 
+		float4 P[8];
+		int C[8];
+
+		P[0] = P0;
+		P[1] = P0;
+		P[2] = P0;
+		P[3] = P0;
+		P[4] = P1;
+		P[5] = P1;
+		P[6] = P1;
+		P[7] = P1;
+
+		float dx0 = lineWidth * 0.5f * P0.w / W;
+		float dy0 = lineWidth * 0.5f * P0.w / H;
+
+		float dx1 = lineWidth * 0.5f * P1.w / W;
+		float dy1 = lineWidth * 0.5f * P1.w / H;
+
+		P[0].x += -dx0;
+		C[0] = Clipper::ComputeClipFlags(P[0]);
+
+		P[1].y += +dy0;
+		C[1] = Clipper::ComputeClipFlags(P[1]);
+
+		P[2].x += +dx0;
+		C[2] = Clipper::ComputeClipFlags(P[2]);
+
+		P[3].y += -dy0;
+		C[3] = Clipper::ComputeClipFlags(P[3]);
+
+		P[4].x += -dx1;
+		C[4] = Clipper::ComputeClipFlags(P[4]);
+
+		P[5].y += +dy1;
+		C[5] = Clipper::ComputeClipFlags(P[5]);
+
+		P[6].x += +dx1;
+		C[6] = Clipper::ComputeClipFlags(P[6]);
+
+		P[7].y += -dy1;
+		C[7] = Clipper::ComputeClipFlags(P[7]);
+
+		if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE)
+		{
+			float4 L[6];
+
+			if(dx > -dy)
+			{
+				if(dx > dy)   // Right
+				{
+					L[0] = P[0];
+					L[1] = P[1];
+					L[2] = P[5];
+					L[3] = P[6];
+					L[4] = P[7];
+					L[5] = P[3];
+				}
+				else   // Down
+				{
+					L[0] = P[0];
+					L[1] = P[4];
+					L[2] = P[5];
+					L[3] = P[6];
+					L[4] = P[2];
+					L[5] = P[3];
+				}
+			}
+			else
+			{
+				if(dx > dy)   // Up
+				{
+					L[0] = P[0];
+					L[1] = P[1];
+					L[2] = P[2];
+					L[3] = P[6];
+					L[4] = P[7];
+					L[5] = P[4];
+				}
+				else   // Left
+				{
+					L[0] = P[1];
+					L[1] = P[2];
+					L[2] = P[3];
+					L[3] = P[7];
+					L[4] = P[4];
+					L[5] = P[5];
+				}
+			}
+
+			Polygon polygon(L, 6);
+
+			int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7];
+
+			if(clipFlagsOr != Clipper::CLIP_FINITE)
+			{
+				if(!Clipper::Clip(polygon, clipFlagsOr, draw))
+				{
+					return false;
+				}
+			}
+
+			return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
+		}
+	}
+	else
+	{
+		// Parallelogram approximating Bresenham line
+		// This algorithm does not satisfy the ideal diamond-exit rule, but does avoid the
+		// duplicate fragment rasterization problem and satisfies all of Vulkan's minimum
+		// requirements for Bresenham line segment rasterization.
+
+		float4 P[8];
+		P[0] = P0;
+		P[1] = P0;
+		P[2] = P0;
+		P[3] = P0;
+		P[4] = P1;
+		P[5] = P1;
+		P[6] = P1;
+		P[7] = P1;
+
+		float dx0 = lineWidth * 0.5f * P0.w / W;
+		float dy0 = lineWidth * 0.5f * P0.w / H;
+
+		float dx1 = lineWidth * 0.5f * P1.w / W;
+		float dy1 = lineWidth * 0.5f * P1.w / H;
+
+		P[0].x += -dx0;
+		P[1].y += +dy0;
+		P[2].x += +dx0;
+		P[3].y += -dy0;
+		P[4].x += -dx1;
+		P[5].y += +dy1;
+		P[6].x += +dx1;
+		P[7].y += -dy1;
+
+		float4 L[4];
+
+		if(dx > -dy)
+		{
+			if(dx > dy)   // Right
+			{
+				L[0] = P[1];
+				L[1] = P[5];
+				L[2] = P[7];
+				L[3] = P[3];
+			}
+			else   // Down
+			{
+				L[0] = P[0];
+				L[1] = P[4];
+				L[2] = P[6];
+				L[3] = P[2];
+			}
+		}
+		else
+		{
+			if(dx > dy)   // Up
+			{
+				L[0] = P[0];
+				L[1] = P[2];
+				L[2] = P[6];
+				L[3] = P[4];
+			}
+			else   // Left
+			{
+				L[0] = P[1];
+				L[1] = P[3];
+				L[2] = P[7];
+				L[3] = P[5];
+			}
+		}
+
+		int C0 = Clipper::ComputeClipFlags(L[0]);
+		int C1 = Clipper::ComputeClipFlags(L[1]);
+		int C2 = Clipper::ComputeClipFlags(L[2]);
+		int C3 = Clipper::ComputeClipFlags(L[3]);
+
+		if((C0 & C1 & C2 & C3) == Clipper::CLIP_FINITE)
+		{
+			Polygon polygon(L, 4);
+
+			int clipFlagsOr = C0 | C1 | C2 | C3;
+
+			if(clipFlagsOr != Clipper::CLIP_FINITE)
+			{
+				if(!Clipper::Clip(polygon, clipFlagsOr, draw))
+				{
+					return false;
+				}
+			}
+
+			return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
+		}
+	}
+
+	return false;
+}
+
+bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+{
+	const DrawData &data = *draw.data;
+
+	Vertex &v = triangle.v0;
+
+	if(v.cullMask == 0)
+	{
 		return false;
 	}
 
-	void Renderer::addQuery(vk::Query *query)
+	float pSize = v.pointSize;
+
+	pSize = clamp(pSize, 1.0f, static_cast<float>(vk::MAX_POINT_SIZE));
+
+	float4 P[4];
+	int C[4];
+
+	P[0] = v.position;
+	P[1] = v.position;
+	P[2] = v.position;
+	P[3] = v.position;
+
+	const float X = pSize * P[0].w * data.halfPixelX[0];
+	const float Y = pSize * P[0].w * data.halfPixelY[0];
+
+	P[0].x -= X;
+	P[0].y += Y;
+	C[0] = Clipper::ComputeClipFlags(P[0]);
+
+	P[1].x += X;
+	P[1].y += Y;
+	C[1] = Clipper::ComputeClipFlags(P[1]);
+
+	P[2].x += X;
+	P[2].y -= Y;
+	C[2] = Clipper::ComputeClipFlags(P[2]);
+
+	P[3].x -= X;
+	P[3].y -= Y;
+	C[3] = Clipper::ComputeClipFlags(P[3]);
+
+	Polygon polygon(P, 4);
+
+	if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
 	{
-		ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
-		ASSERT(!occlusionQuery);
+		int clipFlagsOr = C[0] | C[1] | C[2] | C[3];
 
-		occlusionQuery = query;
-	}
-
-	void Renderer::removeQuery(vk::Query *query)
-	{
-		ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
-		ASSERT(occlusionQuery == query);
-
-		occlusionQuery = nullptr;
-	}
-
-	void Renderer::advanceInstanceAttributes(Stream* inputs)
-	{
-		for(uint32_t i = 0; i < vk::MAX_VERTEX_INPUT_BINDINGS; i++)
+		if(clipFlagsOr != Clipper::CLIP_FINITE)
 		{
-			auto &attrib = inputs[i];
-			if (attrib.count && attrib.instanceStride && (attrib.instanceStride < attrib.robustnessSize))
+			if(!Clipper::Clip(polygon, clipFlagsOr, draw))
 			{
-				// Under the casts: attrib.buffer += attrib.instanceStride
-				attrib.buffer = (void const *)((uintptr_t)attrib.buffer + attrib.instanceStride);
-				attrib.robustnessSize -= attrib.instanceStride;
+				return false;
 			}
 		}
+
+		triangle.v1 = triangle.v0;
+		triangle.v2 = triangle.v0;
+
+		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;
+
+		triangle.v1.projected.x += iround(subPixF * 0.5f * pSize);
+		triangle.v2.projected.y -= iround(subPixF * 0.5f * pSize) * (data.HxF[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
+		return draw.setupRoutine(&primitive, &triangle, &polygon, &data);
 	}
 
-	void Renderer::setViewport(const VkViewport &viewport)
-	{
-		this->viewport = viewport;
-	}
-
-	void Renderer::setScissor(const VkRect2D &scissor)
-	{
-		this->scissor = scissor;
-	}
-
+	return false;
 }
+
+void Renderer::addQuery(vk::Query *query)
+{
+	ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
+	ASSERT(!occlusionQuery);
+
+	occlusionQuery = query;
+}
+
+void Renderer::removeQuery(vk::Query *query)
+{
+	ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
+	ASSERT(occlusionQuery == query);
+
+	occlusionQuery = nullptr;
+}
+
+void Renderer::advanceInstanceAttributes(Stream* inputs)
+{
+	for(uint32_t i = 0; i < vk::MAX_VERTEX_INPUT_BINDINGS; i++)
+	{
+		auto &attrib = inputs[i];
+		if (attrib.count && attrib.instanceStride && (attrib.instanceStride < attrib.robustnessSize))
+		{
+			// Under the casts: attrib.buffer += attrib.instanceStride
+			attrib.buffer = (void const *)((uintptr_t)attrib.buffer + attrib.instanceStride);
+			attrib.robustnessSize -= attrib.instanceStride;
+		}
+	}
+}
+
+void Renderer::setViewport(const VkViewport &viewport)
+{
+	this->viewport = viewport;
+}
+
+void Renderer::setScissor(const VkRect2D &scissor)
+{
+	this->scissor = scissor;
+}
+
+}  // namespace sw

diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index ac38616..1598c11 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp

@@ -33,213 +33,214 @@
 #include <mutex>
 #include <thread>
 
-namespace vk
+namespace vk {
+
+class DescriptorSet;
+class Device;
+class Query;
+
+}  // namespace vk
+
+namespace sw {
+
+struct DrawCall;
+class PixelShader;
+class VertexShader;
+struct Task;
+class TaskEvents;
+class Resource;
+struct Constants;
+
+static constexpr int MaxBatchSize = 128;
+static constexpr int MaxBatchCount = 16;
+static constexpr int MaxClusterCount = 16;
+static constexpr int MaxDrawCount = 16;
+
+using TriangleBatch = std::array<Triangle, MaxBatchSize>;
+using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
+
+struct DrawData
 {
-	class DescriptorSet;
-	class Device;
-	class Query;
-}
+	const Constants *constants;
 
-namespace sw
+	vk::DescriptorSet::Bindings descriptorSets = {};
+	vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
+
+	const void *input[MAX_INTERFACE_COMPONENTS / 4];
+	unsigned int robustnessSize[MAX_INTERFACE_COMPONENTS / 4];
+	unsigned int stride[MAX_INTERFACE_COMPONENTS / 4];
+	const void *indices;
+
+	int instanceID;
+	int baseVertex;
+	float lineWidth;
+	int viewID;
+
+	PixelProcessor::Stencil stencil[2];   // clockwise, counterclockwise
+	PixelProcessor::Factor factor;
+	unsigned int occlusion[MaxClusterCount];   // Number of pixels passing depth test
+
+	float4 WxF;
+	float4 HxF;
+	float4 X0xF;
+	float4 Y0xF;
+	float4 halfPixelX;
+	float4 halfPixelY;
+	float viewportHeight;
+	float slopeDepthBias;
+	float depthRange;
+	float depthNear;
+
+	unsigned int *colorBuffer[RENDERTARGETS];
+	int colorPitchB[RENDERTARGETS];
+	int colorSliceB[RENDERTARGETS];
+	float *depthBuffer;
+	int depthPitchB;
+	int depthSliceB;
+	unsigned char *stencilBuffer;
+	int stencilPitchB;
+	int stencilSliceB;
+
+	int scissorX0;
+	int scissorX1;
+	int scissorY0;
+	int scissorY1;
+
+	float4 a2c0;
+	float4 a2c1;
+	float4 a2c2;
+	float4 a2c3;
+
+	PushConstantStorage pushConstants;
+};
+
+struct DrawCall
 {
-	struct DrawCall;
-	class PixelShader;
-	class VertexShader;
-	struct Task;
-	class TaskEvents;
-	class Resource;
-	struct Constants;
-
-	static constexpr int MaxBatchSize = 128;
-	static constexpr int MaxBatchCount = 16;
-	static constexpr int MaxClusterCount = 16;
-	static constexpr int MaxDrawCount = 16;
-
-	using TriangleBatch = std::array<Triangle, MaxBatchSize>;
-	using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
-
-	struct DrawData
+	struct BatchData
 	{
-		const Constants *constants;
+		using Pool = marl::BoundedPool<BatchData, MaxBatchCount, marl::PoolPolicy::Preserve>;
 
-		vk::DescriptorSet::Bindings descriptorSets = {};
-		vk::DescriptorSet::DynamicOffsets descriptorDynamicOffsets = {};
-
-		const void *input[MAX_INTERFACE_COMPONENTS / 4];
-		unsigned int robustnessSize[MAX_INTERFACE_COMPONENTS / 4];
-		unsigned int stride[MAX_INTERFACE_COMPONENTS / 4];
-		const void *indices;
-
-		int instanceID;
-		int baseVertex;
-		float lineWidth;
-		int viewID;
-
-		PixelProcessor::Stencil stencil[2];   // clockwise, counterclockwise
-		PixelProcessor::Factor factor;
-		unsigned int occlusion[MaxClusterCount];   // Number of pixels passing depth test
-
-		float4 WxF;
-		float4 HxF;
-		float4 X0xF;
-		float4 Y0xF;
-		float4 halfPixelX;
-		float4 halfPixelY;
-		float viewportHeight;
-		float slopeDepthBias;
-		float depthRange;
-		float depthNear;
-
-		unsigned int *colorBuffer[RENDERTARGETS];
-		int colorPitchB[RENDERTARGETS];
-		int colorSliceB[RENDERTARGETS];
-		float *depthBuffer;
-		int depthPitchB;
-		int depthSliceB;
-		unsigned char *stencilBuffer;
-		int stencilPitchB;
-		int stencilSliceB;
-
-		int scissorX0;
-		int scissorX1;
-		int scissorY0;
-		int scissorY1;
-
-		float4 a2c0;
-		float4 a2c1;
-		float4 a2c2;
-		float4 a2c3;
-
-		PushConstantStorage pushConstants;
-	};
-
-	struct DrawCall
-	{
-		struct BatchData
-		{
-			using Pool = marl::BoundedPool<BatchData, MaxBatchCount, marl::PoolPolicy::Preserve>;
-
-			TriangleBatch triangles;
-			PrimitiveBatch primitives;
-			VertexTask vertexTask;
-			unsigned int id;
-			unsigned int firstPrimitive;
-			unsigned int numPrimitives;
-			int numVisible;
-			marl::Ticket clusterTickets[MaxClusterCount];
-		};
-
-		using Pool = marl::BoundedPool<DrawCall, MaxDrawCount, marl::PoolPolicy::Preserve>;
-		using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-
-		DrawCall();
-		~DrawCall();
-
-		static void run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount]);
-		static void processVertices(DrawCall* draw, BatchData* batch);
-		static void processPrimitives(DrawCall* draw, BatchData* batch);
-		static void processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally);
-		void setup();
-		void teardown();
-
-		int id;
-
-		BatchData::Pool *batchDataPool;
+		TriangleBatch triangles;
+		PrimitiveBatch primitives;
+		VertexTask vertexTask;
+		unsigned int id;
+		unsigned int firstPrimitive;
 		unsigned int numPrimitives;
-		unsigned int numPrimitivesPerBatch;
-		unsigned int numBatches;
-
-		VkPrimitiveTopology topology;
-		VkProvokingVertexModeEXT provokingVertexMode;
-		VkIndexType indexType;
-		VkLineRasterizationModeEXT lineRasterizationMode;
-
-		VertexProcessor::RoutineType vertexRoutine;
-		SetupProcessor::RoutineType setupRoutine;
-		PixelProcessor::RoutineType pixelRoutine;
-
-		SetupFunction setupPrimitives;
-		SetupProcessor::State setupState;
-
-		vk::ImageView *renderTarget[RENDERTARGETS];
-		vk::ImageView *depthBuffer;
-		vk::ImageView *stencilBuffer;
-		TaskEvents *events;
-
-		vk::Query* occlusionQuery;
-
-		DrawData *data;
-
-		static void processPrimitiveVertices(
-				unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
-				const void *primitiveIndices,
-				VkIndexType indexType,
-				unsigned int start,
-				unsigned int triangleCount,
-				VkPrimitiveTopology topology,
-				VkProvokingVertexModeEXT provokingVertexMode);
-
-		static int setupSolidTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
-		static int setupWireframeTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
-		static int setupPointTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
-		static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-		static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-
-		static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
-		static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+		int numVisible;
+		marl::Ticket clusterTickets[MaxClusterCount];
 	};
 
-	class alignas(16) Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
-	{
-	public:
-		Renderer(vk::Device* device);
+	using Pool = marl::BoundedPool<DrawCall, MaxDrawCount, marl::PoolPolicy::Preserve>;
+	using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
 
-		virtual ~Renderer();
+	DrawCall();
+	~DrawCall();
 
-		void* operator new(size_t size);
-		void operator delete(void* mem);
+	static void run(const marl::Loan<DrawCall>& draw, marl::Ticket::Queue* tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount]);
+	static void processVertices(DrawCall* draw, BatchData* batch);
+	static void processPrimitives(DrawCall* draw, BatchData* batch);
+	static void processPixels(const marl::Loan<DrawCall>& draw, const marl::Loan<BatchData>& batch, const std::shared_ptr<marl::Finally>& finally);
+	void setup();
+	void teardown();
 
-		bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
+	int id;
 
-		void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
-				TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
-				PushConstantStorage const & pushConstants, bool update = true);
+	BatchData::Pool *batchDataPool;
+	unsigned int numPrimitives;
+	unsigned int numPrimitivesPerBatch;
+	unsigned int numBatches;
 
-		// Viewport & Clipper
-		void setViewport(const VkViewport &viewport);
-		void setScissor(const VkRect2D &scissor);
+	VkPrimitiveTopology topology;
+	VkProvokingVertexModeEXT provokingVertexMode;
+	VkIndexType indexType;
+	VkLineRasterizationModeEXT lineRasterizationMode;
 
-		void addQuery(vk::Query *query);
-		void removeQuery(vk::Query *query);
+	VertexProcessor::RoutineType vertexRoutine;
+	SetupProcessor::RoutineType setupRoutine;
+	PixelProcessor::RoutineType pixelRoutine;
 
-		void advanceInstanceAttributes(Stream* inputs);
+	SetupFunction setupPrimitives;
+	SetupProcessor::State setupState;
 
-		void synchronize();
+	vk::ImageView *renderTarget[RENDERTARGETS];
+	vk::ImageView *depthBuffer;
+	vk::ImageView *stencilBuffer;
+	TaskEvents *events;
 
-	private:
-		VkViewport viewport;
-		VkRect2D scissor;
+	vk::Query* occlusionQuery;
 
-		DrawCall::Pool drawCallPool;
-		DrawCall::BatchData::Pool batchDataPool;
+	DrawData *data;
 
-		std::atomic<int> nextDrawID = {0};
+	static void processPrimitiveVertices(
+			unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+			const void *primitiveIndices,
+			VkIndexType indexType,
+			unsigned int start,
+			unsigned int triangleCount,
+			VkPrimitiveTopology topology,
+			VkProvokingVertexModeEXT provokingVertexMode);
 
-		vk::Query *occlusionQuery = nullptr;
-		marl::Ticket::Queue drawTickets;
-		marl::Ticket::Queue clusterQueues[MaxClusterCount];
+	static int setupSolidTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
+	static int setupWireframeTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
+	static int setupPointTriangles(Triangle* triangles, Primitive* primitives, const DrawCall* drawCall, int count);
+	static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+	static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
 
-		VertexProcessor::State vertexState;
-		SetupProcessor::State setupState;
-		PixelProcessor::State pixelState;
+	static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+	static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+};
 
-		VertexProcessor::RoutineType vertexRoutine;
-		SetupProcessor::RoutineType setupRoutine;
-		PixelProcessor::RoutineType pixelRoutine;
+class alignas(16) Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
+{
+public:
+	Renderer(vk::Device* device);
 
-		vk::Device* device;
-	};
+	virtual ~Renderer();
 
-}
+	void* operator new(size_t size);
+	void operator delete(void* mem);
+
+	bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
+
+	void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
+			TaskEvents *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D& framebufferExtent,
+			PushConstantStorage const & pushConstants, bool update = true);
+
+	// Viewport & Clipper
+	void setViewport(const VkViewport &viewport);
+	void setScissor(const VkRect2D &scissor);
+
+	void addQuery(vk::Query *query);
+	void removeQuery(vk::Query *query);
+
+	void advanceInstanceAttributes(Stream* inputs);
+
+	void synchronize();
+
+private:
+	VkViewport viewport;
+	VkRect2D scissor;
+
+	DrawCall::Pool drawCallPool;
+	DrawCall::BatchData::Pool batchDataPool;
+
+	std::atomic<int> nextDrawID = {0};
+
+	vk::Query *occlusionQuery = nullptr;
+	marl::Ticket::Queue drawTickets;
+	marl::Ticket::Queue clusterQueues[MaxClusterCount];
+
+	VertexProcessor::State vertexState;
+	SetupProcessor::State setupState;
+	PixelProcessor::State pixelState;
+
+	VertexProcessor::RoutineType vertexRoutine;
+	SetupProcessor::RoutineType setupRoutine;
+	PixelProcessor::RoutineType pixelRoutine;
+
+	vk::Device* device;
+};
+
+}  // namespace sw
 
 #endif   // sw_Renderer_hpp

diff --git a/src/Device/RoutineCache.hpp b/src/Device/RoutineCache.hpp
index b015c3b..9bde0d5 100644
--- a/src/Device/RoutineCache.hpp
+++ b/src/Device/RoutineCache.hpp

@@ -19,15 +19,16 @@
 
 #include "Reactor/Reactor.hpp"
 
-namespace sw
-{
-	using namespace rr;
+namespace sw {
 
-	template<class State>
-	using RoutineCache = LRUCache<State, std::shared_ptr<Routine>>;
+using namespace rr;
 
-	template<class State, class FunctionType>
-	using RoutineCacheT = LRUCache<State, RoutineT<FunctionType>>;
+template<class State>
+using RoutineCache = LRUCache<State, std::shared_ptr<Routine>>;
+
+template<class State, class FunctionType>
+using RoutineCacheT = LRUCache<State, RoutineT<FunctionType>>;
+
 }
 
 #endif   // sw_RoutineCache_hpp

diff --git a/src/Device/Sampler.hpp b/src/Device/Sampler.hpp
index 0c0bd9d..a836ce2 100644
--- a/src/Device/Sampler.hpp
+++ b/src/Device/Sampler.hpp

@@ -20,100 +20,98 @@
 #include "System/Types.hpp"
 #include "Vulkan/VkFormat.h"
 
-namespace vk
+namespace vk { class Image; }
+
+namespace sw {
+
+struct Mipmap
 {
-	class Image;
-}
+	const void *buffer;
 
-namespace sw
+	short4 uHalf;
+	short4 vHalf;
+	short4 wHalf;
+	int4 width;
+	int4 height;
+	int4 depth;
+	short4 onePitchP;
+	int4 pitchP;
+	int4 sliceP;
+	int4 samplePitchP;
+	int4 sampleMax;
+};
+
+struct Texture
 {
-	struct Mipmap
-	{
-		const void *buffer;
+	Mipmap mipmap[MIPMAP_LEVELS];
 
-		short4 uHalf;
-		short4 vHalf;
-		short4 wHalf;
-		int4 width;
-		int4 height;
-		int4 depth;
-		short4 onePitchP;
-		int4 pitchP;
-		int4 sliceP;
-		int4 samplePitchP;
-		int4 sampleMax;
-	};
+	float4 widthWidthHeightHeight;
+	float4 width;
+	float4 height;
+	float4 depth;
+};
 
-	struct Texture
-	{
-		Mipmap mipmap[MIPMAP_LEVELS];
+enum FilterType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+{
+	FILTER_POINT,
+	FILTER_GATHER,
+	FILTER_MIN_POINT_MAG_LINEAR,
+	FILTER_MIN_LINEAR_MAG_POINT,
+	FILTER_LINEAR,
+	FILTER_ANISOTROPIC,
 
-		float4 widthWidthHeightHeight;
-		float4 width;
-		float4 height;
-		float4 depth;
-	};
+	FILTER_LAST = FILTER_ANISOTROPIC
+};
 
-	enum FilterType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		FILTER_POINT,
-		FILTER_GATHER,
-		FILTER_MIN_POINT_MAG_LINEAR,
-		FILTER_MIN_LINEAR_MAG_POINT,
-		FILTER_LINEAR,
-		FILTER_ANISOTROPIC,
+enum MipmapType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+{
+	MIPMAP_NONE,
+	MIPMAP_POINT,
+	MIPMAP_LINEAR,
 
-		FILTER_LAST = FILTER_ANISOTROPIC
-	};
+	MIPMAP_LAST = MIPMAP_LINEAR
+};
 
-	enum MipmapType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		MIPMAP_NONE,
-		MIPMAP_POINT,
-		MIPMAP_LINEAR,
+enum AddressingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+{
+	ADDRESSING_UNUSED,
+	ADDRESSING_WRAP,
+	ADDRESSING_CLAMP,
+	ADDRESSING_MIRROR,
+	ADDRESSING_MIRRORONCE,
+	ADDRESSING_BORDER,     // Single color
+	ADDRESSING_SEAMLESS,   // Border of pixels
+	ADDRESSING_CUBEFACE,   // Cube face layer
+	ADDRESSING_LAYER,      // Array layer
+	ADDRESSING_TEXELFETCH,
 
-		MIPMAP_LAST = MIPMAP_LINEAR
-	};
+	ADDRESSING_LAST = ADDRESSING_TEXELFETCH
+};
 
-	enum AddressingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		ADDRESSING_UNUSED,
-		ADDRESSING_WRAP,
-		ADDRESSING_CLAMP,
-		ADDRESSING_MIRROR,
-		ADDRESSING_MIRRORONCE,
-		ADDRESSING_BORDER,     // Single color
-		ADDRESSING_SEAMLESS,   // Border of pixels
-		ADDRESSING_CUBEFACE,   // Cube face layer
-		ADDRESSING_LAYER,      // Array layer
-		ADDRESSING_TEXELFETCH,
+struct Sampler
+{
+	VkImageViewType textureType;
+	vk::Format textureFormat;
+	FilterType textureFilter;
+	AddressingMode addressingModeU;
+	AddressingMode addressingModeV;
+	AddressingMode addressingModeW;
+	AddressingMode addressingModeY;
+	MipmapType mipmapFilter;
+	VkComponentMapping swizzle;
+	int gatherComponent;
+	bool highPrecisionFiltering;
+	bool compareEnable;
+	VkCompareOp compareOp;
+	VkBorderColor border;
+	bool unnormalizedCoordinates;
+	bool largeTexture;
 
-		ADDRESSING_LAST = ADDRESSING_TEXELFETCH
-	};
+	VkSamplerYcbcrModelConversion ycbcrModel;
+	bool studioSwing;    // Narrow range
+	bool swappedChroma;  // Cb/Cr components in reverse order
+};
 
-	struct Sampler
-	{
-		VkImageViewType textureType;
-		vk::Format textureFormat;
-		FilterType textureFilter;
-		AddressingMode addressingModeU;
-		AddressingMode addressingModeV;
-		AddressingMode addressingModeW;
-		AddressingMode addressingModeY;
-		MipmapType mipmapFilter;
-		VkComponentMapping swizzle;
-		int gatherComponent;
-		bool highPrecisionFiltering;
-		bool compareEnable;
-		VkCompareOp compareOp;
-		VkBorderColor border;
-		bool unnormalizedCoordinates;
-		bool largeTexture;
-
-		VkSamplerYcbcrModelConversion ycbcrModel;
-		bool studioSwing;    // Narrow range
-		bool swappedChroma;  // Cb/Cr components in reverse order
-	};
-}
+}  // namespace sw
 
 #endif   // sw_Sampler_hpp

diff --git a/src/Device/SetupProcessor.cpp b/src/Device/SetupProcessor.cpp
index 69371ab..df55e1a 100644
--- a/src/Device/SetupProcessor.cpp
+++ b/src/Device/SetupProcessor.cpp

@@ -25,98 +25,99 @@
 
 #include <cstring>
 
-namespace sw
+namespace sw {
+
+uint32_t SetupProcessor::States::computeHash()
 {
-	uint32_t SetupProcessor::States::computeHash()
+	uint32_t *state = reinterpret_cast<uint32_t*>(this);
+	uint32_t hash = 0;
+
+	for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
 	{
-		uint32_t *state = reinterpret_cast<uint32_t*>(this);
-		uint32_t hash = 0;
-
-		for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
-		{
-			hash ^= state[i];
-		}
-
-		return hash;
+		hash ^= state[i];
 	}
 
-	bool SetupProcessor::State::operator==(const State &state) const
-	{
-		if(hash != state.hash)
-		{
-			return false;
-		}
-
-		static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
-		return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
-	}
-
-	SetupProcessor::SetupProcessor()
-	{
-		routineCache = nullptr;
-		setRoutineCacheSize(1024);
-	}
-
-	SetupProcessor::~SetupProcessor()
-	{
-		delete routineCache;
-		routineCache = nullptr;
-	}
-
-	SetupProcessor::State SetupProcessor::update(const sw::Context* context) const
-	{
-		State state;
-
-		bool vPosZW = (context->pixelShader && context->pixelShader->hasBuiltinInput(spv::BuiltInFragCoord));
-
-		state.isDrawPoint = context->isDrawPoint(true);
-		state.isDrawLine = context->isDrawLine(true);
-		state.isDrawTriangle = context->isDrawTriangle(true);
-		state.applySlopeDepthBias = context->isDrawTriangle(false) && (context->slopeDepthBias != 0.0f);
-		state.interpolateZ = context->depthBufferActive() || vPosZW;
-		state.interpolateW = context->pixelShader != nullptr;
-		state.frontFace = context->frontFace;
-		state.cullMode = context->cullMode;
-
-		state.multiSample = context->sampleCount;
-		state.rasterizerDiscard = context->rasterizerDiscard;
-
-		state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
-		state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
-
-		if (context->pixelShader)
-		{
-			for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
-			{
-				state.gradient[interpolant] = context->pixelShader->inputs[interpolant];
-			}
-		}
-
-		state.hash = state.computeHash();
-
-		return state;
-	}
-
-	SetupProcessor::RoutineType SetupProcessor::routine(const State &state)
-	{
-		auto routine = routineCache->query(state);
-
-		if(!routine)
-		{
-			SetupRoutine *generator = new SetupRoutine(state);
-			generator->generate();
-			routine = generator->getRoutine();
-			delete generator;
-
-			routineCache->add(state, routine);
-		}
-
-		return routine;
-	}
-
-	void SetupProcessor::setRoutineCacheSize(int cacheSize)
-	{
-		delete routineCache;
-		routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
-	}
+	return hash;
 }
+
+bool SetupProcessor::State::operator==(const State &state) const
+{
+	if(hash != state.hash)
+	{
+		return false;
+	}
+
+	static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
+	return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
+}
+
+SetupProcessor::SetupProcessor()
+{
+	routineCache = nullptr;
+	setRoutineCacheSize(1024);
+}
+
+SetupProcessor::~SetupProcessor()
+{
+	delete routineCache;
+	routineCache = nullptr;
+}
+
+SetupProcessor::State SetupProcessor::update(const sw::Context* context) const
+{
+	State state;
+
+	bool vPosZW = (context->pixelShader && context->pixelShader->hasBuiltinInput(spv::BuiltInFragCoord));
+
+	state.isDrawPoint = context->isDrawPoint(true);
+	state.isDrawLine = context->isDrawLine(true);
+	state.isDrawTriangle = context->isDrawTriangle(true);
+	state.applySlopeDepthBias = context->isDrawTriangle(false) && (context->slopeDepthBias != 0.0f);
+	state.interpolateZ = context->depthBufferActive() || vPosZW;
+	state.interpolateW = context->pixelShader != nullptr;
+	state.frontFace = context->frontFace;
+	state.cullMode = context->cullMode;
+
+	state.multiSample = context->sampleCount;
+	state.rasterizerDiscard = context->rasterizerDiscard;
+
+	state.numClipDistances = context->vertexShader->getNumOutputClipDistances();
+	state.numCullDistances = context->vertexShader->getNumOutputCullDistances();
+
+	if (context->pixelShader)
+	{
+		for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
+		{
+			state.gradient[interpolant] = context->pixelShader->inputs[interpolant];
+		}
+	}
+
+	state.hash = state.computeHash();
+
+	return state;
+}
+
+SetupProcessor::RoutineType SetupProcessor::routine(const State &state)
+{
+	auto routine = routineCache->query(state);
+
+	if(!routine)
+	{
+		SetupRoutine *generator = new SetupRoutine(state);
+		generator->generate();
+		routine = generator->getRoutine();
+		delete generator;
+
+		routineCache->add(state, routine);
+	}
+
+	return routine;
+}
+
+void SetupProcessor::setRoutineCacheSize(int cacheSize)
+{
+	delete routineCache;
+	routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
+}
+
+}  // namespace sw

diff --git a/src/Device/SetupProcessor.hpp b/src/Device/SetupProcessor.hpp
index 683c93c..4b6d5a8 100644
--- a/src/Device/SetupProcessor.hpp
+++ b/src/Device/SetupProcessor.hpp

@@ -21,65 +21,66 @@
 #include "RoutineCache.hpp"
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+struct Primitive;
+struct Triangle;
+struct Polygon;
+struct Vertex;
+struct DrawCall;
+struct DrawData;
+
+using SetupFunction = FunctionT<int(Primitive* primitive, const Triangle* triangle, const Polygon* polygon, const DrawData* draw)>;
+
+class SetupProcessor
 {
-	struct Primitive;
-	struct Triangle;
-	struct Polygon;
-	struct Vertex;
-	struct DrawCall;
-	struct DrawData;
-
-	using SetupFunction = FunctionT<int(Primitive* primitive, const Triangle* triangle, const Polygon* polygon, const DrawData* draw)>;
-
-	class SetupProcessor
+public:
+	struct States : Memset<States>
 	{
-	public:
-		struct States : Memset<States>
-		{
-			States() : Memset(this, 0) {}
+		States() : Memset(this, 0) {}
 
-			uint32_t computeHash();
+		uint32_t computeHash();
 
-			bool isDrawPoint               : 1;
-			bool isDrawLine                : 1;
-			bool isDrawTriangle            : 1;
-			bool applySlopeDepthBias       : 1;
-			bool interpolateZ              : 1;
-			bool interpolateW              : 1;
-			VkFrontFace frontFace          : BITS(VK_FRONT_FACE_MAX_ENUM);
-			VkCullModeFlags cullMode       : BITS(VK_CULL_MODE_FLAG_BITS_MAX_ENUM);
-			unsigned int multiSample       : 3;   // 1, 2 or 4
-			bool rasterizerDiscard         : 1;
-			unsigned int numClipDistances  : 4; // [0 - 8]
-			unsigned int numCullDistances  : 4; // [0 - 8]
+		bool isDrawPoint               : 1;
+		bool isDrawLine                : 1;
+		bool isDrawTriangle            : 1;
+		bool applySlopeDepthBias       : 1;
+		bool interpolateZ              : 1;
+		bool interpolateW              : 1;
+		VkFrontFace frontFace          : BITS(VK_FRONT_FACE_MAX_ENUM);
+		VkCullModeFlags cullMode       : BITS(VK_CULL_MODE_FLAG_BITS_MAX_ENUM);
+		unsigned int multiSample       : 3;   // 1, 2 or 4
+		bool rasterizerDiscard         : 1;
+		unsigned int numClipDistances  : 4; // [0 - 8]
+		unsigned int numCullDistances  : 4; // [0 - 8]
 
-			SpirvShader::InterfaceComponent gradient[MAX_INTERFACE_COMPONENTS];
-		};
-
-		struct State : States
-		{
-			bool operator==(const State &states) const;
-
-			uint32_t hash;
-		};
-
-		using RoutineType = SetupFunction::RoutineType;
-
-		SetupProcessor();
-
-		~SetupProcessor();
-
-	protected:
-		State update(const sw::Context* context) const;
-		RoutineType routine(const State &state);
-
-		void setRoutineCacheSize(int cacheSize);
-
-	private:
-		using RoutineCacheType = RoutineCacheT<State, SetupFunction::CFunctionType>;
-		RoutineCacheType *routineCache;
+		SpirvShader::InterfaceComponent gradient[MAX_INTERFACE_COMPONENTS];
 	};
-}
+
+	struct State : States
+	{
+		bool operator==(const State &states) const;
+
+		uint32_t hash;
+	};
+
+	using RoutineType = SetupFunction::RoutineType;
+
+	SetupProcessor();
+
+	~SetupProcessor();
+
+protected:
+	State update(const sw::Context* context) const;
+	RoutineType routine(const State &state);
+
+	void setRoutineCacheSize(int cacheSize);
+
+private:
+	using RoutineCacheType = RoutineCacheT<State, SetupFunction::CFunctionType>;
+	RoutineCacheType *routineCache;
+};
+
+}  // namespace sw
 
 #endif   // sw_SetupProcessor_hpp

diff --git a/src/Device/Stream.hpp b/src/Device/Stream.hpp
index b6bb56c..f83d97a 100644
--- a/src/Device/Stream.hpp
+++ b/src/Device/Stream.hpp

@@ -17,37 +17,38 @@
 
 #include "System/Types.hpp"
 
-namespace sw
+namespace sw {
+
+enum StreamType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
 {
-	enum StreamType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
-	{
-		STREAMTYPE_COLOR,     // 4 normalized unsigned bytes, ZYXW order
-		STREAMTYPE_FLOAT,     // Normalization ignored
-		STREAMTYPE_BYTE,
-		STREAMTYPE_SBYTE,
-		STREAMTYPE_SHORT,
-		STREAMTYPE_USHORT,
-		STREAMTYPE_INT,
-		STREAMTYPE_UINT,
-		STREAMTYPE_HALF,      // Normalization ignored
-		STREAMTYPE_2_10_10_10_INT,
-		STREAMTYPE_2_10_10_10_UINT,
+	STREAMTYPE_COLOR,     // 4 normalized unsigned bytes, ZYXW order
+	STREAMTYPE_FLOAT,     // Normalization ignored
+	STREAMTYPE_BYTE,
+	STREAMTYPE_SBYTE,
+	STREAMTYPE_SHORT,
+	STREAMTYPE_USHORT,
+	STREAMTYPE_INT,
+	STREAMTYPE_UINT,
+	STREAMTYPE_HALF,      // Normalization ignored
+	STREAMTYPE_2_10_10_10_INT,
+	STREAMTYPE_2_10_10_10_UINT,
 
-		STREAMTYPE_LAST = STREAMTYPE_2_10_10_10_UINT
-	};
+	STREAMTYPE_LAST = STREAMTYPE_2_10_10_10_UINT
+};
 
-	struct Stream
-	{
-		const void *buffer = nullptr;
-		unsigned int robustnessSize = 0;
-		unsigned int vertexStride = 0;
-		unsigned int instanceStride = 0;
-		StreamType type = STREAMTYPE_FLOAT;
-		unsigned char count = 0;
-		bool normalized = false;
-		unsigned int offset = 0;
-		unsigned int binding = 0;
-	};
-}
+struct Stream   // Plain aggregate; every member carries a default initializer.
+{
+	const void *buffer = nullptr;
+	unsigned int robustnessSize = 0;
+	unsigned int vertexStride = 0;
+	unsigned int instanceStride = 0;
+	StreamType type = STREAMTYPE_FLOAT;
+	unsigned char count = 0;   // NOTE(review): presumably components per attribute -- confirm against users.
+	bool normalized = false;
+	unsigned int offset = 0;
+	unsigned int binding = 0;
+};
+
+}  // namespace sw
 
 #endif   // sw_Stream_hpp

diff --git a/src/Device/Triangle.hpp b/src/Device/Triangle.hpp
index 8a91fab..7cb4055 100644
--- a/src/Device/Triangle.hpp
+++ b/src/Device/Triangle.hpp

@@ -17,14 +17,15 @@
 
 #include "Vertex.hpp"
 
-namespace sw
+namespace sw {
+
+struct Triangle
 {
-	struct Triangle
-	{
-		Vertex V0;
-		Vertex V1;
-		Vertex V2;
-	};
-}
+	Vertex V0;
+	Vertex V1;
+	Vertex V2;
+};
+
+}  // namespace sw
 
 #endif   // sw_Triangle_hpp

diff --git a/src/Device/Vector.cpp b/src/Device/Vector.cpp
index b58f15e..511b51f 100644
--- a/src/Device/Vector.cpp
+++ b/src/Device/Vector.cpp

@@ -17,159 +17,160 @@
 #include "Matrix.hpp"
 #include "System/Math.hpp"
 
-namespace sw
+namespace sw {
+
+Vector Vector::operator+() const
 {
-	Vector Vector::operator+() const
-	{
-		return *this;
-	}
-
-	Vector Vector::operator-() const
-	{
-		return Vector(-x, -y, -z);
-	}
-
-	Vector &Vector::operator+=(const Vector &v)
-	{
-		x += v.x;
-		y += v.y;
-		z += v.z;
-
-		return *this;
-	}
-
-	Vector &Vector::operator-=(const Vector &v)
-	{
-		x -= v.x;
-		y -= v.y;
-		z -= v.z;
-
-		return *this;
-	}
-
-	Vector &Vector::operator*=(float s)
-	{
-		x *= s;
-		y *= s;
-		z *= s;
-
-		return *this;
-	}
-
-	Vector &Vector::operator/=(float s)
-	{
-		float r = 1.0f / s;
-
-		return *this *= r;
-	}
-
-	bool operator==(const Vector &U, const Vector &v)
-	{
-		if(U.x == v.x && U.y == v.y && U.z == v.z)
-			return true;
-		else
-			return false;
-	}
-
-	bool operator!=(const Vector &U, const Vector &v)
-	{
-		if(U.x != v.x || U.y != v.y || U.z != v.z)
-			return true;
-		else
-			return false;
-	}
-
-	bool operator>(const Vector &u, const Vector &v)
-	{
-		if((u^2) > (v^2))
-			return true;
-		else
-			return false;
-	}
-
-	bool operator<(const Vector &u, const Vector &v)
-	{
-		if((u^2) < (v^2))
-			return true;
-		else
-			return false;
-	}
-
-	Vector operator+(const Vector &u, const Vector &v)
-	{
-		return Vector(u.x + v.x, u.y + v.y, u.z + v.z);
-	}
-
-	Vector operator-(const Vector &u, const Vector &v)
-	{
-		return Vector(u.x - v.x, u.y - v.y, u.z - v.z);
-	}
-
-	float operator*(const Vector &u, const Vector &v)
-	{
-		return u.x * v.x + u.y * v.y + u.z * v.z;
-	}
-
-	Vector operator*(float s, const Vector &v)
-	{
-		return Vector(s * v.x, s * v.y, s * v.z);
-	}
-
-	Vector operator*(const Vector &v, float s)
-	{
-		return Vector(v.x * s, v.y * s, v.z * s);
-	}
-
-	Vector operator/(const Vector &v, float s)
-	{
-		float r = 1.0f / s;
-
-		return Vector(v.x * r, v.y * r, v.z * r);
-	}
-
-	float operator^(const Vector &u, const Vector &v)
-	{
-		return acos(u / Vector::N(u) * v / Vector::N(v));
-	}
-
-	Vector operator%(const Vector &u, const Vector &v)
-	{
-		return Vector(u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x);
-	}
-
-	Vector operator*(const Matrix &M, const Vector &v)
-	{
-		return Vector(M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z,
-		              M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z,
-		              M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z);
-	}
-
-	Vector operator*(const Vector &v, const Matrix &M)
-	{
-		return Vector(v.x * M(1, 1) + v.y * M(2, 1) + v.z * M(3, 1) + M(4, 1),
-		              v.x * M(1, 2) + v.y * M(2, 2) + v.z * M(3, 2) + M(4, 2),
-		              v.x * M(1, 3) + v.y * M(2, 3) + v.z * M(3, 3) + M(4, 3));
-	}
-
-	Vector &operator*=(Vector &v, const Matrix &M)
-	{
-		return v = v * M;
-	}
-
-	float Vector::N(const Vector &v)
-	{
-		return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-	}
-
-	float Vector::N2(const Vector &v)
-	{
-		return v.x*v.x + v.y*v.y + v.z*v.z;
-	}
-
-	Vector lerp(const Vector &u, const Vector &v, float t)
-	{
-		return Vector(u.x + t * (v.x - u.x),
-		              u.y + t * (v.y - u.y),
-		              u.z + t * (v.z - u.z));
-	}
+	return *this;
 }
+
+Vector Vector::operator-() const
+{
+	return Vector(-x, -y, -z);   // New vector with every component negated; *this is untouched.
+}
+
+Vector &Vector::operator+=(const Vector &v)
+{
+	x += v.x;   // Component-wise, in place.
+	y += v.y;
+	z += v.z;
+
+	return *this;   // Returns the modified left operand.
+}
+
+Vector &Vector::operator-=(const Vector &v)
+{
+	x -= v.x;   // Component-wise, in place.
+	y -= v.y;
+	z -= v.z;
+
+	return *this;   // Returns the modified left operand.
+}
+
+Vector &Vector::operator*=(float s)
+{
+	x *= s;   // Uniform scale of all three components, in place.
+	y *= s;
+	z *= s;
+
+	return *this;   // Returns the modified left operand.
+}
+
+Vector &Vector::operator/=(float s)
+{
+	float r = 1.0f / s;   // Reciprocal is computed once; s == 0 is not guarded here.
+
+	return *this *= r;   // Delegates to operator*=(float).
+}
+
+bool operator==(const Vector &U, const Vector &v)
+{
+	// Exact component-wise float comparison; no tolerance is applied.
+	return U.x == v.x &&
+	       U.y == v.y &&
+	       U.z == v.z;
+}
+
+bool operator!=(const Vector &U, const Vector &v)
+{
+	// True as soon as any one component differs (exact float comparison).
+	return U.x != v.x ||
+	       U.y != v.y ||
+	       U.z != v.z;
+}
+
+bool operator>(const Vector &u, const Vector &v)
+{
+	// `u^2` appears to resolve to operator^(const Vector &, const Vector &) with
+	// 2 converted through Vector(const int), i.e. the angle to (2, 2, 2).
+	// NOTE(review): confirm this is intended rather than a squared norm (N2).
+	return (u^2) > (v^2);
+}
+
+bool operator<(const Vector &u, const Vector &v)
+{
+	// `u^2` appears to resolve to operator^(const Vector &, const Vector &) with
+	// 2 converted through Vector(const int), i.e. the angle to (2, 2, 2).
+	// NOTE(review): confirm this is intended rather than a squared norm (N2).
+	return (u^2) < (v^2);
+}
+
+Vector operator+(const Vector &u, const Vector &v)
+{
+	return Vector(u.x + v.x, u.y + v.y, u.z + v.z);   // Component-wise sum, returned by value.
+}
+
+Vector operator-(const Vector &u, const Vector &v)
+{
+	return Vector(u.x - v.x, u.y - v.y, u.z - v.z);   // Component-wise difference u - v, returned by value.
+}
+
+float operator*(const Vector &u, const Vector &v)
+{
+	return u.x * v.x + u.y * v.y + u.z * v.z;   // Dot product: sum of the component-wise products.
+}
+
+Vector operator*(float s, const Vector &v)
+{
+	return Vector(s * v.x, s * v.y, s * v.z);   // Scalar on the left: each component scaled by s.
+}
+
+Vector operator*(const Vector &v, float s)
+{
+	return Vector(v.x * s, v.y * s, v.z * s);   // Scalar on the right: each component scaled by s.
+}
+
+Vector operator/(const Vector &v, float s)
+{
+	float r = 1.0f / s;   // Reciprocal is computed once and multiplied in; s == 0 is not guarded here.
+
+	return Vector(v.x * r, v.y * r, v.z * r);
+}
+
+float operator^(const Vector &u, const Vector &v)
+{
+	return acos(u / Vector::N(u) * v / Vector::N(v));   // Evaluates as acos(dot(u / |u|, v) / |v|); zero-length inputs are not guarded.
+}
+
+Vector operator%(const Vector &u, const Vector &v)
+{
+	return Vector(u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x);   // Cross product u x v.
+}
+
+Vector operator*(const Matrix &M, const Vector &v)
+{
+	return Vector(M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z,    // Row 1 of M dotted with v.
+	              M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z,    // Row 2 of M dotted with v.
+	              M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z);   // Row 3; no M(_, 4) term is read in this overload.
+}
+
+Vector operator*(const Vector &v, const Matrix &M)
+{
+	return Vector(v.x * M(1, 1) + v.y * M(2, 1) + v.z * M(3, 1) + M(4, 1),    // v dotted with column 1, plus M(4, 1).
+	              v.x * M(1, 2) + v.y * M(2, 2) + v.z * M(3, 2) + M(4, 2),    // v dotted with column 2, plus M(4, 2).
+	              v.x * M(1, 3) + v.y * M(2, 3) + v.z * M(3, 3) + M(4, 3));   // Unlike M * v, row 4 of M is added in here.
+}
+
+Vector &operator*=(Vector &v, const Matrix &M)
+{
+	return v = v * M;   // Applies operator*(const Vector &, const Matrix &) and stores the result back into v.
+}
+
+float Vector::N(const Vector &v)
+{
+	return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);   // Euclidean length: square root of the sum of squared components.
+}
+
+float Vector::N2(const Vector &v)
+{
+	return v.x*v.x + v.y*v.y + v.z*v.z;   // Squared length; same sum as N() without the sqrt.
+}
+
+Vector lerp(const Vector &u, const Vector &v, float t)
+{
+	return Vector(u.x + t * (v.x - u.x),    // Per component: u + t * (v - u); t is not clamped.
+	              u.y + t * (v.y - u.y),    // NOTE(review): this defines a namespace-scope lerp, while Vector.hpp
+	              u.z + t * (v.z - u.z));   // declares a static member Vector::lerp -- confirm which one is intended.
+}
+
+}  // namespace sw

diff --git a/src/Device/Vector.hpp b/src/Device/Vector.hpp
index e7f261d..0df6f5a 100644
--- a/src/Device/Vector.hpp
+++ b/src/Device/Vector.hpp

@@ -15,139 +15,143 @@
 #ifndef Vector_hpp
 #define Vector_hpp
 
-namespace sw
+namespace sw {
+
+struct Point;
+struct Matrix;
+struct Plane;
+
+struct Vector
 {
-	struct Point;
-	struct Matrix;
-	struct Plane;
+	Vector();
+	Vector(const int i);
+	Vector(const Vector &v);
+	Vector(const Point &p);
+	Vector(float v_x, float v_y, float v_z);
 
-	struct Vector
+	Vector &operator=(const Vector &v);
+
+	union
 	{
-		Vector();
-		Vector(const int i);
-		Vector(const Vector &v);
-		Vector(const Point &p);
-		Vector(float v_x, float v_y, float v_z);
+		float v[3];
 
-		Vector &operator=(const Vector &v);
-
-		union
+		struct
 		{
-			float v[3];
-
-			struct
-			{
-				float x;
-				float y;
-				float z;
-			};
+			float x;
+			float y;
+			float z;
 		};
-
-		float &operator[](int i);
-		float &operator()(int i);
-
-		const float &operator[](int i) const;
-		const float &operator()(int i) const;
-
-		Vector operator+() const;
-		Vector operator-() const;
-
-		Vector &operator+=(const Vector &v);
-		Vector &operator-=(const Vector &v);
-		Vector &operator*=(float s);
-		Vector &operator/=(float s);
-
-		friend bool operator==(const Vector &u, const Vector &v);
-		friend bool operator!=(const Vector &u, const Vector &v);
-
-		friend Vector operator+(const Vector &u, const Vector &v);
-		friend Vector operator-(const Vector &u, const Vector &v);
-		friend float operator*(const Vector &u, const Vector &v);   // Dot product
-		friend Vector operator*(float s, const Vector &v);
-		friend Vector operator*(const Vector &v, float s);
-		friend Vector operator/(const Vector &v, float s);
-		friend float operator^(const Vector &u, const Vector &v);   // Angle between vectors
-		friend Vector operator%(const Vector &u, const Vector &v);   // Cross product
-
-		friend Vector operator*(const Matrix &M, const Vector& v);
-		friend Vector operator*(const Vector &v, const Matrix &M);
-		friend Vector &operator*=(Vector &v, const Matrix &M);
-
-		static float N(const Vector &v);   // Norm
-		static float N2(const Vector &v);   // Squared norm
-
-		static Vector mirror(const Vector &v, const Plane &p);
-		static Vector reflect(const Vector &v, const Plane &p);
-		static Vector lerp(const Vector &u, const Vector &v, float t);
 	};
-}
+
+	float &operator[](int i);
+	float &operator()(int i);
+
+	const float &operator[](int i) const;
+	const float &operator()(int i) const;
+
+	Vector operator+() const;
+	Vector operator-() const;
+
+	Vector &operator+=(const Vector &v);
+	Vector &operator-=(const Vector &v);
+	Vector &operator*=(float s);
+	Vector &operator/=(float s);
+
+	friend bool operator==(const Vector &u, const Vector &v);
+	friend bool operator!=(const Vector &u, const Vector &v);
+
+	friend Vector operator+(const Vector &u, const Vector &v);
+	friend Vector operator-(const Vector &u, const Vector &v);
+	friend float operator*(const Vector &u, const Vector &v);   // Dot product
+	friend Vector operator*(float s, const Vector &v);
+	friend Vector operator*(const Vector &v, float s);
+	friend Vector operator/(const Vector &v, float s);
+	friend float operator^(const Vector &u, const Vector &v);   // Angle between vectors
+	friend Vector operator%(const Vector &u, const Vector &v);   // Cross product
+
+	friend Vector operator*(const Matrix &M, const Vector& v);
+	friend Vector operator*(const Vector &v, const Matrix &M);
+	friend Vector &operator*=(Vector &v, const Matrix &M);
+
+	static float N(const Vector &v);   // Norm
+	static float N2(const Vector &v);   // Squared norm
+
+	static Vector mirror(const Vector &v, const Plane &p);
+	static Vector reflect(const Vector &v, const Plane &p);
+	static Vector lerp(const Vector &u, const Vector &v, float t);
+};
+
+}  // namespace sw
+
+/* Inline implementation */
 
 #include "Point.hpp"
 
-namespace sw
+namespace sw {
+
+inline Vector::Vector()
 {
-	inline Vector::Vector()
-	{
-	}
-
-	inline Vector::Vector(const int i)
-	{
-		const float s = (float)i;
-
-		x = s;
-		y = s;
-		z = s;
-	}
-
-	inline Vector::Vector(const Vector &v)
-	{
-		x = v.x;
-		y = v.y;
-		z = v.z;
-	}
-
-	inline Vector::Vector(const Point &P)
-	{
-		x = P.x;
-		y = P.y;
-		z = P.z;
-	}
-
-	inline Vector::Vector(float v_x, float v_y, float v_z)
-	{
-		x = v_x;
-		y = v_y;
-		z = v_z;
-	}
-
-	inline Vector &Vector::operator=(const Vector &v)
-	{
-		x = v.x;
-		y = v.y;
-		z = v.z;
-
-		return *this;
-	}
-
-	inline float &Vector::operator()(int i)
-	{
-		return v[i];
-	}
-
-	inline float &Vector::operator[](int i)
-	{
-		return v[i];
-	}
-
-	inline const float &Vector::operator()(int i) const
-	{
-		return v[i];
-	}
-
-	inline const float &Vector::operator[](int i) const
-	{
-		return v[i];
-	}
 }
 
+inline Vector::Vector(const int i)
+{
+	// Broadcasts the integer, converted to float once, to all three
+	// components. The constructor is not declared explicit, so an int
+	// converts to Vector implicitly.
+	const float broadcast = static_cast<float>(i);
+	x = y = z = broadcast;
+}
+
+inline Vector::Vector(const Vector &v)
+{
+	x = v.x;   // Copy constructor: plain member-wise copy of the three components.
+	y = v.y;
+	z = v.z;
+}
+
+inline Vector::Vector(const Point &P)
+{
+	x = P.x;   // Takes the Point's x, y and z as the vector's components.
+	y = P.y;
+	z = P.z;
+}
+
+inline Vector::Vector(float v_x, float v_y, float v_z)
+{
+	x = v_x;   // Direct component-wise initialization.
+	y = v_y;
+	z = v_z;
+}
+
+inline Vector &Vector::operator=(const Vector &v)
+{
+	x = v.x;   // Member-wise copy; self-assignment is harmless because only floats are copied.
+	y = v.y;
+	z = v.z;
+
+	return *this;
+}
+
+inline float &Vector::operator()(int i)
+{
+	return v[i];   // Mutable access into v[3]; i is used as-is, with no bounds check.
+}
+
+inline float &Vector::operator[](int i)
+{
+	return v[i];   // Same as operator()(int): mutable access into v[3], no bounds check.
+}
+
+inline const float &Vector::operator()(int i) const
+{
+	return v[i];   // Read-only access into v[3]; i is used as-is, with no bounds check.
+}
+
+inline const float &Vector::operator[](int i) const
+{
+	return v[i];   // Same as the const operator()(int): read-only access, no bounds check.
+}
+
+}  // namespace sw
+
 #endif   // Vector_hpp

diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
index 050a925..63af666 100644
--- a/src/Device/Vertex.hpp
+++ b/src/Device/Vertex.hpp

@@ -19,42 +19,43 @@
 #include "System/Types.hpp"
 #include "Device/Config.hpp"
 
-namespace sw
+namespace sw {
+
+ALIGN(16, struct Vertex
 {
-	ALIGN(16, struct Vertex
+	union
 	{
-		union
+		struct
 		{
-			struct
-			{
-				float x;
-				float y;
-				float z;
-				float w;
-			};
-
-			float4 position;
-		};
-
-		float pointSize;
-
-		int clipFlags;
-		int cullMask;
-		float clipDistance[MAX_CLIP_DISTANCES];
-		float cullDistance[MAX_CLIP_DISTANCES];
-
-		alignas(16) struct
-		{
-			int x;
-			int y;
+			float x;
+			float y;
 			float z;
 			float w;
-		} projected;
+		};
 
-		alignas(16) float v[MAX_INTERFACE_COMPONENTS];
-	});
+		float4 position;
+	};
 
-	static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
-}
+	float pointSize;
+
+	int clipFlags;
+	int cullMask;
+	float clipDistance[MAX_CLIP_DISTANCES];
+	float cullDistance[MAX_CLIP_DISTANCES];
+
+	alignas(16) struct
+	{
+		int x;
+		int y;
+		float z;
+		float w;
+	} projected;
+
+	alignas(16) float v[MAX_INTERFACE_COMPONENTS];
+});
+
+static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
+
+}  // namespace sw
 
 #endif   // Vertex_hpp

diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index c6b96e1..e77b2f7 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp

@@ -21,124 +21,125 @@
 
 #include <cstring>
 
-namespace sw
+namespace sw {
+
+void VertexCache::clear()
 {
-	void VertexCache::clear()
+	for(uint32_t i = 0; i < SIZE; i++)
 	{
-		for(uint32_t i = 0; i < SIZE; i++)
-		{
-			tag[i] = 0xFFFFFFFF;
-		}
-	}
-
-	uint32_t VertexProcessor::States::computeHash()
-	{
-		uint32_t *state = reinterpret_cast<uint32_t*>(this);
-		uint32_t hash = 0;
-
-		for(unsigned int i = 0; i < sizeof(States) / sizeof(uint32_t); i++)
-		{
-			hash ^= state[i];
-		}
-
-		return hash;
-	}
-
-	unsigned int VertexProcessor::States::Input::bytesPerAttrib() const
-	{
-		switch(type)
-		{
-		case STREAMTYPE_FLOAT:
-		case STREAMTYPE_INT:
-		case STREAMTYPE_UINT:
-			return count * sizeof(uint32_t);
-		case STREAMTYPE_HALF:
-		case STREAMTYPE_SHORT:
-		case STREAMTYPE_USHORT:
-			return count * sizeof(uint16_t);
-		case STREAMTYPE_BYTE:
-		case STREAMTYPE_SBYTE:
-			return count * sizeof(uint8_t);
-		case STREAMTYPE_COLOR:
-		case STREAMTYPE_2_10_10_10_INT:
-		case STREAMTYPE_2_10_10_10_UINT:
-			return sizeof(int);
-		default:
-			UNSUPPORTED("stream.type %d", int(type));
-		}
-
-		return 0;
-	}
-
-	bool VertexProcessor::State::operator==(const State &state) const
-	{
-		if(hash != state.hash)
-		{
-			return false;
-		}
-
-		static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
-		return memcmp(static_cast<const States*>(this), static_cast<const States*>(&state), sizeof(States)) == 0;
-	}
-
-	VertexProcessor::VertexProcessor()
-	{
-		routineCache = nullptr;
-		setRoutineCacheSize(1024);
-	}
-
-	VertexProcessor::~VertexProcessor()
-	{
-		delete routineCache;
-		routineCache = nullptr;
-	}
-
-	void VertexProcessor::setRoutineCacheSize(int cacheSize)
-	{
-		delete routineCache;
-		routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));
-	}
-
-	const VertexProcessor::State VertexProcessor::update(const sw::Context* context)
-	{
-		State state;
-
-		state.shaderID = context->vertexShader->getSerialID();
-		state.robustBufferAccess = context->robustBufferAccess;
-		state.isPoint = context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
-
-		for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
-		{
-			state.input[i].type = context->input[i].type;
-			state.input[i].count = context->input[i].count;
-			state.input[i].normalized = context->input[i].normalized;
-			// TODO: get rid of attribType -- just keep the VK format all the way through, this fully determines
-			// how to handle the attribute.
-			state.input[i].attribType = context->vertexShader->inputs[i*4].Type;
-		}
-
-		state.hash = state.computeHash();
-
-		return state;
-	}
-
-	VertexProcessor::RoutineType VertexProcessor::routine(const State &state,
-	                                                      vk::PipelineLayout const *pipelineLayout,
-	                                                      SpirvShader const *vertexShader,
-	                                                      const vk::DescriptorSet::Bindings &descriptorSets)
-	{
-		auto routine = routineCache->query(state);
-
-		if(!routine)   // Create one
-		{
-			VertexRoutine *generator = new VertexProgram(state, pipelineLayout, vertexShader, descriptorSets);
-			generator->generate();
-			routine = (*generator)("VertexRoutine_%0.8X", state.shaderID);
-			delete generator;
-
-			routineCache->add(state, routine);
-		}
-
-		return routine;
+		tag[i] = 0xFFFFFFFF;
 	}
 }
+
+uint32_t VertexProcessor::States::computeHash()
+{
+	// XOR of the object's storage taken as consecutive 32-bit words (whole words only).
+	const uint32_t *words = reinterpret_cast<uint32_t*>(this);
+	uint32_t digest = 0;
+	for(unsigned int w = 0; w < sizeof(States) / sizeof(uint32_t); w++)
+	{
+		digest ^= words[w];
+	}
+
+	return digest;
+}
+
+unsigned int VertexProcessor::States::Input::bytesPerAttrib() const
+{
+	switch(type)
+	{
+	case STREAMTYPE_BYTE:
+	case STREAMTYPE_SBYTE:
+		return count * sizeof(uint8_t);    // 1 byte per component
+	case STREAMTYPE_HALF:
+	case STREAMTYPE_SHORT:
+	case STREAMTYPE_USHORT:
+		return count * sizeof(uint16_t);   // 2 bytes per component
+	case STREAMTYPE_FLOAT:
+	case STREAMTYPE_INT:
+	case STREAMTYPE_UINT:
+		return count * sizeof(uint32_t);   // 4 bytes per component
+	case STREAMTYPE_COLOR:
+	case STREAMTYPE_2_10_10_10_INT:
+	case STREAMTYPE_2_10_10_10_UINT:
+		return sizeof(int);                // fixed size, independent of count
+	default:
+		UNSUPPORTED("stream.type %d", int(type));
+	}
+
+	return 0;   // Reached only through the default case.
+}
+
+bool VertexProcessor::State::operator==(const State &state) const
+{
+	static_assert(is_memcmparable<State>::value, "Cannot memcmp States");
+
+	// Cheap reject on the stored hash first; only equal hashes pay for the
+	// byte-wise comparison of the States base sub-objects.
+	const States *lhs = static_cast<const States*>(this);
+	const States *rhs = static_cast<const States*>(&state);
+	return (hash == state.hash) && (memcmp(lhs, rhs, sizeof(States)) == 0);
+}
+
+VertexProcessor::VertexProcessor()
+{
+	routineCache = nullptr;      // Must be null first: setRoutineCacheSize() deletes the previous pointer.
+	setRoutineCacheSize(1024);   // Allocates the initial routine cache.
+}
+
+VertexProcessor::~VertexProcessor()
+{
+	delete routineCache;      // Frees the cache allocated with new in setRoutineCacheSize().
+	routineCache = nullptr;   // Do not leave a dangling pointer behind.
+}
+
+void VertexProcessor::setRoutineCacheSize(int cacheSize)
+{
+	delete routineCache;   // Drops the current cache object (no-op when the pointer is null).
+	routineCache = new RoutineCacheType(clamp(cacheSize, 1, 65536));   // Requested size is clamped to [1, 65536].
+}
+
+const VertexProcessor::State VertexProcessor::update(const sw::Context* context)
+{
+	State key;
+	key.shaderID = context->vertexShader->getSerialID();
+	key.robustBufferAccess = context->robustBufferAccess;
+	key.isPoint = (context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
+
+	for(int slot = 0; slot < MAX_INTERFACE_COMPONENTS / 4; slot++)
+	{
+		auto &dst = key.input[slot];
+		const auto &src = context->input[slot];
+		dst.type = src.type;
+		dst.count = src.count;
+		dst.normalized = src.normalized;
+		// TODO: get rid of attribType -- just keep the VK format all the way through, this fully determines
+		// how to handle the attribute.
+		dst.attribType = context->vertexShader->inputs[slot*4].Type;
+	}
+
+	key.hash = key.computeHash();   // Computed after every other field has been assigned.
+	return key;
+}
+
+VertexProcessor::RoutineType VertexProcessor::routine(const State &state,
+                                                      vk::PipelineLayout const *pipelineLayout,
+                                                      SpirvShader const *vertexShader,
+                                                      const vk::DescriptorSet::Bindings &descriptorSets)
+{
+	auto routine = routineCache->query(state);
+	if(routine)
+	{
+		return routine;   // Cache hit.
+	}
+
+	// Cache miss: build a vertex program for this state and cache the result.
+	VertexRoutine *builder = new VertexProgram(state, pipelineLayout, vertexShader, descriptorSets);
+	builder->generate();
+	routine = (*builder)("VertexRoutine_%0.8X", state.shaderID);
+	delete builder;
+	routineCache->add(state, routine);
+	return routine;
+}
+
+}  // namespace sw

diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index 62a65d1..c94e82e 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp

@@ -22,91 +22,92 @@
 #include "Vertex.hpp"
 #include "Pipeline/SpirvShader.hpp"
 
-namespace sw
+namespace sw {
+
+struct DrawData;
+
+// Basic direct mapped vertex cache.
+struct VertexCache
 {
-	struct DrawData;
+	static constexpr uint32_t SIZE = 64;  // TODO: Variable size?
+	static constexpr uint32_t TAG_MASK = SIZE - 1;  // Size must be power of 2.
 
-	// Basic direct mapped vertex cache.
-	struct VertexCache
+	void clear();
+
+	Vertex vertex[SIZE];
+	uint32_t tag[SIZE];
+
+	// Identifier of the draw call for the cache data. If this cache is
+	// used with a different draw call, then the cache should be invalidated
+	// before use.
+	int drawCall = -1;
+};
+
+struct VertexTask
+{
+	unsigned int vertexCount;      // No default initializer.
+	unsigned int primitiveStart;   // No default initializer.
+	VertexCache vertexCache;       // Held by value: SIZE Vertex entries plus their tags.
+};
+
+using VertexRoutineFunction = FunctionT<void(Vertex* output, unsigned int* batch, VertexTask* vertextask, DrawData* draw)>;
+
+class VertexProcessor
+{
+public:
+	struct States : Memset<States>
 	{
-		static constexpr uint32_t SIZE = 64;  // TODO: Variable size?
-		static constexpr uint32_t TAG_MASK = SIZE - 1;  // Size must be power of 2.
+		States() : Memset(this, 0) {}
 
-		void clear();
+		uint32_t computeHash();
 
-		Vertex vertex[SIZE];
-		uint32_t tag[SIZE];
+		uint64_t shaderID;
 
-		// Identifier of the draw call for the cache data. If this cache is
-		// used with a different draw call, then the cache should be invalidated
-		// before use.
-		int drawCall = -1;
-	};
-
-	struct VertexTask
-	{
-		unsigned int vertexCount;
-		unsigned int primitiveStart;
-		VertexCache vertexCache;
-	};
-
-	using VertexRoutineFunction = FunctionT<void(Vertex* output, unsigned int* batch, VertexTask* vertextask, DrawData* draw)>;
-
-	class VertexProcessor
-	{
-	public:
-		struct States : Memset<States>
+		struct Input
 		{
-			States() : Memset(this, 0) {}
-
-			uint32_t computeHash();
-
-			uint64_t shaderID;
-
-			struct Input
+			operator bool() const   // Returns true if stream contains data
 			{
-				operator bool() const   // Returns true if stream contains data
-				{
-					return count != 0;
-				}
+				return count != 0;
+			}
 
-				unsigned int bytesPerAttrib() const;
+			unsigned int bytesPerAttrib() const;
 
-				StreamType type    : BITS(STREAMTYPE_LAST);
-				unsigned int count : 3;
-				bool normalized    : 1;
-				unsigned int attribType : BITS(SpirvShader::ATTRIBTYPE_LAST);
-			};
-
-			Input input[MAX_INTERFACE_COMPONENTS / 4];
-			bool robustBufferAccess : 1;
-			bool isPoint : 1;
+			StreamType type    : BITS(STREAMTYPE_LAST);
+			unsigned int count : 3;
+			bool normalized    : 1;
+			unsigned int attribType : BITS(SpirvShader::ATTRIBTYPE_LAST);
 		};
 
-		struct State : States
-		{
-			bool operator==(const State &state) const;
-
-			uint32_t hash;
-		};
-
-		using RoutineType = VertexRoutineFunction::RoutineType;
-
-		VertexProcessor();
-
-		virtual ~VertexProcessor();
-
-	protected:
-		const State update(const sw::Context* context);
-		RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
-		                                 SpirvShader const *vertexShader, const vk::DescriptorSet::Bindings &descriptorSets);
-
-		void setRoutineCacheSize(int cacheSize);
-
-	private:
-		using RoutineCacheType = RoutineCacheT<State, VertexRoutineFunction::CFunctionType>;
-		RoutineCacheType *routineCache;
+		Input input[MAX_INTERFACE_COMPONENTS / 4];
+		bool robustBufferAccess : 1;
+		bool isPoint : 1;
 	};
-}
+
+	struct State : States
+	{
+		bool operator==(const State &state) const;
+
+		uint32_t hash;
+	};
+
+	using RoutineType = VertexRoutineFunction::RoutineType;
+
+	VertexProcessor();
+
+	virtual ~VertexProcessor();
+
+protected:
+	const State update(const sw::Context* context);
+	RoutineType routine(const State &state, vk::PipelineLayout const *pipelineLayout,
+	                    SpirvShader const *vertexShader, const vk::DescriptorSet::Bindings &descriptorSets);
+
+	void setRoutineCacheSize(int cacheSize);
+
+private:
+	using RoutineCacheType = RoutineCacheT<State, VertexRoutineFunction::CFunctionType>;
+	RoutineCacheType *routineCache;
+};
+
+}  // namespace sw
 
 #endif   // sw_VertexProcessor_hpp